pointblank 0.11.1__py3-none-any.whl → 0.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/cli.py +426 -166
- {pointblank-0.11.1.dist-info → pointblank-0.11.2.dist-info}/METADATA +14 -16
- {pointblank-0.11.1.dist-info → pointblank-0.11.2.dist-info}/RECORD +7 -7
- {pointblank-0.11.1.dist-info → pointblank-0.11.2.dist-info}/WHEEL +0 -0
- {pointblank-0.11.1.dist-info → pointblank-0.11.2.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.1.dist-info → pointblank-0.11.2.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.1.dist-info → pointblank-0.11.2.dist-info}/top_level.txt +0 -0
pointblank/cli.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import copy
|
|
3
4
|
import sys
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Any
|
|
@@ -274,7 +275,7 @@ def _format_dtype_compact(dtype_str: str) -> str:
|
|
|
274
275
|
elif "str" in dtype_str:
|
|
275
276
|
return "str"
|
|
276
277
|
|
|
277
|
-
# Unknown or complex types
|
|
278
|
+
# Unknown or complex types: truncate if too long
|
|
278
279
|
elif len(dtype_str) > 8:
|
|
279
280
|
return dtype_str[:8] + "…"
|
|
280
281
|
else:
|
|
@@ -395,7 +396,7 @@ def _rich_print_scan_table(
|
|
|
395
396
|
# Clean up HTML formatting from the raw data
|
|
396
397
|
str_val = str(value)
|
|
397
398
|
|
|
398
|
-
# Handle multi-line values with <br> tags FIRST
|
|
399
|
+
# Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
|
|
399
400
|
if "<br>" in str_val:
|
|
400
401
|
str_val = str_val.split("<br>")[0].strip()
|
|
401
402
|
# For unique values, we want just the integer part
|
|
@@ -414,14 +415,14 @@ def _rich_print_scan_table(
|
|
|
414
415
|
# Clean up extra whitespace
|
|
415
416
|
str_val = re.sub(r"\s+", " ", str_val).strip()
|
|
416
417
|
|
|
417
|
-
# Handle values like "2<.01"
|
|
418
|
+
# Handle values like "2<.01": extract the first number
|
|
418
419
|
if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
|
|
419
420
|
# Extract number before the < symbol
|
|
420
421
|
before_lt = str_val.split("<")[0].strip()
|
|
421
422
|
if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
|
|
422
423
|
str_val = before_lt
|
|
423
424
|
|
|
424
|
-
# Handle boolean unique values like "T0.62F0.38"
|
|
425
|
+
# Handle boolean unique values like "T0.62F0.38": extract the more readable format
|
|
425
426
|
if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
|
|
426
427
|
# Extract T and F values
|
|
427
428
|
t_match = re.search(r"T(\d+\.\d+)", str_val)
|
|
@@ -451,7 +452,7 @@ def _rich_print_scan_table(
|
|
|
451
452
|
# Simple integers under 10000
|
|
452
453
|
return str(int(num_val))
|
|
453
454
|
elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
|
|
454
|
-
# Likely dates in YYYYMMDD format
|
|
455
|
+
# Likely dates in YYYYMMDD format: format as date-like
|
|
455
456
|
int_val = int(num_val)
|
|
456
457
|
if 19000101 <= int_val <= 29991231: # Reasonable date range
|
|
457
458
|
str_date = str(int_val)
|
|
@@ -463,29 +464,29 @@ def _rich_print_scan_table(
|
|
|
463
464
|
# Otherwise treat as large number
|
|
464
465
|
return f"{num_val / 1000000:.1f}M"
|
|
465
466
|
elif abs(num_val) >= 1000000:
|
|
466
|
-
# Large numbers
|
|
467
|
+
# Large numbers: use scientific notation or M/k notation
|
|
467
468
|
|
|
468
469
|
if abs(num_val) >= 1000000000:
|
|
469
470
|
return f"{num_val:.1e}"
|
|
470
471
|
else:
|
|
471
472
|
return f"{num_val / 1000000:.1f}M"
|
|
472
473
|
elif abs(num_val) >= 10000:
|
|
473
|
-
# Numbers >= 10k
|
|
474
|
+
# Numbers >= 10k: use compact notation
|
|
474
475
|
return f"{num_val / 1000:.1f}k"
|
|
475
476
|
elif abs(num_val) >= 100:
|
|
476
|
-
# Numbers 100-9999
|
|
477
|
+
# Numbers 100-9999: show with minimal decimals
|
|
477
478
|
return f"{num_val:.1f}"
|
|
478
479
|
elif abs(num_val) >= 10:
|
|
479
|
-
# Numbers 10-99
|
|
480
|
+
# Numbers 10-99: show with one decimal
|
|
480
481
|
return f"{num_val:.1f}"
|
|
481
482
|
elif abs(num_val) >= 1:
|
|
482
|
-
# Numbers 1-9
|
|
483
|
+
# Numbers 1-9: show with two decimals
|
|
483
484
|
return f"{num_val:.2f}"
|
|
484
485
|
elif abs(num_val) >= 0.01:
|
|
485
|
-
# Small numbers
|
|
486
|
+
# Small numbers: show with appropriate precision
|
|
486
487
|
return f"{num_val:.2f}"
|
|
487
488
|
else:
|
|
488
|
-
# Very small numbers
|
|
489
|
+
# Very small numbers: use scientific notation
|
|
489
490
|
|
|
490
491
|
return f"{num_val:.1e}"
|
|
491
492
|
|
|
@@ -493,7 +494,7 @@ def _rich_print_scan_table(
|
|
|
493
494
|
# Not a number, handle as string
|
|
494
495
|
pass
|
|
495
496
|
|
|
496
|
-
# Handle date/datetime strings
|
|
497
|
+
# Handle date/datetime strings: show abbreviated format
|
|
497
498
|
if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
|
|
498
499
|
# Likely a date/datetime, show abbreviated
|
|
499
500
|
if len(str_val) > max_width:
|
|
@@ -933,14 +934,19 @@ def _rich_print_gt_table(
|
|
|
933
934
|
|
|
934
935
|
|
|
935
936
|
def _display_validation_summary(validation: Any) -> None:
|
|
936
|
-
"""Display a validation summary in a Rich table format."""
|
|
937
|
+
"""Display a validation summary in a compact Rich table format."""
|
|
937
938
|
try:
|
|
938
939
|
# Try to get the summary from the validation report
|
|
939
940
|
if hasattr(validation, "validation_info") and validation.validation_info is not None:
|
|
940
941
|
# Use the validation_info to create a summary
|
|
941
942
|
info = validation.validation_info
|
|
942
943
|
n_steps = len(info)
|
|
943
|
-
|
|
944
|
+
|
|
945
|
+
# Count steps based on their threshold status
|
|
946
|
+
n_passed = sum(
|
|
947
|
+
1 for step in info if not step.warning and not step.error and not step.critical
|
|
948
|
+
)
|
|
949
|
+
n_all_passed = sum(1 for step in info if step.all_passed)
|
|
944
950
|
n_failed = n_steps - n_passed
|
|
945
951
|
|
|
946
952
|
# Calculate severity counts
|
|
@@ -950,64 +956,213 @@ def _display_validation_summary(validation: Any) -> None:
|
|
|
950
956
|
|
|
951
957
|
all_passed = n_failed == 0
|
|
952
958
|
|
|
953
|
-
# Determine highest severity
|
|
959
|
+
# Determine highest severity and its color
|
|
954
960
|
if n_critical > 0:
|
|
955
961
|
highest_severity = "critical"
|
|
962
|
+
severity_color = "red"
|
|
956
963
|
elif n_error > 0:
|
|
957
964
|
highest_severity = "error"
|
|
965
|
+
severity_color = "yellow"
|
|
958
966
|
elif n_warning > 0:
|
|
959
967
|
highest_severity = "warning"
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
968
|
+
severity_color = "bright_black" # gray
|
|
969
|
+
elif n_all_passed == n_steps:
|
|
970
|
+
# All steps passed AND all steps had 100% pass rate
|
|
963
971
|
highest_severity = "all passed"
|
|
972
|
+
severity_color = "bold green"
|
|
973
|
+
else:
|
|
974
|
+
# Steps passed (no threshold exceedances) but some had failing test units
|
|
975
|
+
highest_severity = "passed"
|
|
976
|
+
severity_color = "green"
|
|
977
|
+
|
|
978
|
+
# Create compact summary header
|
|
979
|
+
# Format: Steps: 6 / P: 3 (3 AP) / W: 3 / E: 0 / C: 0 / warning
|
|
980
|
+
summary_header = (
|
|
981
|
+
f"Steps: {n_steps} / P: {n_passed} ({n_all_passed} AP) / "
|
|
982
|
+
f"W: {n_warning} / E: {n_error} / C: {n_critical} / "
|
|
983
|
+
f"[{severity_color}]{highest_severity}[/{severity_color}]"
|
|
984
|
+
)
|
|
964
985
|
|
|
965
|
-
#
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
# Add summary statistics
|
|
971
|
-
table.add_row("Total Steps", str(n_steps))
|
|
972
|
-
table.add_row("Passing Steps", str(n_passed))
|
|
973
|
-
table.add_row("Failing Steps", str(n_failed))
|
|
974
|
-
table.add_row("Warning Steps", str(n_warning))
|
|
975
|
-
table.add_row("Error Steps", str(n_error))
|
|
976
|
-
table.add_row("Critical Steps", str(n_critical))
|
|
977
|
-
table.add_row("All Passed", str(all_passed))
|
|
978
|
-
table.add_row("Highest Severity", highest_severity)
|
|
979
|
-
|
|
980
|
-
console.print(table)
|
|
986
|
+
# Print the report title and summary
|
|
987
|
+
console.print()
|
|
988
|
+
console.print("[blue]Validation Report[/blue]")
|
|
989
|
+
console.print(f"[white]{summary_header}[/white]")
|
|
981
990
|
|
|
982
991
|
# Display step details
|
|
983
992
|
if n_steps > 0:
|
|
993
|
+
from rich.box import SIMPLE_HEAD
|
|
994
|
+
|
|
984
995
|
steps_table = Table(
|
|
985
|
-
|
|
996
|
+
show_header=True,
|
|
997
|
+
header_style="bold cyan",
|
|
998
|
+
box=SIMPLE_HEAD,
|
|
986
999
|
)
|
|
987
|
-
steps_table.add_column("
|
|
988
|
-
steps_table.add_column("
|
|
1000
|
+
steps_table.add_column("", style="dim")
|
|
1001
|
+
steps_table.add_column("Step", style="white")
|
|
989
1002
|
steps_table.add_column("Column", style="cyan")
|
|
990
|
-
steps_table.add_column("
|
|
991
|
-
steps_table.add_column("
|
|
1003
|
+
steps_table.add_column("Values", style="yellow")
|
|
1004
|
+
steps_table.add_column("Units", style="blue")
|
|
1005
|
+
steps_table.add_column("Pass", style="green")
|
|
1006
|
+
steps_table.add_column("Fail", style="red")
|
|
1007
|
+
steps_table.add_column("W", style="bright_black")
|
|
1008
|
+
steps_table.add_column("E", style="yellow")
|
|
1009
|
+
steps_table.add_column("C", style="red")
|
|
1010
|
+
steps_table.add_column("Ext", style="blue", justify="center")
|
|
1011
|
+
|
|
1012
|
+
def format_units(n: int) -> str:
|
|
1013
|
+
"""Format large numbers with K, M, B abbreviations for values above 10,000."""
|
|
1014
|
+
if n is None:
|
|
1015
|
+
return "—"
|
|
1016
|
+
if n >= 1000000000: # Billions
|
|
1017
|
+
return f"{n / 1000000000:.1f}B"
|
|
1018
|
+
elif n >= 1000000: # Millions
|
|
1019
|
+
return f"{n / 1000000:.1f}M"
|
|
1020
|
+
elif n >= 10000: # Use K for 10,000 and above
|
|
1021
|
+
return f"{n / 1000:.0f}K"
|
|
1022
|
+
else:
|
|
1023
|
+
return str(n)
|
|
1024
|
+
|
|
1025
|
+
def format_pass_fail(passed: int, total: int) -> str:
|
|
1026
|
+
"""Format pass/fail counts with abbreviated numbers and fractions."""
|
|
1027
|
+
if passed is None or total is None or total == 0:
|
|
1028
|
+
return "—/—"
|
|
1029
|
+
|
|
1030
|
+
# Calculate fraction
|
|
1031
|
+
fraction = passed / total
|
|
1032
|
+
|
|
1033
|
+
# Format fraction with special handling for very small and very large values
|
|
1034
|
+
if fraction == 0.0:
|
|
1035
|
+
fraction_str = "0.00"
|
|
1036
|
+
elif fraction == 1.0:
|
|
1037
|
+
fraction_str = "1.00"
|
|
1038
|
+
elif fraction < 0.005: # Less than 0.005 rounds to 0.00
|
|
1039
|
+
fraction_str = "<0.01"
|
|
1040
|
+
elif fraction > 0.995: # Greater than 0.995 rounds to 1.00
|
|
1041
|
+
fraction_str = ">0.99"
|
|
1042
|
+
else:
|
|
1043
|
+
fraction_str = f"{fraction:.2f}"
|
|
1044
|
+
|
|
1045
|
+
# Format absolute number with abbreviations
|
|
1046
|
+
absolute_str = format_units(passed)
|
|
1047
|
+
|
|
1048
|
+
return f"{absolute_str}/{fraction_str}"
|
|
992
1049
|
|
|
993
1050
|
for step in info:
|
|
994
|
-
|
|
995
|
-
|
|
1051
|
+
# Extract values information for the Values column
|
|
1052
|
+
values_str = "—" # Default to em dash if no values
|
|
1053
|
+
|
|
1054
|
+
# Handle different validation types
|
|
1055
|
+
if step.assertion_type == "col_schema_match":
|
|
1056
|
+
values_str = "—" # Schema is too complex to display inline
|
|
1057
|
+
elif step.assertion_type == "col_vals_between":
|
|
1058
|
+
# For between validations, try to get left and right bounds
|
|
1059
|
+
if (
|
|
1060
|
+
hasattr(step, "left")
|
|
1061
|
+
and hasattr(step, "right")
|
|
1062
|
+
and step.left is not None
|
|
1063
|
+
and step.right is not None
|
|
1064
|
+
):
|
|
1065
|
+
values_str = f"[{step.left}, {step.right}]"
|
|
1066
|
+
elif hasattr(step, "values") and step.values is not None:
|
|
1067
|
+
if isinstance(step.values, (list, tuple)) and len(step.values) >= 2:
|
|
1068
|
+
values_str = f"[{step.values[0]}, {step.values[1]}]"
|
|
1069
|
+
else:
|
|
1070
|
+
values_str = str(step.values)
|
|
1071
|
+
elif step.assertion_type in ["row_count_match", "col_count_match"]:
|
|
1072
|
+
# For count match validations, extract the 'count' value from the dictionary
|
|
1073
|
+
if hasattr(step, "values") and step.values is not None:
|
|
1074
|
+
if isinstance(step.values, dict) and "count" in step.values:
|
|
1075
|
+
values_str = str(step.values["count"])
|
|
1076
|
+
else:
|
|
1077
|
+
values_str = str(step.values)
|
|
1078
|
+
else:
|
|
1079
|
+
values_str = "—"
|
|
1080
|
+
elif step.assertion_type in ["col_vals_expr", "conjointly"]:
|
|
1081
|
+
values_str = "COLUMN EXPR"
|
|
1082
|
+
elif step.assertion_type == "specially":
|
|
1083
|
+
values_str = "EXPR"
|
|
1084
|
+
elif hasattr(step, "values") and step.values is not None:
|
|
1085
|
+
if isinstance(step.values, (list, tuple)):
|
|
1086
|
+
if len(step.values) <= 3:
|
|
1087
|
+
values_str = ", ".join(str(v) for v in step.values)
|
|
1088
|
+
else:
|
|
1089
|
+
values_str = f"{', '.join(str(v) for v in step.values[:3])}..."
|
|
1090
|
+
else:
|
|
1091
|
+
values_str = str(step.values)
|
|
1092
|
+
elif hasattr(step, "value") and step.value is not None:
|
|
1093
|
+
values_str = str(step.value)
|
|
1094
|
+
elif hasattr(step, "set") and step.set is not None:
|
|
1095
|
+
if isinstance(step.set, (list, tuple)):
|
|
1096
|
+
if len(step.set) <= 3:
|
|
1097
|
+
values_str = ", ".join(str(v) for v in step.set)
|
|
1098
|
+
else:
|
|
1099
|
+
values_str = f"{', '.join(str(v) for v in step.set[:3])}..."
|
|
1100
|
+
else:
|
|
1101
|
+
values_str = str(step.set)
|
|
1102
|
+
|
|
1103
|
+
# Determine threshold status for W, E, C columns
|
|
1104
|
+
# Check if thresholds are set and whether they were exceeded
|
|
1105
|
+
|
|
1106
|
+
# Warning threshold
|
|
1107
|
+
if (
|
|
1108
|
+
hasattr(step, "thresholds")
|
|
1109
|
+
and step.thresholds
|
|
1110
|
+
and hasattr(step.thresholds, "warning")
|
|
1111
|
+
and step.thresholds.warning is not None
|
|
1112
|
+
):
|
|
1113
|
+
w_status = (
|
|
1114
|
+
"[bright_black]●[/bright_black]"
|
|
1115
|
+
if step.warning
|
|
1116
|
+
else "[bright_black]○[/bright_black]"
|
|
1117
|
+
)
|
|
1118
|
+
else:
|
|
1119
|
+
w_status = "—"
|
|
1120
|
+
|
|
1121
|
+
# Error threshold
|
|
1122
|
+
if (
|
|
1123
|
+
hasattr(step, "thresholds")
|
|
1124
|
+
and step.thresholds
|
|
1125
|
+
and hasattr(step.thresholds, "error")
|
|
1126
|
+
and step.thresholds.error is not None
|
|
1127
|
+
):
|
|
1128
|
+
e_status = "[yellow]●[/yellow]" if step.error else "[yellow]○[/yellow]"
|
|
1129
|
+
else:
|
|
1130
|
+
e_status = "—"
|
|
996
1131
|
|
|
997
|
-
|
|
998
|
-
if
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1132
|
+
# Critical threshold
|
|
1133
|
+
if (
|
|
1134
|
+
hasattr(step, "thresholds")
|
|
1135
|
+
and step.thresholds
|
|
1136
|
+
and hasattr(step.thresholds, "critical")
|
|
1137
|
+
and step.thresholds.critical is not None
|
|
1138
|
+
):
|
|
1139
|
+
c_status = "[red]●[/red]" if step.critical else "[red]○[/red]"
|
|
1140
|
+
else:
|
|
1141
|
+
c_status = "—"
|
|
1142
|
+
|
|
1143
|
+
# Extract status, here we check if the step has any extract data
|
|
1144
|
+
if (
|
|
1145
|
+
hasattr(step, "extract")
|
|
1146
|
+
and step.extract is not None
|
|
1147
|
+
and hasattr(step.extract, "__len__")
|
|
1148
|
+
and len(step.extract) > 0
|
|
1149
|
+
):
|
|
1150
|
+
ext_status = "[blue]✓[/blue]"
|
|
1151
|
+
else:
|
|
1152
|
+
ext_status = "[bright_black]—[/bright_black]"
|
|
1004
1153
|
|
|
1005
1154
|
steps_table.add_row(
|
|
1006
1155
|
str(step.i),
|
|
1007
1156
|
step.assertion_type,
|
|
1008
1157
|
str(step.column) if step.column else "—",
|
|
1009
|
-
|
|
1010
|
-
|
|
1158
|
+
values_str,
|
|
1159
|
+
format_units(step.n),
|
|
1160
|
+
format_pass_fail(step.n_passed, step.n),
|
|
1161
|
+
format_pass_fail(step.n - step.n_passed, step.n),
|
|
1162
|
+
w_status,
|
|
1163
|
+
e_status,
|
|
1164
|
+
c_status,
|
|
1165
|
+
ext_status,
|
|
1011
1166
|
)
|
|
1012
1167
|
|
|
1013
1168
|
console.print(steps_table)
|
|
@@ -1015,18 +1170,32 @@ def _display_validation_summary(validation: Any) -> None:
|
|
|
1015
1170
|
# Display status with appropriate color
|
|
1016
1171
|
if highest_severity == "all passed":
|
|
1017
1172
|
console.print(
|
|
1018
|
-
Panel(
|
|
1173
|
+
Panel(
|
|
1174
|
+
"[green]✓ All validations passed![/green]",
|
|
1175
|
+
border_style="green",
|
|
1176
|
+
expand=False,
|
|
1177
|
+
)
|
|
1019
1178
|
)
|
|
1020
|
-
elif highest_severity == "
|
|
1179
|
+
elif highest_severity == "passed":
|
|
1021
1180
|
console.print(
|
|
1022
|
-
Panel(
|
|
1181
|
+
Panel(
|
|
1182
|
+
"[dim green]⚠ Some steps had failing test units[/dim green]",
|
|
1183
|
+
border_style="dim green",
|
|
1184
|
+
expand=False,
|
|
1185
|
+
)
|
|
1023
1186
|
)
|
|
1024
1187
|
elif highest_severity in ["warning", "error", "critical"]:
|
|
1025
|
-
|
|
1188
|
+
if highest_severity == "warning":
|
|
1189
|
+
color = "bright_black" # gray
|
|
1190
|
+
elif highest_severity == "error":
|
|
1191
|
+
color = "yellow"
|
|
1192
|
+
else: # critical
|
|
1193
|
+
color = "red"
|
|
1026
1194
|
console.print(
|
|
1027
1195
|
Panel(
|
|
1028
1196
|
f"[{color}]✗ Validation failed with {highest_severity} severity[/{color}]",
|
|
1029
1197
|
border_style=color,
|
|
1198
|
+
expand=False,
|
|
1030
1199
|
)
|
|
1031
1200
|
)
|
|
1032
1201
|
else:
|
|
@@ -1043,7 +1212,7 @@ def _display_validation_summary(validation: Any) -> None:
|
|
|
1043
1212
|
@click.version_option(version=pb.__version__, prog_name="pb")
|
|
1044
1213
|
def cli():
|
|
1045
1214
|
"""
|
|
1046
|
-
Pointblank CLI
|
|
1215
|
+
Pointblank CLI: Data validation and quality tools for data engineers.
|
|
1047
1216
|
|
|
1048
1217
|
Use this CLI to run validation scripts, preview tables, and generate reports
|
|
1049
1218
|
directly from the command line.
|
|
@@ -1455,10 +1624,11 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1455
1624
|
|
|
1456
1625
|
|
|
1457
1626
|
@cli.command(name="validate")
|
|
1458
|
-
@click.argument("data_source", type=str)
|
|
1627
|
+
@click.argument("data_source", type=str, required=False)
|
|
1628
|
+
@click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
|
|
1459
1629
|
@click.option(
|
|
1460
1630
|
"--check",
|
|
1461
|
-
"checks",
|
|
1631
|
+
"checks",
|
|
1462
1632
|
type=click.Choice(
|
|
1463
1633
|
[
|
|
1464
1634
|
"rows-distinct",
|
|
@@ -1472,25 +1642,25 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1472
1642
|
"col-vals-le",
|
|
1473
1643
|
]
|
|
1474
1644
|
),
|
|
1645
|
+
metavar="CHECK_TYPE",
|
|
1475
1646
|
multiple=True, # Allow multiple --check options
|
|
1476
1647
|
help="Type of validation check to perform. Can be used multiple times for multiple checks.",
|
|
1477
1648
|
)
|
|
1478
|
-
@click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
|
|
1479
1649
|
@click.option(
|
|
1480
1650
|
"--column",
|
|
1481
|
-
"columns",
|
|
1651
|
+
"columns",
|
|
1482
1652
|
multiple=True, # Allow multiple --column options
|
|
1483
1653
|
help="Column name or integer position as #N (1-based index) for validation.",
|
|
1484
1654
|
)
|
|
1485
1655
|
@click.option(
|
|
1486
1656
|
"--set",
|
|
1487
|
-
"sets",
|
|
1657
|
+
"sets",
|
|
1488
1658
|
multiple=True, # Allow multiple --set options
|
|
1489
1659
|
help="Comma-separated allowed values for col-vals-in-set checks.",
|
|
1490
1660
|
)
|
|
1491
1661
|
@click.option(
|
|
1492
1662
|
"--value",
|
|
1493
|
-
"values",
|
|
1663
|
+
"values",
|
|
1494
1664
|
type=float,
|
|
1495
1665
|
multiple=True, # Allow multiple --value options
|
|
1496
1666
|
help="Numeric value for comparison checks.",
|
|
@@ -1502,17 +1672,17 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1502
1672
|
"--write-extract", type=str, help="Save failing rows to folder. Provide base name for folder."
|
|
1503
1673
|
)
|
|
1504
1674
|
@click.option(
|
|
1505
|
-
"--limit",
|
|
1675
|
+
"--limit", default=500, help="Maximum number of failing rows to save to CSV (default: 500)"
|
|
1506
1676
|
)
|
|
1507
1677
|
@click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
|
|
1508
1678
|
@click.pass_context
|
|
1509
1679
|
def validate(
|
|
1510
1680
|
ctx: click.Context,
|
|
1511
|
-
data_source: str,
|
|
1512
|
-
checks: tuple[str, ...],
|
|
1513
|
-
columns: tuple[str, ...],
|
|
1514
|
-
sets: tuple[str, ...],
|
|
1515
|
-
values: tuple[float, ...],
|
|
1681
|
+
data_source: str | None,
|
|
1682
|
+
checks: tuple[str, ...],
|
|
1683
|
+
columns: tuple[str, ...],
|
|
1684
|
+
sets: tuple[str, ...],
|
|
1685
|
+
values: tuple[float, ...],
|
|
1516
1686
|
show_extract: bool,
|
|
1517
1687
|
write_extract: str | None,
|
|
1518
1688
|
limit: int,
|
|
@@ -1534,21 +1704,21 @@ def validate(
|
|
|
1534
1704
|
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1535
1705
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1536
1706
|
|
|
1537
|
-
AVAILABLE
|
|
1707
|
+
AVAILABLE CHECK_TYPES:
|
|
1538
1708
|
|
|
1539
1709
|
Use --list-checks to see all available validation methods with examples.
|
|
1540
1710
|
|
|
1541
|
-
The default
|
|
1711
|
+
The default CHECK_TYPE is 'rows-distinct' which checks for duplicate rows.
|
|
1542
1712
|
|
|
1543
1713
|
\b
|
|
1544
1714
|
- rows-distinct: Check if all rows in the dataset are unique (no duplicates)
|
|
1545
1715
|
- rows-complete: Check if all rows are complete (no missing values in any column)
|
|
1546
1716
|
- col-exists: Check if a specific column exists in the dataset (requires --column)
|
|
1547
1717
|
- col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
|
|
1548
|
-
- col-vals-gt: Check if all values in a column are greater than a
|
|
1549
|
-
- col-vals-ge: Check if all values in a column are greater than or equal to a
|
|
1550
|
-
- col-vals-lt: Check if all values in a column are less than a
|
|
1551
|
-
- col-vals-le: Check if all values in a column are less than or equal to a
|
|
1718
|
+
- col-vals-gt: Check if all values in a column are greater than a comparison value (requires --column and --value)
|
|
1719
|
+
- col-vals-ge: Check if all values in a column are greater than or equal to a comparison value (requires --column and --value)
|
|
1720
|
+
- col-vals-lt: Check if all values in a column are less than a comparison value (requires --column and --value)
|
|
1721
|
+
- col-vals-le: Check if all values in a column are less than or equal to a comparison value (requires --column and --value)
|
|
1552
1722
|
- col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
|
|
1553
1723
|
|
|
1554
1724
|
Examples:
|
|
@@ -1571,28 +1741,7 @@ def validate(
|
|
|
1571
1741
|
pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
|
|
1572
1742
|
"""
|
|
1573
1743
|
try:
|
|
1574
|
-
# Handle
|
|
1575
|
-
import sys
|
|
1576
|
-
|
|
1577
|
-
# Convert parameter tuples to lists, handling default case
|
|
1578
|
-
if not checks:
|
|
1579
|
-
# No --check options provided, use default
|
|
1580
|
-
checks_list = ["rows-distinct"]
|
|
1581
|
-
is_using_default_check = True
|
|
1582
|
-
else:
|
|
1583
|
-
checks_list = list(checks)
|
|
1584
|
-
is_using_default_check = False
|
|
1585
|
-
|
|
1586
|
-
columns_list = list(columns) if columns else []
|
|
1587
|
-
sets_list = list(sets) if sets else []
|
|
1588
|
-
values_list = list(values) if values else []
|
|
1589
|
-
|
|
1590
|
-
# Map parameters to checks intelligently
|
|
1591
|
-
mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
|
|
1592
|
-
checks_list, columns_list, sets_list, values_list
|
|
1593
|
-
)
|
|
1594
|
-
|
|
1595
|
-
# Handle --list-checks option
|
|
1744
|
+
# Handle --list-checks option early (doesn't need data source)
|
|
1596
1745
|
if list_checks:
|
|
1597
1746
|
console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
|
|
1598
1747
|
console.print()
|
|
@@ -1616,14 +1765,16 @@ def validate(
|
|
|
1616
1765
|
"[bold magenta]Value comparison checks [bright_black](require --column and --value)[/bright_black]:[/bold magenta]"
|
|
1617
1766
|
)
|
|
1618
1767
|
console.print(
|
|
1619
|
-
" • [bold cyan]col-vals-gt[/bold cyan] Values greater than
|
|
1768
|
+
" • [bold cyan]col-vals-gt[/bold cyan] Values greater than comparison value"
|
|
1620
1769
|
)
|
|
1621
1770
|
console.print(
|
|
1622
|
-
" • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to
|
|
1771
|
+
" • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to comparison value"
|
|
1623
1772
|
)
|
|
1624
|
-
console.print(" • [bold cyan]col-vals-lt[/bold cyan] Values less than threshold")
|
|
1625
1773
|
console.print(
|
|
1626
|
-
" • [bold cyan]col-vals-
|
|
1774
|
+
" • [bold cyan]col-vals-lt[/bold cyan] Values less than comparison value"
|
|
1775
|
+
)
|
|
1776
|
+
console.print(
|
|
1777
|
+
" • [bold cyan]col-vals-le[/bold cyan] Values less than or equal to comparison value"
|
|
1627
1778
|
)
|
|
1628
1779
|
console.print()
|
|
1629
1780
|
console.print(
|
|
@@ -1634,19 +1785,47 @@ def validate(
|
|
|
1634
1785
|
)
|
|
1635
1786
|
console.print()
|
|
1636
1787
|
console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
|
|
1788
|
+
console.print(" [bright_blue]pb validate data.csv --check rows-distinct[/bright_blue]")
|
|
1637
1789
|
console.print(
|
|
1638
|
-
|
|
1639
|
-
)
|
|
1640
|
-
console.print(
|
|
1641
|
-
f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
|
|
1790
|
+
" [bright_blue]pb validate data.csv --check col-vals-not-null --column price[/bright_blue]"
|
|
1642
1791
|
)
|
|
1643
1792
|
console.print(
|
|
1644
|
-
|
|
1793
|
+
" [bright_blue]pb validate data.csv --check col-vals-gt --column age --value 18[/bright_blue]"
|
|
1645
1794
|
)
|
|
1646
1795
|
import sys
|
|
1647
1796
|
|
|
1648
1797
|
sys.exit(0)
|
|
1649
1798
|
|
|
1799
|
+
# Check if data_source is provided (required for all operations except --list-checks)
|
|
1800
|
+
if data_source is None:
|
|
1801
|
+
console.print("[red]Error:[/red] DATA_SOURCE is required")
|
|
1802
|
+
console.print("Use 'pb validate --help' for usage information")
|
|
1803
|
+
console.print("Or use 'pb validate --list-checks' to see available validation types")
|
|
1804
|
+
import sys
|
|
1805
|
+
|
|
1806
|
+
sys.exit(1)
|
|
1807
|
+
|
|
1808
|
+
# Handle backward compatibility and parameter conversion
|
|
1809
|
+
import sys
|
|
1810
|
+
|
|
1811
|
+
# Convert parameter tuples to lists, handling default case
|
|
1812
|
+
if not checks:
|
|
1813
|
+
# No --check options provided, use default
|
|
1814
|
+
checks_list = ["rows-distinct"]
|
|
1815
|
+
is_using_default_check = True
|
|
1816
|
+
else:
|
|
1817
|
+
checks_list = list(checks)
|
|
1818
|
+
is_using_default_check = False
|
|
1819
|
+
|
|
1820
|
+
columns_list = list(columns) if columns else []
|
|
1821
|
+
sets_list = list(sets) if sets else []
|
|
1822
|
+
values_list = list(values) if values else []
|
|
1823
|
+
|
|
1824
|
+
# Map parameters to checks intelligently
|
|
1825
|
+
mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
|
|
1826
|
+
checks_list, columns_list, sets_list, values_list
|
|
1827
|
+
)
|
|
1828
|
+
|
|
1650
1829
|
# Validate required parameters for different check types
|
|
1651
1830
|
# Check parameters for each check in the list using mapped parameters
|
|
1652
1831
|
for i, check in enumerate(checks_list):
|
|
@@ -1791,7 +1970,7 @@ def validate(
|
|
|
1791
1970
|
|
|
1792
1971
|
# Display results based on whether we have single or multiple checks
|
|
1793
1972
|
if len(checks_list) == 1:
|
|
1794
|
-
# Single check
|
|
1973
|
+
# Single check: use current display format
|
|
1795
1974
|
_display_validation_result(
|
|
1796
1975
|
validation,
|
|
1797
1976
|
checks_list,
|
|
@@ -1806,7 +1985,7 @@ def validate(
|
|
|
1806
1985
|
limit,
|
|
1807
1986
|
)
|
|
1808
1987
|
else:
|
|
1809
|
-
# Multiple checks
|
|
1988
|
+
# Multiple checks: use stacked display format
|
|
1810
1989
|
any_failed = False
|
|
1811
1990
|
for i in range(len(checks_list)):
|
|
1812
1991
|
console.print() # Add spacing between results
|
|
@@ -1845,7 +2024,7 @@ def validate(
|
|
|
1845
2024
|
console.print()
|
|
1846
2025
|
console.print("[bold magenta]Common validation options:[/bold magenta]")
|
|
1847
2026
|
console.print(
|
|
1848
|
-
" • [bold cyan]--check rows-complete[/bold cyan]
|
|
2027
|
+
" • [bold cyan]--check rows-complete[/bold cyan] Check for rows with missing values"
|
|
1849
2028
|
)
|
|
1850
2029
|
console.print(
|
|
1851
2030
|
" • [bold cyan]--check col-vals-not-null[/bold cyan] Check for null values in a column [bright_black](requires --column)[/bright_black]"
|
|
@@ -2070,7 +2249,7 @@ def _rich_print_scan_table(
|
|
|
2070
2249
|
# Clean up HTML formatting from the raw data
|
|
2071
2250
|
str_val = str(value)
|
|
2072
2251
|
|
|
2073
|
-
# Handle multi-line values with <br> tags FIRST
|
|
2252
|
+
# Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
|
|
2074
2253
|
if "<br>" in str_val:
|
|
2075
2254
|
str_val = str_val.split("<br>")[0].strip()
|
|
2076
2255
|
# For unique values, we want just the integer part
|
|
@@ -2089,14 +2268,14 @@ def _rich_print_scan_table(
|
|
|
2089
2268
|
# Clean up extra whitespace
|
|
2090
2269
|
str_val = re.sub(r"\s+", " ", str_val).strip()
|
|
2091
2270
|
|
|
2092
|
-
# Handle values like "2<.01"
|
|
2271
|
+
# Handle values like "2<.01": extract the first number
|
|
2093
2272
|
if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
|
|
2094
2273
|
# Extract number before the < symbol
|
|
2095
2274
|
before_lt = str_val.split("<")[0].strip()
|
|
2096
2275
|
if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
|
|
2097
2276
|
str_val = before_lt
|
|
2098
2277
|
|
|
2099
|
-
# Handle boolean unique values like "T0.62F0.38"
|
|
2278
|
+
# Handle boolean unique values like "T0.62F0.38": extract the more readable format
|
|
2100
2279
|
if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
|
|
2101
2280
|
# Extract T and F values
|
|
2102
2281
|
t_match = re.search(r"T(\d+\.\d+)", str_val)
|
|
@@ -2126,7 +2305,7 @@ def _rich_print_scan_table(
|
|
|
2126
2305
|
# Simple integers under 10000
|
|
2127
2306
|
return str(int(num_val))
|
|
2128
2307
|
elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
|
|
2129
|
-
# Likely dates in YYYYMMDD format
|
|
2308
|
+
# Likely dates in YYYYMMDD format: format as date-like
|
|
2130
2309
|
int_val = int(num_val)
|
|
2131
2310
|
if 19000101 <= int_val <= 29991231: # Reasonable date range
|
|
2132
2311
|
str_date = str(int_val)
|
|
@@ -2138,29 +2317,29 @@ def _rich_print_scan_table(
|
|
|
2138
2317
|
# Otherwise treat as large number
|
|
2139
2318
|
return f"{num_val / 1000000:.1f}M"
|
|
2140
2319
|
elif abs(num_val) >= 1000000:
|
|
2141
|
-
# Large numbers
|
|
2320
|
+
# Large numbers: use scientific notation or M/k notation
|
|
2142
2321
|
|
|
2143
2322
|
if abs(num_val) >= 1000000000:
|
|
2144
2323
|
return f"{num_val:.1e}"
|
|
2145
2324
|
else:
|
|
2146
2325
|
return f"{num_val / 1000000:.1f}M"
|
|
2147
2326
|
elif abs(num_val) >= 10000:
|
|
2148
|
-
# Numbers >= 10k
|
|
2327
|
+
# Numbers >= 10k: use compact notation
|
|
2149
2328
|
return f"{num_val / 1000:.1f}k"
|
|
2150
2329
|
elif abs(num_val) >= 100:
|
|
2151
|
-
# Numbers 100-9999
|
|
2330
|
+
# Numbers 100-9999: show with minimal decimals
|
|
2152
2331
|
return f"{num_val:.1f}"
|
|
2153
2332
|
elif abs(num_val) >= 10:
|
|
2154
|
-
# Numbers 10-99
|
|
2333
|
+
# Numbers 10-99: show with one decimal
|
|
2155
2334
|
return f"{num_val:.1f}"
|
|
2156
2335
|
elif abs(num_val) >= 1:
|
|
2157
|
-
# Numbers 1-9
|
|
2336
|
+
# Numbers 1-9: show with two decimals
|
|
2158
2337
|
return f"{num_val:.2f}"
|
|
2159
2338
|
elif abs(num_val) >= 0.01:
|
|
2160
|
-
# Small numbers
|
|
2339
|
+
# Small numbers: show with appropriate precision
|
|
2161
2340
|
return f"{num_val:.2f}"
|
|
2162
2341
|
else:
|
|
2163
|
-
# Very small numbers
|
|
2342
|
+
# Very small numbers: use scientific notation
|
|
2164
2343
|
|
|
2165
2344
|
return f"{num_val:.1e}"
|
|
2166
2345
|
|
|
@@ -2168,7 +2347,7 @@ def _rich_print_scan_table(
|
|
|
2168
2347
|
# Not a number, handle as string
|
|
2169
2348
|
pass
|
|
2170
2349
|
|
|
2171
|
-
# Handle date/datetime strings
|
|
2350
|
+
# Handle date/datetime strings: show abbreviated format
|
|
2172
2351
|
if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
|
|
2173
2352
|
# Likely a date/datetime, show abbreviated
|
|
2174
2353
|
if len(str_val) > max_width:
|
|
@@ -2528,7 +2707,7 @@ def _display_validation_result(
|
|
|
2528
2707
|
|
|
2529
2708
|
# Create friendly title for table
|
|
2530
2709
|
if total_checks == 1:
|
|
2531
|
-
# Single check
|
|
2710
|
+
# Single check: use original title format
|
|
2532
2711
|
if check == "rows-distinct":
|
|
2533
2712
|
table_title = "Validation Result: Rows Distinct"
|
|
2534
2713
|
elif check == "col-vals-not-null":
|
|
@@ -2550,7 +2729,7 @@ def _display_validation_result(
|
|
|
2550
2729
|
else:
|
|
2551
2730
|
table_title = f"Validation Result: {check.replace('-', ' ').title()}"
|
|
2552
2731
|
else:
|
|
2553
|
-
# Multiple checks
|
|
2732
|
+
# Multiple checks: add numbering
|
|
2554
2733
|
if check == "rows-distinct":
|
|
2555
2734
|
base_title = "Rows Distinct"
|
|
2556
2735
|
elif check == "col-vals-not-null":
|
|
@@ -2617,7 +2796,7 @@ def _display_validation_result(
|
|
|
2617
2796
|
operator = "<"
|
|
2618
2797
|
elif check == "col-vals-le":
|
|
2619
2798
|
operator = "<="
|
|
2620
|
-
result_table.add_row("
|
|
2799
|
+
result_table.add_row("Comparison Value", f"{operator} {value}")
|
|
2621
2800
|
|
|
2622
2801
|
# Get validation details
|
|
2623
2802
|
if step_info:
|
|
@@ -2728,6 +2907,7 @@ def _display_validation_result(
|
|
|
2728
2907
|
Panel(
|
|
2729
2908
|
success_message,
|
|
2730
2909
|
border_style="green",
|
|
2910
|
+
expand=False,
|
|
2731
2911
|
)
|
|
2732
2912
|
)
|
|
2733
2913
|
else:
|
|
@@ -2757,6 +2937,7 @@ def _display_validation_result(
|
|
|
2757
2937
|
Panel(
|
|
2758
2938
|
failure_message,
|
|
2759
2939
|
border_style="red",
|
|
2940
|
+
expand=False,
|
|
2760
2941
|
)
|
|
2761
2942
|
)
|
|
2762
2943
|
|
|
@@ -2837,7 +3018,7 @@ def _show_extract_for_multi_check(
|
|
|
2837
3018
|
console.print()
|
|
2838
3019
|
console.print(extract_message)
|
|
2839
3020
|
|
|
2840
|
-
# Special handling for col-exists check
|
|
3021
|
+
# Special handling for col-exists check: no rows to show when column doesn't exist
|
|
2841
3022
|
if check == "col-exists":
|
|
2842
3023
|
if show_extract:
|
|
2843
3024
|
console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
|
|
@@ -2848,16 +3029,17 @@ def _show_extract_for_multi_check(
|
|
|
2848
3029
|
console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
|
|
2849
3030
|
else:
|
|
2850
3031
|
try:
|
|
2851
|
-
# Get failing rows extract
|
|
3032
|
+
# Get failing rows extract: use step_index + 1 since extracts are 1-indexed
|
|
2852
3033
|
failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
|
|
2853
3034
|
|
|
2854
3035
|
if failing_rows is not None and len(failing_rows) > 0:
|
|
2855
3036
|
if show_extract:
|
|
2856
|
-
#
|
|
2857
|
-
|
|
2858
|
-
|
|
3037
|
+
# Always limit to 10 rows for display, regardless of limit option
|
|
3038
|
+
display_limit = 10
|
|
3039
|
+
if len(failing_rows) > display_limit:
|
|
3040
|
+
display_rows = failing_rows.head(display_limit)
|
|
2859
3041
|
console.print(
|
|
2860
|
-
f"[dim]Showing first {
|
|
3042
|
+
f"[dim]Showing first {display_limit} of {len(failing_rows)} {row_type}[/dim]"
|
|
2861
3043
|
)
|
|
2862
3044
|
else:
|
|
2863
3045
|
display_rows = failing_rows
|
|
@@ -2868,9 +3050,9 @@ def _show_extract_for_multi_check(
|
|
|
2868
3050
|
|
|
2869
3051
|
preview_table = pb.preview(
|
|
2870
3052
|
data=display_rows,
|
|
2871
|
-
n_head=min(
|
|
3053
|
+
n_head=min(display_limit, len(display_rows)),
|
|
2872
3054
|
n_tail=0,
|
|
2873
|
-
limit=
|
|
3055
|
+
limit=display_limit,
|
|
2874
3056
|
show_row_numbers=True,
|
|
2875
3057
|
)
|
|
2876
3058
|
|
|
@@ -2892,7 +3074,7 @@ def _show_extract_for_multi_check(
|
|
|
2892
3074
|
filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
|
|
2893
3075
|
filepath = output_folder / filename
|
|
2894
3076
|
|
|
2895
|
-
#
|
|
3077
|
+
# Use limit option for write_extract
|
|
2896
3078
|
write_rows = failing_rows
|
|
2897
3079
|
if len(failing_rows) > limit:
|
|
2898
3080
|
write_rows = failing_rows.head(limit)
|
|
@@ -2997,7 +3179,7 @@ def _show_extract_and_summary(
|
|
|
2997
3179
|
if show_extract:
|
|
2998
3180
|
console.print(extract_message)
|
|
2999
3181
|
|
|
3000
|
-
# Special handling for col-exists check
|
|
3182
|
+
# Special handling for col-exists check: no rows to show when column doesn't exist
|
|
3001
3183
|
if check == "col-exists" and not step_passed:
|
|
3002
3184
|
if show_extract:
|
|
3003
3185
|
console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
|
|
@@ -3008,16 +3190,17 @@ def _show_extract_and_summary(
|
|
|
3008
3190
|
console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
|
|
3009
3191
|
else:
|
|
3010
3192
|
try:
|
|
3011
|
-
# Get failing rows extract
|
|
3193
|
+
# Get failing rows extract: use step_index + 1 since extracts are 1-indexed
|
|
3012
3194
|
failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
|
|
3013
3195
|
|
|
3014
3196
|
if failing_rows is not None and len(failing_rows) > 0:
|
|
3015
3197
|
if show_extract:
|
|
3016
|
-
#
|
|
3017
|
-
|
|
3018
|
-
|
|
3198
|
+
# Always limit to 10 rows for display, regardless of limit option
|
|
3199
|
+
display_limit = 10
|
|
3200
|
+
if len(failing_rows) > display_limit:
|
|
3201
|
+
display_rows = failing_rows.head(display_limit)
|
|
3019
3202
|
console.print(
|
|
3020
|
-
f"[dim]Showing first {
|
|
3203
|
+
f"[dim]Showing first {display_limit} of {len(failing_rows)} {row_type}[/dim]"
|
|
3021
3204
|
)
|
|
3022
3205
|
else:
|
|
3023
3206
|
display_rows = failing_rows
|
|
@@ -3028,9 +3211,9 @@ def _show_extract_and_summary(
|
|
|
3028
3211
|
|
|
3029
3212
|
preview_table = pb.preview(
|
|
3030
3213
|
data=display_rows,
|
|
3031
|
-
n_head=min(
|
|
3214
|
+
n_head=min(display_limit, len(display_rows)),
|
|
3032
3215
|
n_tail=0,
|
|
3033
|
-
limit=
|
|
3216
|
+
limit=display_limit,
|
|
3034
3217
|
show_row_numbers=True,
|
|
3035
3218
|
)
|
|
3036
3219
|
|
|
@@ -3052,7 +3235,7 @@ def _show_extract_and_summary(
|
|
|
3052
3235
|
filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
|
|
3053
3236
|
filepath = output_folder / filename
|
|
3054
3237
|
|
|
3055
|
-
#
|
|
3238
|
+
# Use limit option for write_extract
|
|
3056
3239
|
write_rows = failing_rows
|
|
3057
3240
|
if len(failing_rows) > limit:
|
|
3058
3241
|
write_rows = failing_rows.head(limit)
|
|
@@ -3123,7 +3306,7 @@ def _show_extract_and_summary(
|
|
|
3123
3306
|
f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
|
|
3124
3307
|
)
|
|
3125
3308
|
|
|
3126
|
-
console.print(Panel(success_message, border_style="green"))
|
|
3309
|
+
console.print(Panel(success_message, border_style="green", expand=False))
|
|
3127
3310
|
else:
|
|
3128
3311
|
if step_info:
|
|
3129
3312
|
if check == "rows-distinct":
|
|
@@ -3151,7 +3334,7 @@ def _show_extract_and_summary(
|
|
|
3151
3334
|
if not show_extract and check != "col-exists":
|
|
3152
3335
|
failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
|
|
3153
3336
|
|
|
3154
|
-
console.print(Panel(failure_message, border_style="red"))
|
|
3337
|
+
console.print(Panel(failure_message, border_style="red", expand=False))
|
|
3155
3338
|
else:
|
|
3156
3339
|
if check == "rows-distinct":
|
|
3157
3340
|
failure_message = (
|
|
@@ -3170,7 +3353,7 @@ def _show_extract_and_summary(
|
|
|
3170
3353
|
if not show_extract:
|
|
3171
3354
|
failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
|
|
3172
3355
|
|
|
3173
|
-
console.print(Panel(failure_message, border_style="red"))
|
|
3356
|
+
console.print(Panel(failure_message, border_style="red", expand=False))
|
|
3174
3357
|
|
|
3175
3358
|
|
|
3176
3359
|
@cli.command()
|
|
@@ -3196,6 +3379,9 @@ Example Pointblank validation script.
|
|
|
3196
3379
|
|
|
3197
3380
|
This script demonstrates how to create validation rules for your data.
|
|
3198
3381
|
Modify the data loading and validation rules below to match your requirements.
|
|
3382
|
+
|
|
3383
|
+
When using 'pb run' with --data option, the CLI will automatically replace
|
|
3384
|
+
the data source in your validation object with the provided data.
|
|
3199
3385
|
"""
|
|
3200
3386
|
|
|
3201
3387
|
import pointblank as pb
|
|
@@ -3239,11 +3425,6 @@ validation = (
|
|
|
3239
3425
|
# Finalize the validation
|
|
3240
3426
|
.interrogate()
|
|
3241
3427
|
)
|
|
3242
|
-
|
|
3243
|
-
# The validation object will be automatically used by the CLI
|
|
3244
|
-
# You can also access results programmatically:
|
|
3245
|
-
# print(f"All passed: {validation.all_passed()}")
|
|
3246
|
-
# print(f"Failed steps: {validation.n_failed()}")
|
|
3247
3428
|
'''
|
|
3248
3429
|
|
|
3249
3430
|
Path(output_file).write_text(example_script)
|
|
@@ -3251,13 +3432,17 @@ validation = (
|
|
|
3251
3432
|
console.print("\nEdit the template to add your data loading and validation rules, then run:")
|
|
3252
3433
|
console.print(f"[cyan]pb run {output_file}[/cyan]")
|
|
3253
3434
|
console.print(
|
|
3254
|
-
f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]#
|
|
3435
|
+
f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Replace data source automatically[/dim]"
|
|
3255
3436
|
)
|
|
3256
3437
|
|
|
3257
3438
|
|
|
3258
3439
|
@cli.command()
|
|
3259
3440
|
@click.argument("validation_script", type=click.Path(exists=True))
|
|
3260
|
-
@click.option(
|
|
3441
|
+
@click.option(
|
|
3442
|
+
"--data",
|
|
3443
|
+
type=str,
|
|
3444
|
+
help="Data source to replace in validation objects (single validation scripts only)",
|
|
3445
|
+
)
|
|
3261
3446
|
@click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
|
|
3262
3447
|
@click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
|
|
3263
3448
|
@click.option(
|
|
@@ -3269,7 +3454,7 @@ validation = (
|
|
|
3269
3454
|
help="Save failing rows to folders (one CSV per step). Provide base name for folder.",
|
|
3270
3455
|
)
|
|
3271
3456
|
@click.option(
|
|
3272
|
-
"--limit",
|
|
3457
|
+
"--limit", default=500, help="Maximum number of failing rows to save to CSV (default: 500)"
|
|
3273
3458
|
)
|
|
3274
3459
|
@click.option(
|
|
3275
3460
|
"--fail-on",
|
|
@@ -3292,8 +3477,11 @@ def run(
|
|
|
3292
3477
|
VALIDATION_SCRIPT should be a Python file that defines validation logic.
|
|
3293
3478
|
The script should load its own data and create validation objects.
|
|
3294
3479
|
|
|
3295
|
-
If --data is provided, it will
|
|
3296
|
-
|
|
3480
|
+
If --data is provided, it will automatically replace the data source in your
|
|
3481
|
+
validation objects. This works with scripts containing a single validation.
|
|
3482
|
+
For scripts with multiple validations, use separate script files or remove --data.
|
|
3483
|
+
|
|
3484
|
+
To get started quickly, use 'pb make-template' to create a validation script template.
|
|
3297
3485
|
|
|
3298
3486
|
DATA can be:
|
|
3299
3487
|
|
|
@@ -3307,6 +3495,7 @@ def run(
|
|
|
3307
3495
|
Examples:
|
|
3308
3496
|
|
|
3309
3497
|
\b
|
|
3498
|
+
pb make-template my_validation.py # Create a template first
|
|
3310
3499
|
pb run validation_script.py
|
|
3311
3500
|
pb run validation_script.py --data data.csv
|
|
3312
3501
|
pb run validation_script.py --data small_table --output-html report.html
|
|
@@ -3369,6 +3558,72 @@ def run(
|
|
|
3369
3558
|
|
|
3370
3559
|
console.print(f"[green]✓[/green] Found {len(validations)} validation object(s)")
|
|
3371
3560
|
|
|
3561
|
+
# Implement automatic data replacement for Validate objects if --data was provided
|
|
3562
|
+
if cli_data is not None:
|
|
3563
|
+
# Check if we have multiple validations (this is not supported)
|
|
3564
|
+
if len(validations) > 1:
|
|
3565
|
+
console.print(
|
|
3566
|
+
f"[red]Error: Found {len(validations)} validation objects in the script.[/red]"
|
|
3567
|
+
)
|
|
3568
|
+
console.print(
|
|
3569
|
+
"[yellow]The --data option replaces data in ALL validation objects,[/yellow]"
|
|
3570
|
+
)
|
|
3571
|
+
console.print(
|
|
3572
|
+
"[yellow]which may cause failures if validations expect different schemas.[/yellow]"
|
|
3573
|
+
)
|
|
3574
|
+
console.print("\n[cyan]Options:[/cyan]")
|
|
3575
|
+
console.print(" 1. Split your script into separate files with one validation each")
|
|
3576
|
+
console.print(
|
|
3577
|
+
" 2. Remove the --data option to use each validation's original data"
|
|
3578
|
+
)
|
|
3579
|
+
sys.exit(1)
|
|
3580
|
+
|
|
3581
|
+
console.print(
|
|
3582
|
+
f"[yellow]Replacing data in {len(validations)} validation object(s) with CLI data[/yellow]"
|
|
3583
|
+
)
|
|
3584
|
+
|
|
3585
|
+
for idx, validation in enumerate(validations, 1):
|
|
3586
|
+
# Check if it's a Validate object with data attribute
|
|
3587
|
+
if hasattr(validation, "data") and hasattr(validation, "interrogate"):
|
|
3588
|
+
console.print("[cyan]Updating validation with new data source...[/cyan]")
|
|
3589
|
+
|
|
3590
|
+
# Store the original validation_info as our "plan"
|
|
3591
|
+
original_validation_info = validation.validation_info.copy()
|
|
3592
|
+
|
|
3593
|
+
# Replace the data
|
|
3594
|
+
validation.data = cli_data
|
|
3595
|
+
|
|
3596
|
+
# Re-process the data (same as what happens in __post_init__)
|
|
3597
|
+
from pointblank.validate import _process_data
|
|
3598
|
+
|
|
3599
|
+
validation.data = _process_data(validation.data)
|
|
3600
|
+
|
|
3601
|
+
# Reset validation results but keep the plan
|
|
3602
|
+
validation.validation_info = []
|
|
3603
|
+
|
|
3604
|
+
# Re-add each validation step from the original plan
|
|
3605
|
+
for val_info in original_validation_info:
|
|
3606
|
+
# Create a copy and reset any interrogation results
|
|
3607
|
+
new_val_info = copy.deepcopy(val_info)
|
|
3608
|
+
# Reset interrogation-specific attributes if they exist
|
|
3609
|
+
if hasattr(new_val_info, "n_passed"):
|
|
3610
|
+
new_val_info.n_passed = None
|
|
3611
|
+
if hasattr(new_val_info, "n_failed"):
|
|
3612
|
+
new_val_info.n_failed = None
|
|
3613
|
+
if hasattr(new_val_info, "all_passed"):
|
|
3614
|
+
new_val_info.all_passed = None
|
|
3615
|
+
if hasattr(new_val_info, "warning"):
|
|
3616
|
+
new_val_info.warning = None
|
|
3617
|
+
if hasattr(new_val_info, "error"):
|
|
3618
|
+
new_val_info.error = None
|
|
3619
|
+
if hasattr(new_val_info, "critical"):
|
|
3620
|
+
new_val_info.critical = None
|
|
3621
|
+
validation.validation_info.append(new_val_info)
|
|
3622
|
+
|
|
3623
|
+
# Re-interrogate with the new data
|
|
3624
|
+
console.print("[cyan]Re-interrogating with new data...[/cyan]")
|
|
3625
|
+
validation.interrogate()
|
|
3626
|
+
|
|
3372
3627
|
# Process each validation
|
|
3373
3628
|
overall_failed = False
|
|
3374
3629
|
overall_critical = False
|
|
@@ -3432,11 +3687,12 @@ def run(
|
|
|
3432
3687
|
f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
|
|
3433
3688
|
)
|
|
3434
3689
|
|
|
3435
|
-
#
|
|
3436
|
-
|
|
3437
|
-
|
|
3690
|
+
# Always limit to 10 rows for display, regardless of limit option
|
|
3691
|
+
display_limit = 10
|
|
3692
|
+
if len(failing_rows) > display_limit:
|
|
3693
|
+
display_rows = failing_rows.head(display_limit)
|
|
3438
3694
|
console.print(
|
|
3439
|
-
f"[dim]Showing first {
|
|
3695
|
+
f"[dim]Showing first {display_limit} of {len(failing_rows)} failing rows[/dim]"
|
|
3440
3696
|
)
|
|
3441
3697
|
else:
|
|
3442
3698
|
display_rows = failing_rows
|
|
@@ -3447,9 +3703,9 @@ def run(
|
|
|
3447
3703
|
# Create a preview table using pointblank's preview function
|
|
3448
3704
|
preview_table = pb.preview(
|
|
3449
3705
|
data=display_rows,
|
|
3450
|
-
n_head=min(
|
|
3706
|
+
n_head=min(display_limit, len(display_rows)),
|
|
3451
3707
|
n_tail=0,
|
|
3452
|
-
limit=
|
|
3708
|
+
limit=display_limit,
|
|
3453
3709
|
show_row_numbers=True,
|
|
3454
3710
|
)
|
|
3455
3711
|
|
|
@@ -3502,7 +3758,7 @@ def run(
|
|
|
3502
3758
|
filename = f"step_{step_num:02d}_{safe_assertion_type}.csv"
|
|
3503
3759
|
filepath = output_folder / filename
|
|
3504
3760
|
|
|
3505
|
-
#
|
|
3761
|
+
# Use limit for CSV output
|
|
3506
3762
|
save_rows = failing_rows
|
|
3507
3763
|
if hasattr(failing_rows, "head") and len(failing_rows) > limit:
|
|
3508
3764
|
save_rows = failing_rows.head(limit)
|
|
@@ -3521,7 +3777,11 @@ def run(
|
|
|
3521
3777
|
pd_data = pd.DataFrame(save_rows)
|
|
3522
3778
|
pd_data.to_csv(str(filepath), index=False)
|
|
3523
3779
|
|
|
3524
|
-
|
|
3780
|
+
# Record the actual number of rows saved
|
|
3781
|
+
rows_saved = (
|
|
3782
|
+
len(save_rows) if hasattr(save_rows, "__len__") else limit
|
|
3783
|
+
)
|
|
3784
|
+
saved_files.append((filename, rows_saved))
|
|
3525
3785
|
|
|
3526
3786
|
except Exception as e:
|
|
3527
3787
|
console.print(
|
|
@@ -3548,11 +3808,11 @@ def run(
|
|
|
3548
3808
|
if output_html:
|
|
3549
3809
|
try:
|
|
3550
3810
|
if len(validations) == 1:
|
|
3551
|
-
# Single validation
|
|
3811
|
+
# Single validation: save directly
|
|
3552
3812
|
html_content = validations[0]._repr_html_()
|
|
3553
3813
|
Path(output_html).write_text(html_content, encoding="utf-8")
|
|
3554
3814
|
else:
|
|
3555
|
-
# Multiple validations
|
|
3815
|
+
# Multiple validations: combine them
|
|
3556
3816
|
html_parts = []
|
|
3557
3817
|
html_parts.append("<html><body>")
|
|
3558
3818
|
html_parts.append("<h1>Pointblank Validation Report</h1>")
|
|
@@ -3572,11 +3832,11 @@ def run(
|
|
|
3572
3832
|
if output_json:
|
|
3573
3833
|
try:
|
|
3574
3834
|
if len(validations) == 1:
|
|
3575
|
-
# Single validation
|
|
3835
|
+
# Single validation: save directly
|
|
3576
3836
|
json_report = validations[0].get_json_report()
|
|
3577
3837
|
Path(output_json).write_text(json_report, encoding="utf-8")
|
|
3578
3838
|
else:
|
|
3579
|
-
# Multiple validations
|
|
3839
|
+
# Multiple validations: combine them
|
|
3580
3840
|
import json
|
|
3581
3841
|
|
|
3582
3842
|
combined_report = {"validations": []}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pointblank
|
|
3
|
-
Version: 0.11.
|
|
3
|
+
Version: 0.11.2
|
|
4
4
|
Summary: Find out if your data is what you think it is.
|
|
5
5
|
Author-email: Richard Iannone <riannone@me.com>
|
|
6
6
|
License: MIT License
|
|
@@ -156,11 +156,11 @@ validation
|
|
|
156
156
|
|
|
157
157
|
## Why Choose Pointblank?
|
|
158
158
|
|
|
159
|
-
- **Works with your existing stack
|
|
160
|
-
- **Beautiful, interactive reports
|
|
161
|
-
- **Composable validation pipeline
|
|
162
|
-
- **Threshold-based alerts
|
|
163
|
-
- **Practical outputs
|
|
159
|
+
- **Works with your existing stack**: Seamlessly integrates with Polars, Pandas, DuckDB, MySQL, PostgreSQL, SQLite, Parquet, PySpark, Snowflake, and more!
|
|
160
|
+
- **Beautiful, interactive reports**: Crystal-clear validation results that highlight issues and help communicate data quality
|
|
161
|
+
- **Composable validation pipeline**: Chain validation steps into a complete data quality workflow
|
|
162
|
+
- **Threshold-based alerts**: Set 'warning', 'error', and 'critical' thresholds with custom actions
|
|
163
|
+
- **Practical outputs**: Use validation results to filter tables, extract problematic data, or trigger downstream processes
|
|
164
164
|
|
|
165
165
|
## Real-World Example
|
|
166
166
|
|
|
@@ -240,7 +240,7 @@ validation.get_step_report(i=3).show("browser") # Get failing records from step
|
|
|
240
240
|
Pointblank includes a powerful CLI utility called `pb` that lets you run data validation workflows directly from the command line. Perfect for CI/CD pipelines, scheduled data quality checks, or quick validation tasks.
|
|
241
241
|
|
|
242
242
|
<div align="center">
|
|
243
|
-
<img src="https://posit-dev.github.io/pointblank/assets/vhs/cli-complete-workflow.gif" width="
|
|
243
|
+
<img src="https://posit-dev.github.io/pointblank/assets/vhs/cli-complete-workflow.gif" width="100%">
|
|
244
244
|
</div>
|
|
245
245
|
|
|
246
246
|
**Explore Your Data**
|
|
@@ -279,19 +279,17 @@ pb validate small_table --check col-vals-gt --column a --value 5 --show-extract
|
|
|
279
279
|
|
|
280
280
|
```bash
|
|
281
281
|
# Use exit codes for automation (0 = pass, 1 = fail)
|
|
282
|
-
pb validate small_table --check rows-distinct
|
|
282
|
+
pb validate small_table --check rows-distinct --exit-code
|
|
283
283
|
```
|
|
284
284
|
|
|
285
|
-
Learn more in our [CLI documentation](https://posit-dev.github.io/pointblank/user-guide/cli.html).
|
|
286
|
-
|
|
287
285
|
## Features That Set Pointblank Apart
|
|
288
286
|
|
|
289
|
-
- **Complete validation workflow
|
|
290
|
-
- **Built for collaboration
|
|
291
|
-
- **Practical outputs
|
|
292
|
-
- **Flexible deployment
|
|
293
|
-
- **Customizable
|
|
294
|
-
- **Internationalization
|
|
287
|
+
- **Complete validation workflow**: From data access to validation to reporting in a single pipeline
|
|
288
|
+
- **Built for collaboration**: Share results with colleagues through beautiful interactive reports
|
|
289
|
+
- **Practical outputs**: Get exactly what you need: counts, extracts, summaries, or full reports
|
|
290
|
+
- **Flexible deployment**: Use in notebooks, scripts, or data pipelines
|
|
291
|
+
- **Customizable**: Tailor validation steps and reporting to your specific needs
|
|
292
|
+
- **Internationalization**: Reports can be generated in over 20 languages, including English, Spanish, French, and German
|
|
295
293
|
|
|
296
294
|
## Documentation and Examples
|
|
297
295
|
|
|
@@ -10,7 +10,7 @@ pointblank/_utils_check_args.py,sha256=rFEc1nbCN8ftsQQWVjCNWmQ2QmUDxkfgmoJclrZeT
|
|
|
10
10
|
pointblank/_utils_html.py,sha256=uJWvS9JwQVEZgwsGmScA_u_EBRND75rzUvnJPalbRVs,3731
|
|
11
11
|
pointblank/actions.py,sha256=D6o9B2_ES9PNQg9HZwREacrrt-3A5bhdrBkL1UXz__s,18281
|
|
12
12
|
pointblank/assistant.py,sha256=YsQ9U1wacVIuYFRIJ4maBbBDTzEQPzirhUUPgySosM4,15428
|
|
13
|
-
pointblank/cli.py,sha256=
|
|
13
|
+
pointblank/cli.py,sha256=jkevhsMpSQMqG1rNqfjNpOffqVcqzJYb_6knoOR22-g,169757
|
|
14
14
|
pointblank/column.py,sha256=_FJjpjv760D1p6YGgqbwmKYktouG7AJ2A9uIMYQBTYA,76560
|
|
15
15
|
pointblank/compare.py,sha256=kFd18CehHz7g-2MF1kSmJSdOoAP80q_9PaF6QzHC1ds,866
|
|
16
16
|
pointblank/datascan.py,sha256=nmTcRLW8nAZfvRS_Nf00Wgx4oUX-o6WFOZqLDbedbu8,24563
|
|
@@ -31,9 +31,9 @@ pointblank/data/nycflights.zip,sha256=yVjbUaKUz2LydSdF9cABuir0VReHBBgV7shiNWSd0m
|
|
|
31
31
|
pointblank/data/polars-api-docs.txt,sha256=KGcS-BOtUs9zgpkWfXD-GFdFh4O_zjdkpX7msHjztLg,198045
|
|
32
32
|
pointblank/data/small_table-duckdb.zip,sha256=BhTaZ2CRS4-9Z1uVhOU6HggvW3XCar7etMznfENIcOc,2028
|
|
33
33
|
pointblank/data/small_table.zip,sha256=lmFb90Nb-v5X559Ikjg31YLAXuRyMkD9yLRElkXPMzQ,472
|
|
34
|
-
pointblank-0.11.
|
|
35
|
-
pointblank-0.11.
|
|
36
|
-
pointblank-0.11.
|
|
37
|
-
pointblank-0.11.
|
|
38
|
-
pointblank-0.11.
|
|
39
|
-
pointblank-0.11.
|
|
34
|
+
pointblank-0.11.2.dist-info/licenses/LICENSE,sha256=apLF-HWPNU7pT5bmf5KmZpD5Cklpy2u-BN_0xBoRMLY,1081
|
|
35
|
+
pointblank-0.11.2.dist-info/METADATA,sha256=qe_reU_6Jidz8zPSUzp_ohcCcDyOSBs72CxINJDxoPU,16473
|
|
36
|
+
pointblank-0.11.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
37
|
+
pointblank-0.11.2.dist-info/entry_points.txt,sha256=GqqqOTOH8uZe22wLcvYjzpizqk_j4MNcUo2YM14ryCw,42
|
|
38
|
+
pointblank-0.11.2.dist-info/top_level.txt,sha256=-wHrS1SvV8-nhvc3w-PPYs1C1WtEc1pK-eGjubbCCKc,11
|
|
39
|
+
pointblank-0.11.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|