pointblank 0.9.6__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +4 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +126 -0
- pointblank/_utils_html.py +40 -0
- pointblank/assistant.py +1 -3
- pointblank/cli.py +2737 -0
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +518 -125
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/validate.py +1425 -202
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/METADATA +49 -3
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/RECORD +20 -14
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/WHEEL +1 -1
- pointblank-0.11.0.dist-info/entry_points.txt +2 -0
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/top_level.txt +0 -0
pointblank/cli.py
ADDED
@@ -0,0 +1,2737 @@
from __future__ import annotations

import sys
from pathlib import Path
from typing import Any

import click
from rich.console import Console
from rich.panel import Panel
from rich.table import Table

import pointblank as pb
from pointblank._utils import _get_tbl_type, _is_lib_present

console = Console()


def _format_cell_value(
    value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
) -> str:
    """Format a cell value for Rich table display, highlighting None/NA values in red.

    Args:
        value: The raw cell value from the dataframe
        is_row_number: Whether this is a row number column value
        max_width: Maximum character width for text truncation
        num_columns: Number of columns in the table (affects truncation aggressiveness)

    Returns:
        Formatted string with Rich markup for None/NA values or row numbers
    """
    # Special formatting for row numbers: never truncate them
    if is_row_number:
        return f"[dim]{value}[/dim]"

    # Check for actual None/null values (not string representations)
    if value is None:
        return "[red]None[/red]"

    # Check for pandas/numpy specific NA values
    try:
        import numpy as np
        import pandas as pd

        # Check for pandas NA
        if pd.isna(value):
            # If it's specifically numpy.nan, show as NaN
            if isinstance(value, float) and np.isnan(value):
                return "[red]NaN[/red]"
            # If it's pandas NA, show as NA
            elif str(type(value)).find("pandas") != -1:
                return "[red]NA[/red]"
            # Generic NA for other pandas missing values
            else:
                return "[red]NA[/red]"

    except (ImportError, TypeError, ValueError):  # pragma: no cover
        # If pandas/numpy not available, value not compatible, or ambiguous array
        pass

    # Check for empty strings (but only actual empty strings, not whitespace)
    if isinstance(value, str) and value == "":
        return "[red][/red]"  # Empty string shown as red empty space

    # Convert to string and apply intelligent truncation
    str_value = str(value)

    # Adjust max_width based on number of columns to prevent overly wide tables
    if num_columns > 15:
        adjusted_max_width = min(max_width, 30)  # Be more aggressive with many columns
    elif num_columns > 10:
        adjusted_max_width = min(max_width, 40)
    else:
        adjusted_max_width = max_width

    # Apply truncation if the string is too long
    if len(str_value) > adjusted_max_width:
        # For very long text, truncate more aggressively
        if len(str_value) > adjusted_max_width * 2:
            # For extremely long text, use a shorter truncation
            truncated = str_value[: adjusted_max_width // 2] + "…"
        else:
            # For moderately long text, use a more generous truncation
            truncated = str_value[: adjusted_max_width - 1] + "…"

        return truncated

    return str_value
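
Note: the None/NA branches above are easiest to follow by example. A quick sketch of the expected markup, inferred from the code itself rather than from the package's tests:

    _format_cell_value(None)                       # "[red]None[/red]"
    _format_cell_value(float("nan"))               # "[red]NaN[/red]" (pd.isna + np.isnan path)
    _format_cell_value("")                         # "[red][/red]"
    _format_cell_value(42, is_row_number=True)     # "[dim]42[/dim]", never truncated
    _format_cell_value("x" * 200, num_columns=20)  # first 15 chars + "…" (width capped at 30)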


def _get_column_dtypes(df: Any, columns: list[str]) -> dict[str, str]:
    """Extract data types for columns and format them in a compact way.

    Args:
        df: The dataframe object
        columns: List of column names

    Returns:
        Dictionary mapping column names to formatted data type strings
    """
    dtypes_dict = {}

    try:
        if hasattr(df, "dtypes"):
            # Polars/Pandas style
            if hasattr(df.dtypes, "to_dict"):
                # Polars DataFrame dtypes
                raw_dtypes = df.dtypes.to_dict() if hasattr(df.dtypes, "to_dict") else {}
                for col in columns:
                    if col in raw_dtypes:
                        dtype_str = str(raw_dtypes[col])
                        # Convert to compact format similar to Polars glimpse()
                        dtypes_dict[col] = _format_dtype_compact(dtype_str)
                    else:
                        dtypes_dict[col] = "?"
            else:
                # Pandas DataFrame dtypes (Series-like)
                for i, col in enumerate(columns):
                    if i < len(df.dtypes):
                        dtype_str = str(
                            df.dtypes.iloc[i] if hasattr(df.dtypes, "iloc") else df.dtypes[i]
                        )
                        dtypes_dict[col] = _format_dtype_compact(dtype_str)
                    else:
                        dtypes_dict[col] = "?"
        elif hasattr(df, "schema"):
            # Other schema-based systems (e.g., Ibis)
            schema = df.schema
            if hasattr(schema, "to_dict"):  # pragma: no cover
                raw_dtypes = schema.to_dict()
                for col in columns:
                    if col in raw_dtypes:
                        dtypes_dict[col] = _format_dtype_compact(str(raw_dtypes[col]))
                    else:  # pragma: no cover
                        dtypes_dict[col] = "?"
            else:  # pragma: no cover
                for col in columns:
                    try:
                        dtype_str = str(getattr(schema, col, "Unknown"))
                        dtypes_dict[col] = _format_dtype_compact(dtype_str)
                    except Exception:  # pragma: no cover
                        dtypes_dict[col] = "?"
        else:
            # Fallback: no type information available
            for col in columns:
                dtypes_dict[col] = "?"

    except Exception:  # pragma: no cover
        # If any error occurs, fall back to unknown types
        for col in columns:
            dtypes_dict[col] = "?"

    return dtypes_dict


def _format_dtype_compact(dtype_str: str) -> str:
    """Format a data type string to a compact representation.

    Args:
        dtype_str: The raw data type string

    Returns:
        Compact formatted data type string
    """
    # Remove common prefixes and make compact
    dtype_str = dtype_str.lower()

    # Polars types
    if "utf8" in dtype_str or "string" in dtype_str:
        return "str"
    elif "int64" in dtype_str:
        return "i64"
    elif "int32" in dtype_str:
        return "i32"
    elif "float64" in dtype_str:
        return "f64"
    elif "float32" in dtype_str:
        return "f32"
    elif "boolean" in dtype_str or "bool" in dtype_str:
        return "bool"
    elif "datetime" in dtype_str:
        return "datetime"
    elif "date" in dtype_str and "datetime" not in dtype_str:
        return "date"
    elif "time" in dtype_str:
        return "time"

    # Pandas types
    elif "object" in dtype_str:
        return "obj"
    elif "category" in dtype_str:
        return "cat"

    # Generic fallbacks
    elif "int" in dtype_str:
        return "int"
    elif "float" in dtype_str:
        return "float"
    elif "str" in dtype_str:
        return "str"

    # Unknown or complex types - truncate if too long
    elif len(dtype_str) > 8:
        return dtype_str[:8] + "…"
    else:
        return dtype_str
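
Note: `_format_dtype_compact` matches lowercased substrings, so branch order matters ("int64" is checked before the generic "int"). An illustrative sketch of the mapping implied by the branches above:

    _format_dtype_compact("Int64")                     # "i64"
    _format_dtype_compact("Utf8")                      # "str"
    _format_dtype_compact("Datetime(time_unit='us')")  # "datetime"
    _format_dtype_compact("category")                  # "cat"
    _format_dtype_compact("decimal128(38, 9)")         # "decimal1…" (unknown, truncated at 8 chars)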


def _rich_print_gt_table(gt_table: Any, preview_info: dict | None = None) -> None:
    """Convert a GT table to Rich table and display it in the terminal.

    Args:
        gt_table: The GT table object to display
        preview_info: Optional dict with preview context info:
            - total_rows: Total rows in the dataset
            - head_rows: Number of head rows shown
            - tail_rows: Number of tail rows shown
            - is_complete: Whether the entire dataset is shown
    """
    try:
        # Try to extract the underlying data from the GT table
        df = None

        # Great Tables stores the original data in different places depending on how it was created
        # Let's try multiple approaches to get the data
        if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
            df = gt_table._tbl_data
        elif (
            hasattr(gt_table, "_body")
            and hasattr(gt_table._body, "body")
            and gt_table._body.body is not None
        ):
            df = gt_table._body.body
        elif hasattr(gt_table, "_data") and gt_table._data is not None:
            df = gt_table._data
        elif hasattr(gt_table, "data") and gt_table.data is not None:
            df = gt_table.data

        if df is not None:
            # Create a Rich table with horizontal lines
            from rich.box import SIMPLE_HEAD

            # Create enhanced title if preview_info contains metadata
            table_title = None
            if preview_info and "source_type" in preview_info and "table_type" in preview_info:
                source_type = preview_info["source_type"]
                table_type = preview_info["table_type"]
                table_title = f"Data Preview / {source_type} / {table_type}"

            rich_table = Table(
                title=table_title,
                show_header=True,
                header_style="bold magenta",
                box=SIMPLE_HEAD,
                title_style="bold cyan",
                title_justify="left",
            )

            # Get column names
            columns = []
            if hasattr(df, "columns"):
                columns = list(df.columns)
            elif hasattr(df, "schema"):  # pragma: no cover
                columns = list(df.schema.names)
            elif hasattr(df, "column_names"):  # pragma: no cover
                columns = list(df.column_names)

            if not columns:  # pragma: no cover
                # Fallback: try to determine columns from first row
                try:
                    if hasattr(df, "to_dicts") and len(df) > 0:
                        first_dict = df.to_dicts()[0]
                        columns = list(first_dict.keys())
                    elif hasattr(df, "to_dict") and len(df) > 0:
                        first_dict = df.to_dict("records")[0]
                        columns = list(first_dict.keys())
                except Exception:  # pragma: no cover
                    columns = [f"Column {i + 1}" for i in range(10)]  # Default fallback

            # Add columns to Rich table
            # Handle wide tables by limiting columns displayed
            max_terminal_cols = 15  # Reasonable limit for terminal display

            # Get terminal width to adjust column behavior
            try:
                terminal_width = console.size.width
                # Estimate max column width based on terminal size and number of columns
                if len(columns) <= 5:
                    max_col_width = min(60, terminal_width // 4)
                elif len(columns) <= 10:
                    max_col_width = min(40, terminal_width // 6)
                else:
                    max_col_width = min(30, terminal_width // 8)
            except Exception:  # pragma: no cover
                # Fallback if we can't get terminal width
                max_col_width = 40 if len(columns) <= 10 else 25

            if len(columns) > max_terminal_cols:
                # For wide tables, show first few, middle indicator, and last few columns
                first_cols = 7
                last_cols = 7

                display_columns = columns[:first_cols] + ["...more..."] + columns[-last_cols:]

                console.print(
                    f"\n[yellow]⚠ Table has {len(columns)} columns. Showing first {first_cols} and last {last_cols} columns.[/yellow]"
                )
                console.print("[dim]Use --columns to specify which columns to display.[/dim]")
                console.print(
                    f"[dim]Full column list: {', '.join(columns[:5])}...{', '.join(columns[-5:])}[/dim]\n"
                )
            else:
                display_columns = columns

            # Get data types for columns
            dtypes_dict = _get_column_dtypes(df, columns)

            # Calculate row number column width if needed
            row_num_width = 6  # Default width
            if "_row_num_" in columns:
                try:
                    # Get the maximum row number to calculate appropriate width
                    if hasattr(df, "to_dicts"):
                        data_dict = df.to_dicts()
                        if data_dict:
                            row_nums = [row.get("_row_num_", 0) for row in data_dict]
                            max_row_num = max(row_nums) if row_nums else 0
                            row_num_width = max(len(str(max_row_num)) + 1, 6)  # +1 for padding
                    elif hasattr(df, "to_dict"):
                        data_dict = df.to_dict("records")
                        if data_dict:
                            row_nums = [row.get("_row_num_", 0) for row in data_dict]
                            max_row_num = max(row_nums) if row_nums else 0
                            row_num_width = max(len(str(max_row_num)) + 1, 6)  # +1 for padding
                except Exception:  # pragma: no cover
                    # If we can't determine max row number, use default
                    row_num_width = 8  # Slightly larger default for safety

            for i, col in enumerate(display_columns):
                if col == "...more...":
                    # Add a special indicator column
                    rich_table.add_column("···", style="dim", width=3, no_wrap=True)
                else:
                    # Handle row number column specially
                    if col == "_row_num_":
                        # Row numbers get no header, right alignment, and dim gray style
                        # Use dynamic width to prevent truncation
                        rich_table.add_column(
                            "", style="dim", justify="right", no_wrap=True, width=row_num_width
                        )
                    else:
                        display_col = str(col)

                        # Get data type for this column (if available)
                        if col in dtypes_dict:
                            dtype_display = f"<{dtypes_dict[col]}>"
                            # Create header with column name and data type
                            header_text = f"{display_col}\n[dim yellow]{dtype_display}[/dim yellow]"
                        else:
                            header_text = display_col

                        rich_table.add_column(
                            header_text,
                            style="cyan",
                            no_wrap=False,
                            overflow="ellipsis",
                            max_width=max_col_width,
                        )

            # Convert data to list of rows
            rows = []
            try:
                if hasattr(df, "to_dicts"):
                    # Polars interface
                    data_dict = df.to_dicts()
                    if len(columns) > max_terminal_cols:
                        # For wide tables, extract only the displayed columns
                        display_data_columns = (
                            columns[:7] + columns[-7:]
                        )  # Skip the "...more..." placeholder
                        rows = [
                            [
                                _format_cell_value(
                                    row.get(col, ""),
                                    is_row_number=(col == "_row_num_"),
                                    max_width=max_col_width,
                                    num_columns=len(columns),
                                )
                                for col in display_data_columns
                            ]
                            for row in data_dict
                        ]
                        # Add the "..." column in the middle
                        for i, row in enumerate(rows):
                            rows[i] = row[:7] + ["···"] + row[7:]
                    else:
                        rows = [
                            [
                                _format_cell_value(
                                    row.get(col, ""),
                                    is_row_number=(col == "_row_num_"),
                                    max_width=max_col_width,
                                    num_columns=len(columns),
                                )
                                for col in columns
                            ]
                            for row in data_dict
                        ]
                elif hasattr(df, "to_dict"):
                    # Pandas-like interface
                    data_dict = df.to_dict("records")
                    if len(columns) > max_terminal_cols:
                        # For wide tables, extract only the displayed columns
                        display_data_columns = columns[:7] + columns[-7:]
                        rows = [
                            [
                                _format_cell_value(
                                    row.get(col, ""),
                                    is_row_number=(col == "_row_num_"),
                                    max_width=max_col_width,
                                    num_columns=len(columns),
                                )
                                for col in display_data_columns
                            ]
                            for row in data_dict
                        ]
                        # Add the "..." column in the middle
                        for i, row in enumerate(rows):
                            rows[i] = row[:7] + ["···"] + row[7:]
                    else:
                        rows = [
                            [
                                _format_cell_value(
                                    row.get(col, ""),
                                    is_row_number=(col == "_row_num_"),
                                    max_width=max_col_width,
                                    num_columns=len(columns),
                                )
                                for col in columns
                            ]
                            for row in data_dict
                        ]
                elif hasattr(df, "iter_rows"):
                    # Polars lazy frame
                    rows = [
                        [
                            _format_cell_value(
                                val,
                                is_row_number=(i == 0 and columns[0] == "_row_num_"),
                                max_width=max_col_width,
                                num_columns=len(columns),
                            )
                            for i, val in enumerate(row)
                        ]
                        for row in df.iter_rows()
                    ]
                elif hasattr(df, "__iter__"):
                    # Try to iterate directly
                    rows = [
                        [
                            _format_cell_value(
                                val,
                                is_row_number=(i == 0 and columns[0] == "_row_num_"),
                                max_width=max_col_width,
                                num_columns=len(columns),
                            )
                            for i, val in enumerate(row)
                        ]
                        for row in df
                    ]
                else:
                    rows = [["Could not extract data from this format"]]  # pragma: no cover
            except Exception as e:
                rows = [[f"Error extracting data: {e}"]]  # pragma: no cover

            # Add rows to Rich table with separator between head and tail
            max_rows = 50  # Reasonable limit for terminal display

            # Get preview info to determine head/tail separation
            head_rows_count = 0
            tail_rows_count = 0
            total_dataset_rows = 0

            if preview_info:
                head_rows_count = preview_info.get("head_rows", 0)
                tail_rows_count = preview_info.get("tail_rows", 0)
                total_dataset_rows = preview_info.get("total_rows", len(rows))
                is_complete = preview_info.get("is_complete", False)
            else:
                # Fallback: assume all rows are shown
                is_complete = True

            # Add rows with optional separator
            for i, row in enumerate(rows[:max_rows]):
                try:
                    # Add separator between head and tail rows
                    if (
                        not is_complete
                        and head_rows_count > 0
                        and tail_rows_count > 0
                        and i == head_rows_count
                    ):
                        # Add a visual separator row with dashes
                        separator_row = [
                            "─" * 3 if col != "_row_num_" else "⋮"
                            for col in (
                                display_columns if "display_columns" in locals() else columns
                            )
                        ]
                        rich_table.add_row(*separator_row, style="dim")

                    rich_table.add_row(*row)
                except Exception as e:  # pragma: no cover
                    # If there's an issue with row data, show error
                    rich_table.add_row(*[f"Error: {e}" for _ in columns])  # pragma: no cover
                    break  # pragma: no cover

            # Show the table
            console.print()
            console.print(rich_table)

            # Show summary info
            total_rows = len(rows)

            # Use preview info if available, otherwise fall back to old logic
            if preview_info:
                total_dataset_rows = preview_info.get("total_rows", total_rows)
                head_rows = preview_info.get("head_rows", 0)
                tail_rows = preview_info.get("tail_rows", 0)
                is_complete = preview_info.get("is_complete", False)

                if is_complete:
                    console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
                elif head_rows > 0 and tail_rows > 0:
                    console.print(
                        f"\n[dim]Showing first {head_rows} and last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
                    )
                elif head_rows > 0:
                    console.print(
                        f"\n[dim]Showing first {head_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
                    )
                elif tail_rows > 0:
                    console.print(
                        f"\n[dim]Showing last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
                    )
                else:
                    # Fallback for other cases
                    console.print(
                        f"\n[dim]Showing {total_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
                    )
            else:
                # Original logic as fallback
                max_rows = 50  # This should match the limit used above
                if total_rows > max_rows:
                    console.print(
                        f"\n[dim]Showing first {max_rows} of {total_rows} rows. Use --output-html to see all data.[/dim]"
                    )
                else:
                    console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")

        else:
            # If we can't extract data, show the success message
            console.print(
                Panel(
                    "[green]✓[/green] Table rendered successfully. "
                    "Use --output-html to save the full interactive report.",
                    title="Table Preview",
                    border_style="green",
                )
            )

    except Exception as e:  # pragma: no cover
        console.print(f"[red]Error rendering table:[/red] {e}")
        console.print(
            f"[dim]GT table type: {type(gt_table) if 'gt_table' in locals() else 'undefined'}[/dim]"
        )

        # Fallback: show the success message
        console.print(
            Panel(
                "[green]✓[/green] Table rendered successfully. "
                "Use --output-html to save the full interactive report.",
                title="Table Preview",
                border_style="green",
            )
        )


def _display_validation_summary(validation: Any) -> None:
    """Display a validation summary in a Rich table format."""
    try:
        # Try to get the summary from the validation report
        if hasattr(validation, "validation_info") and validation.validation_info is not None:
            # Use the validation_info to create a summary
            info = validation.validation_info
            n_steps = len(info)
            n_passed = sum(1 for step in info if step.all_passed)
            n_failed = n_steps - n_passed

            # Calculate severity counts
            n_warning = sum(1 for step in info if step.warning)
            n_error = sum(1 for step in info if step.error)
            n_critical = sum(1 for step in info if step.critical)

            all_passed = n_failed == 0

            # Determine highest severity
            if n_critical > 0:
                highest_severity = "critical"
            elif n_error > 0:
                highest_severity = "error"
            elif n_warning > 0:
                highest_severity = "warning"
            elif n_failed > 0:
                highest_severity = "some failing"
            else:
                highest_severity = "all passed"

            # Create a summary table
            table = Table(title="Validation Summary", show_header=True, header_style="bold magenta")
            table.add_column("Metric", style="cyan", no_wrap=True)
            table.add_column("Value", style="green")

            # Add summary statistics
            table.add_row("Total Steps", str(n_steps))
            table.add_row("Passing Steps", str(n_passed))
            table.add_row("Failing Steps", str(n_failed))
            table.add_row("Warning Steps", str(n_warning))
            table.add_row("Error Steps", str(n_error))
            table.add_row("Critical Steps", str(n_critical))
            table.add_row("All Passed", str(all_passed))
            table.add_row("Highest Severity", highest_severity)

            console.print(table)

            # Display step details
            if n_steps > 0:
                steps_table = Table(
                    title="Validation Steps", show_header=True, header_style="bold cyan"
                )
                steps_table.add_column("Step", style="dim")
                steps_table.add_column("Type", style="white")
                steps_table.add_column("Column", style="cyan")
                steps_table.add_column("Status", style="white")
                steps_table.add_column("Passed/Total", style="green")

                for step in info:
                    status_icon = "✓" if step.all_passed else "✗"
                    status_color = "green" if step.all_passed else "red"

                    severity = ""
                    if step.critical:
                        severity = " [red](CRITICAL)[/red]"
                    elif step.error:
                        severity = " [red](ERROR)[/red]"
                    elif step.warning:
                        severity = " [yellow](WARNING)[/yellow]"

                    steps_table.add_row(
                        str(step.i),
                        step.assertion_type,
                        str(step.column) if step.column else "—",
                        f"[{status_color}]{status_icon}[/{status_color}]{severity}",
                        f"{step.n_passed}/{step.n}",
                    )

                console.print(steps_table)

            # Display status with appropriate color
            if highest_severity == "all passed":
                console.print(
                    Panel("[green]✓ All validations passed![/green]", border_style="green")
                )
            elif highest_severity == "some failing":
                console.print(
                    Panel("[yellow]⚠ Some validations failed[/yellow]", border_style="yellow")
                )
            elif highest_severity in ["warning", "error", "critical"]:
                color = "yellow" if highest_severity == "warning" else "red"
                console.print(
                    Panel(
                        f"[{color}]✗ Validation failed with {highest_severity} severity[/{color}]",
                        border_style=color,
                    )
                )
        else:
            console.print("[yellow]Validation object does not contain validation results.[/yellow]")

    except Exception as e:  # pragma: no cover
        console.print(f"[red]Error displaying validation summary:[/red] {e}")
        import traceback  # pragma: no cover

        console.print(f"[dim]{traceback.format_exc()}[/dim]")  # pragma: no cover

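Note: `_display_validation_summary` only needs an interrogated `Validate` object, so it can be driven from Python as well as from the CLI. A minimal sketch using the documented pointblank API (the column and rule choices here are arbitrary):

    validation = (
        pb.Validate(data=pb.load_dataset("small_table"), tbl_name="small_table")
        .col_vals_gt(columns="d", value=100)
        .interrogate()
    )
    _display_validation_summary(validation)  # summary table, per-step table, status panel
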
@click.group()
@click.version_option(version=pb.__version__, prog_name="pb")
def cli():
    """
    Pointblank CLI - Data validation and quality tools for data engineers.

    Use this CLI to validate data, preview tables, and generate reports
    directly from the command line.
    """
    pass


@cli.command()
def datasets():
    """
    List available built-in datasets.
    """
    datasets_info = [
        ("small_table", "13 rows × 8 columns", "Small demo dataset for testing"),
        ("game_revenue", "2,000 rows × 11 columns", "Game development company revenue data"),
        ("nycflights", "336,776 rows × 18 columns", "NYC airport flights data from 2013"),
        ("global_sales", "50,000 rows × 20 columns", "Global sales data across regions"),
    ]

    table = Table(
        title="Available Pointblank Datasets", show_header=True, header_style="bold magenta"
    )
    table.add_column("Dataset Name", style="cyan", no_wrap=True)
    table.add_column("Dimensions", style="green")
    table.add_column("Description", style="white")

    for name, dims, desc in datasets_info:
        table.add_row(name, dims, desc)

    console.print(table)
    console.print("\n[dim]Use these dataset names directly with any pb CLI command.[/dim]")
    console.print("[dim]Example: pb preview small_table[/dim]")

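Note: commands registered on the `cli` group can be exercised in-process with Click's test runner; a minimal sketch (standard `click.testing` API, assertion illustrative):

    from click.testing import CliRunner

    runner = CliRunner()
    result = runner.invoke(cli, ["datasets"])
    assert result.exit_code == 0  # prints the table of the four built-in datasets
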
@cli.command()
def requirements():
    """
    Check installed dependencies and their availability.
    """
    dependencies = [
        ("polars", "Polars DataFrame support"),
        ("pandas", "Pandas DataFrame support"),
        ("ibis", "Ibis backend support (DuckDB, etc.)"),
        ("duckdb", "DuckDB database support"),
        ("pyarrow", "Parquet file support"),
    ]

    table = Table(title="Dependency Status", show_header=True, header_style="bold magenta")
    table.add_column("Package", style="cyan", no_wrap=True)
    table.add_column("Status", style="white")
    table.add_column("Description", style="dim")

    for package, description in dependencies:
        if _is_lib_present(package):
            status = "[green]✓ Installed[/green]"
        else:
            status = "[red]✗ Not installed[/red]"

        table.add_row(package, status, description)

    console.print(table)
    console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")

@cli.command()
@click.argument("data_source", type=str)
@click.option("--columns", "-c", help="Comma-separated list of columns to display")
@click.option("--col-range", help="Column range like '1:10' or '5:' or ':15' (1-based indexing)")
@click.option("--col-first", type=int, help="Show first N columns")
@click.option("--col-last", type=int, help="Show last N columns")
@click.option("--head", "-h", default=5, help="Number of rows from the top (default: 5)")
@click.option("--tail", "-t", default=5, help="Number of rows from the bottom (default: 5)")
@click.option("--limit", "-l", default=50, help="Maximum total rows to display (default: 50)")
@click.option("--no-row-numbers", is_flag=True, help="Hide row numbers")
@click.option("--max-col-width", default=250, help="Maximum column width in pixels (default: 250)")
@click.option("--min-table-width", default=500, help="Minimum table width in pixels (default: 500)")
@click.option("--no-header", is_flag=True, help="Hide table header")
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
def preview(
    data_source: str,
    columns: str | None,
    col_range: str | None,
    col_first: int | None,
    col_last: int | None,
    head: int,
    tail: int,
    limit: int,
    no_row_numbers: bool,
    max_col_width: int,
    min_table_width: int,
    no_header: bool,
    output_html: str | None,
):
    """
    Preview a data table showing head and tail rows.

    DATA_SOURCE can be:

    \b
    - CSV file path (e.g., data.csv)
    - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
    - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
    - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)

    COLUMN SELECTION OPTIONS:

    For tables with many columns, use these options to control which columns are displayed:

    \b
    - --columns: Specify exact columns (e.g., --columns "name,age,email")
    - --col-range: Select column range (e.g., --col-range "1:10", --col-range "5:", --col-range ":15")
    - --col-first: Show first N columns (e.g., --col-first 5)
    - --col-last: Show last N columns (e.g., --col-last 3)

    Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
    """
    try:
        with console.status("[bold green]Loading data..."):
            # Try to load as a pointblank dataset first
            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
                data = pb.load_dataset(data_source)
                console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
            else:
                # Assume it's a file path or connection string
                data = data_source
                console.print(f"[green]✓[/green] Loaded data source: {data_source}")

        # Parse columns if provided
        columns_list = None
        if columns:
            columns_list = [col.strip() for col in columns.split(",")]

            # If data has _row_num_ and it's not explicitly included, add it at the beginning
            try:
                from pointblank.validate import (
                    _process_connection_string,
                    _process_csv_input,
                    _process_parquet_input,
                )

                # Process the data source to get actual data object to check for _row_num_
                processed_data = data
                if isinstance(data, str):
                    processed_data = _process_connection_string(data)
                    processed_data = _process_csv_input(processed_data)
                    processed_data = _process_parquet_input(processed_data)

                # Get column names from the processed data
                all_columns = []
                if hasattr(processed_data, "columns"):
                    all_columns = list(processed_data.columns)
                elif hasattr(processed_data, "schema"):
                    all_columns = list(processed_data.schema.names)

                # If _row_num_ exists in data but not in user selection, add it at beginning
                if all_columns and "_row_num_" in all_columns and "_row_num_" not in columns_list:
                    columns_list = ["_row_num_"] + columns_list
            except Exception:  # pragma: no cover
                # If we can't process the data, just use the user's column list as-is
                pass
        elif col_range or col_first or col_last:
            # Need to get column names to apply range/first/last selection
            # Load the data to get column names
            from pointblank.validate import (
                _process_connection_string,
                _process_csv_input,
                _process_parquet_input,
            )

            # Process the data source to get actual data object
            processed_data = data
            if isinstance(data, str):
                processed_data = _process_connection_string(data)
                processed_data = _process_csv_input(processed_data)
                processed_data = _process_parquet_input(processed_data)

            # Get column names from the processed data
            all_columns = []
            if hasattr(processed_data, "columns"):
                all_columns = list(processed_data.columns)
            elif hasattr(processed_data, "schema"):
                all_columns = list(processed_data.schema.names)
            else:
                console.print(
                    "[yellow]Warning: Could not determine column names for range selection[/yellow]"
                )

            if all_columns:
                # Check if _row_num_ exists and preserve it
                has_row_num = "_row_num_" in all_columns

                if col_range:
                    # Parse range like "1:10", "5:", ":15"
                    if ":" in col_range:
                        parts = col_range.split(":")
                        start_idx = int(parts[0]) - 1 if parts[0] else 0  # Convert to 0-based
                        end_idx = int(parts[1]) if parts[1] else len(all_columns)

                        # Filter out _row_num_ from the range selection, we'll add it back later
                        columns_for_range = [col for col in all_columns if col != "_row_num_"]
                        selected_columns = columns_for_range[start_idx:end_idx]

                        # Always include _row_num_ at the beginning if it exists
                        if has_row_num:
                            columns_list = ["_row_num_"] + selected_columns
                        else:
                            columns_list = selected_columns
                    else:
                        console.print(
                            "[yellow]Warning: Invalid range format. Use 'start:end' format[/yellow]"
                        )
                elif col_first:
                    # Filter out _row_num_ from the first N selection, we'll add it back later
                    columns_for_first = [col for col in all_columns if col != "_row_num_"]
                    selected_columns = columns_for_first[:col_first]

                    # Always include _row_num_ at the beginning if it exists
                    if has_row_num:
                        columns_list = ["_row_num_"] + selected_columns
                    else:
                        columns_list = selected_columns
                elif col_last:
                    # Filter out _row_num_ from the last N selection, we'll add it back later
                    columns_for_last = [col for col in all_columns if col != "_row_num_"]
                    selected_columns = columns_for_last[-col_last:]

                    # Always include _row_num_ at the beginning if it exists
                    if has_row_num:
                        columns_list = ["_row_num_"] + selected_columns
                    else:
                        columns_list = selected_columns

        # Generate preview
        with console.status("[bold green]Generating preview..."):
            # Get total dataset size before preview and gather metadata
            try:
                # Process the data to get the actual data object for row count and metadata
                from pointblank.validate import (
                    _process_connection_string,
                    _process_csv_input,
                    _process_parquet_input,
                )

                processed_data = data
                if isinstance(data, str):
                    processed_data = _process_connection_string(data)
                    processed_data = _process_csv_input(processed_data)
                    processed_data = _process_parquet_input(processed_data)

                total_dataset_rows = pb.get_row_count(processed_data)

                # Determine source type and table type for enhanced preview title
                if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
                    source_type = f"Pointblank dataset: {data_source}"
                else:
                    source_type = f"External source: {data_source}"

                table_type = _get_tbl_type(processed_data)
            except Exception:
                # If we can't get metadata, set defaults
                total_dataset_rows = None
                source_type = f"Data source: {data_source}"
                table_type = "unknown"

            gt_table = pb.preview(
                data=data,
                columns_subset=columns_list,
                n_head=head,
                n_tail=tail,
                limit=limit,
                show_row_numbers=not no_row_numbers,
                max_col_width=max_col_width,
                min_tbl_width=min_table_width,
                incl_header=not no_header,
            )

        if output_html:
            # Save HTML to file
            html_content = gt_table.as_raw_html()
            Path(output_html).write_text(html_content, encoding="utf-8")
            console.print(f"[green]✓[/green] HTML saved to: {output_html}")
        else:
            # Display in terminal with preview context info
            preview_info = None
            if total_dataset_rows is not None:
                # Determine if we're showing the complete dataset
                expected_rows = min(head + tail, limit, total_dataset_rows)
                is_complete = total_dataset_rows <= expected_rows

                preview_info = {
                    "total_rows": total_dataset_rows,
                    "head_rows": head,
                    "tail_rows": tail,
                    "is_complete": is_complete,
                    "source_type": source_type,
                    "table_type": table_type,
                }

            _rich_print_gt_table(gt_table, preview_info)

    except Exception as e:  # pragma: no cover
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)  # pragma: no cover

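Note: the column-selection options compose with the built-in datasets; a sketch of driving `preview` in-process with `CliRunner` (the option values here are arbitrary):

    from click.testing import CliRunner

    runner = CliRunner()
    result = runner.invoke(
        cli, ["preview", "small_table", "--col-range", "1:4", "--head", "3", "--tail", "3"]
    )
    print(result.output)  # Rich-rendered head/tail preview of the selected columns
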
@cli.command()
@click.argument("data_source", type=str)
def info(data_source: str):
    """
    Display information about a data source.

    Shows table type, dimensions, column names, and data types.
    """
    try:
        with console.status("[bold green]Loading data..."):
            # Try to load as a pointblank dataset first
            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
                data = pb.load_dataset(data_source)
                source_type = f"Pointblank dataset: {data_source}"
                console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
            else:
                # Assume it's a file path or connection string
                data = data_source
                source_type = f"External source: {data_source}"

                # Process the data to get actual table object for inspection
                from pointblank.validate import (
                    _process_connection_string,
                    _process_csv_input,
                    _process_parquet_input,
                )

                data = _process_connection_string(data)
                data = _process_csv_input(data)
                data = _process_parquet_input(data)
                console.print(f"[green]✓[/green] Loaded data source: {data_source}")

        # Get table information
        tbl_type = _get_tbl_type(data)
        row_count = pb.get_row_count(data)
        col_count = pb.get_column_count(data)

        # Import the box style for consistent styling with scan table
        from rich.box import SIMPLE_HEAD

        # Create info table with same styling as scan table
        info_table = Table(
            title="Data Source Information",
            show_header=True,
            header_style="bold magenta",
            box=SIMPLE_HEAD,
            title_style="bold cyan",
            title_justify="left",
        )
        info_table.add_column("Property", style="cyan", no_wrap=True)
        info_table.add_column("Value", style="green")

        info_table.add_row("Source", source_type)
        info_table.add_row("Table Type", tbl_type)
        info_table.add_row("Rows", f"{row_count:,}")
        info_table.add_row("Columns", f"{col_count:,}")

        console.print()
        console.print(info_table)

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)

@cli.command()
@click.argument("data_source", type=str)
@click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
@click.option("--columns", "-c", help="Comma-separated list of columns to scan")
def scan(
    data_source: str,
    output_html: str | None,
    columns: str | None,
):
    """
    Generate a data scan profile report.

    Produces a comprehensive data profile including:

    \b
    - Column types and distributions
    - Missing value patterns
    - Basic statistics
    - Data quality indicators

    DATA_SOURCE can be:

    \b
    - CSV file path (e.g., data.csv)
    - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
    - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
    - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
    """
    try:
        import time

        start_time = time.time()

        with console.status("[bold green]Loading data..."):
            # Try to load as a pointblank dataset first
            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
                data = pb.load_dataset(data_source)
                console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
            else:
                # Assume it's a file path or connection string
                data = data_source
                console.print(f"[green]✓[/green] Loaded data source: {data_source}")

        # Parse columns if provided
        columns_list = None
        if columns:
            columns_list = [col.strip() for col in columns.split(",")]

        # Generate data scan
        with console.status("[bold green]Generating data scan..."):
            # Use col_summary_tbl for comprehensive column scanning
            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
                # For pointblank datasets, data is already the loaded dataframe
                scan_result = pb.col_summary_tbl(data=data)
                source_type = f"Pointblank dataset: {data_source}"
                table_type = _get_tbl_type(data)
                # Get row count for footer
                try:
                    total_rows = pb.get_row_count(data)
                except Exception:
                    total_rows = None
            else:
                # For file paths and connection strings, load the data first
                from pointblank.validate import (
                    _process_connection_string,
                    _process_csv_input,
                    _process_parquet_input,
                )

                processed_data = _process_connection_string(data)
                processed_data = _process_csv_input(processed_data)
                processed_data = _process_parquet_input(processed_data)
                scan_result = pb.col_summary_tbl(data=processed_data)
                source_type = f"External source: {data_source}"
                table_type = _get_tbl_type(processed_data)
                # Get row count for footer
                try:
                    total_rows = pb.get_row_count(processed_data)
                except Exception:
                    total_rows = None

        scan_time = time.time() - start_time

        if output_html:
            # Save HTML to file
            try:
                html_content = scan_result.as_raw_html()
                Path(output_html).write_text(html_content, encoding="utf-8")
                console.print(f"[green]✓[/green] Data scan report saved to: {output_html}")
            except Exception as e:
                console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
        else:
            # Display rich scan table in terminal
            console.print(f"[green]✓[/green] Data scan completed in {scan_time:.2f}s")
            console.print("Use --output-html to save the full interactive scan report.")

            # Display detailed column summary using rich formatting
            try:
                _rich_print_scan_table(
                    scan_result, data_source, source_type, table_type, total_rows
                )

            except Exception as e:
                console.print(f"[yellow]Could not display scan summary: {e}[/yellow]")

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)

@cli.command()
@click.argument("data_source", type=str)
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
def missing(data_source: str, output_html: str | None):
    """
    Generate a missing values report for a data table.

    DATA_SOURCE can be:

    \b
    - CSV file path (e.g., data.csv)
    - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
    - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
    - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
    """
    try:
        with console.status("[bold green]Loading data..."):
            # Try to load as a pointblank dataset first
            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
                data = pb.load_dataset(data_source)
                console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
            else:
                # Assume it's a file path or connection string
                data = data_source
                console.print(f"[green]✓[/green] Loaded data source: {data_source}")

        # Generate missing values table
        with console.status("[bold green]Analyzing missing values..."):
            gt_table = pb.missing_vals_tbl(data)

        # Get original data for column types
        original_data = data
        if isinstance(data, str):
            # Process the data to get the actual data object
            from pointblank.validate import (
                _process_connection_string,
                _process_csv_input,
                _process_parquet_input,
            )

            try:
                original_data = _process_connection_string(data)
                original_data = _process_csv_input(original_data)
                original_data = _process_parquet_input(original_data)
            except Exception:  # pragma: no cover
                pass  # Use the string data as fallback

        if output_html:
            # Save HTML to file
            html_content = gt_table.as_raw_html()
            Path(output_html).write_text(html_content, encoding="utf-8")
            console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
        else:
            # Display in terminal with special missing values formatting
            _rich_print_missing_table(gt_table, original_data)

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)

@cli.command()
@click.argument("output_file", type=click.Path())
def validate_example(output_file: str):
    """
    Generate an example validation script.

    Creates a sample Python script showing how to use Pointblank for validation.
    """
    example_script = '''"""
Example Pointblank validation script.

This script demonstrates how to create validation rules for your data.
Modify the validation rules below to match your data requirements.
"""

import pointblank as pb

# Create a validation object
# The 'data' variable is automatically provided by the CLI
validation = (
    pb.Validate(
        data=data,
        tbl_name="Example Data",
        label="CLI Validation Example",
        thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
    )
    # Add your validation rules here
    # Example rules (modify these based on your data structure):

    # Check that specific columns exist
    # .col_exists(["column1", "column2"])

    # Check for null values
    # .col_vals_not_null(columns="important_column")

    # Check value ranges
    # .col_vals_gt(columns="amount", value=0)
    # .col_vals_between(columns="score", left=0, right=100)

    # Check string patterns
    # .col_vals_regex(columns="email", pattern=r"^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$")

    # Check unique values
    # .col_vals_unique(columns="id")

    # Finalize the validation
    .interrogate()
)

# The validation object will be automatically used by the CLI
'''

    Path(output_file).write_text(example_script)
    console.print(f"[green]✓[/green] Example validation script created: {output_file}")
    console.print("\nEdit the script to add your validation rules, then run:")
    console.print(f"[cyan]pb validate your_data.csv {output_file}[/cyan]")

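Note: `validate-example` and `validate` are meant to chain: generate the script, edit its rules, then run it against a data source. A minimal end-to-end sketch (the generated script interrogates the CLI-provided `data` with no extra rules, so it should pass as-is):

    from click.testing import CliRunner

    runner = CliRunner()
    with runner.isolated_filesystem():
        runner.invoke(cli, ["validate-example", "check.py"])
        result = runner.invoke(cli, ["validate", "small_table", "check.py"])
        print(result.output)  # summary and per-step tables from _display_validation_summary
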
1300
|
+
@cli.command()
|
|
1301
|
+
@click.argument("data_source", type=str)
|
|
1302
|
+
@click.argument("validation_script", type=click.Path(exists=True))
|
|
1303
|
+
@click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
|
|
1304
|
+
@click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
|
|
1305
|
+
@click.option("--fail-on-error", is_flag=True, help="Exit with non-zero code if validation fails")
|
|
1306
|
+
def validate(
|
|
1307
|
+
data_source: str,
|
|
1308
|
+
validation_script: str,
|
|
1309
|
+
output_html: str | None,
|
|
1310
|
+
output_json: str | None,
|
|
1311
|
+
fail_on_error: bool,
|
|
1312
|
+
):
|
|
1313
|
+
"""
|
|
1314
|
+
Run validation using a Python validation script.
|
|
1315
|
+
|
|
1316
|
+
DATA_SOURCE can be:
|
|
1317
|
+
|
|
1318
|
+
\b
|
|
1319
|
+
- CSV file path (e.g., data.csv)
|
|
1320
|
+
- Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
|
|
1321
|
+
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1322
|
+
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1323
|
+
|
|
1324
|
+
VALIDATION_SCRIPT should be a Python file that defines validation rules.
|
|
1325
|
+
See 'pb validate-example' for a sample script.
|
|
1326
|
+
"""
|
|
1327
|
+
    try:
        with console.status("[bold green]Loading data..."):
            # Try to load as a pointblank dataset first
            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
                data = pb.load_dataset(data_source)
                console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
            else:
                # Assume it's a file path or connection string
                data = data_source
                console.print(f"[green]✓[/green] Loaded data source: {data_source}")

        # Execute the validation script
        with console.status("[bold green]Running validation..."):
            # Read and execute the validation script
            script_content = Path(validation_script).read_text()

            # Create a namespace with pointblank and the data
            namespace = {
                "pb": pb,
                "pointblank": pb,
                "data": data,
                "__name__": "__main__",
            }
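            # exec() runs the user script with this dict as its globals, so the
            # script can refer to `data`, `pb`, and `pointblank` directly, and
            # any objects it creates land back in `namespace` for inspection.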

            # Execute the script
            try:
                exec(script_content, namespace)
            except Exception as e:
                console.print(f"[red]Error executing validation script:[/red] {e}")
                sys.exit(1)

            # Look for a validation object in the namespace
            validation = None

            # Try to find the 'validation' variable specifically first
            if "validation" in namespace:
                validation = namespace["validation"]
            else:
                # Look for any validation object in the namespace
                for key, value in namespace.items():
                    if hasattr(value, "interrogate") and hasattr(value, "validation_info"):
                        validation = value
                        break
                    # Also check if it's a Validate object that has been interrogated
                    elif str(type(value)).find("Validate") != -1:
                        validation = value
                        break

            if validation is None:
                raise ValueError(
                    "No validation object found in script. "
                    "Script should create a Validate object and assign it to a variable named 'validation'."
                )

        console.print("[green]✓[/green] Validation completed")

        # Display summary
        _display_validation_summary(validation)

        # Save outputs
        if output_html:
            try:
                # Get HTML representation
                html_content = validation._repr_html_()
                Path(output_html).write_text(html_content, encoding="utf-8")
                console.print(f"[green]✓[/green] HTML report saved to: {output_html}")
            except Exception as e:
                console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")

        if output_json:
            try:
                # Get JSON report
                json_report = validation.get_json_report()
                Path(output_json).write_text(json_report, encoding="utf-8")
                console.print(f"[green]✓[/green] JSON summary saved to: {output_json}")
            except Exception as e:
                console.print(f"[yellow]Warning: Could not save JSON report: {e}[/yellow]")

        # Check if we should fail on error
        if fail_on_error:
            try:
                if (
                    hasattr(validation, "validation_info")
                    and validation.validation_info is not None
                ):
                    info = validation.validation_info
                    n_critical = sum(1 for step in info if step.critical)
                    n_error = sum(1 for step in info if step.error)

                    if n_critical > 0 or n_error > 0:
                        severity = "critical" if n_critical > 0 else "error"
                        console.print(
                            f"[red]Exiting with error due to {severity} validation failures[/red]"
                        )
                        sys.exit(1)
            except Exception as e:
                console.print(
                    f"[yellow]Warning: Could not check validation status for fail-on-error: {e}[/yellow]"
                )

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)


@cli.command()
@click.argument("data_source", type=str)
@click.argument("validation_script", type=click.Path(exists=True))
@click.argument("step_number", type=int)
@click.option(
    "--limit", "-l", default=100, help="Maximum number of failing rows to show (default: 100)"
)
@click.option("--output-csv", type=click.Path(), help="Save failing rows to CSV file")
@click.option("--output-html", type=click.Path(), help="Save failing rows table to HTML file")
def extract(
    data_source: str,
    validation_script: str,
    step_number: int,
    limit: int,
    output_csv: str | None,
    output_html: str | None,
):
    """
    Extract failing rows from a specific validation step.

    This command runs a validation and extracts the rows that failed
    a specific validation step, which is useful for debugging data quality issues.

    DATA_SOURCE: Same as validate command
    VALIDATION_SCRIPT: Path to validation script
    STEP_NUMBER: The step number to extract failing rows from (1-based)
    """
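    # Example invocation (file names are hypothetical):
    #   pb extract data.csv my_rules.py 2 --output-csv failing_rows.csv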
    try:
        with console.status("[bold green]Loading data..."):
            # Try to load as a pointblank dataset first
            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
                data = pb.load_dataset(data_source)
                console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
            else:
                # Assume it's a file path or connection string
                data = data_source
                console.print(f"[green]✓[/green] Loaded data source: {data_source}")

        # Execute the validation script
        with console.status("[bold green]Running validation..."):
            # Read and execute the validation script
            script_content = Path(validation_script).read_text()

            # Create a namespace with pointblank and the data
            namespace = {
                "pb": pb,
                "pointblank": pb,
                "data": data,
                "__name__": "__main__",
            }

            # Execute the script
            try:
                exec(script_content, namespace)
            except Exception as e:
                console.print(f"[red]Error executing validation script:[/red] {e}")
                sys.exit(1)

            # Look for a validation object in the namespace
            validation = None
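            # Prefer the variable named 'validation'; otherwise fall back to
            # duck typing: any object exposing interrogate() and
            # validation_info, or whose type name contains "Validate".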
            if "validation" in namespace:
                validation = namespace["validation"]
            else:
                # Look for any validation object in the namespace
                for key, value in namespace.items():
                    if hasattr(value, "interrogate") and hasattr(value, "validation_info"):
                        validation = value
                        break
                    elif str(type(value)).find("Validate") != -1:
                        validation = value
                        break

            if validation is None:
                raise ValueError(
                    "No validation object found in script. "
                    "Script should create a Validate object and assign it to a variable named 'validation'."
                )

        console.print("[green]✓[/green] Validation completed")

        # Extract failing rows from the specified step
        with console.status(f"[bold green]Extracting failing rows from step {step_number}..."):
            try:
                # Get the data extracts for the specific step
                step_extract = validation.get_data_extracts(i=step_number, frame=True)

                if step_extract is None or len(step_extract) == 0:
                    console.print(f"[yellow]No failing rows found for step {step_number}[/yellow]")
                    return

                # Limit the results
                if len(step_extract) > limit:
                    step_extract = step_extract.head(limit)
                    console.print(f"[yellow]Limited to first {limit} failing rows[/yellow]")

                console.print(f"[green]✓[/green] Extracted {len(step_extract)} failing rows")

                # Save outputs
                if output_csv:
                    if hasattr(step_extract, "write_csv"):
                        step_extract.write_csv(output_csv)
                    else:
                        step_extract.to_csv(output_csv, index=False)
                    console.print(f"[green]✓[/green] Failing rows saved to CSV: {output_csv}")

                if output_html:
                    # Create a preview of the failing rows
                    preview_table = pb.preview(
                        step_extract, n_head=min(10, len(step_extract)), n_tail=0
                    )
                    html_content = preview_table._repr_html_()
                    Path(output_html).write_text(html_content, encoding="utf-8")
                    console.print(
                        f"[green]✓[/green] Failing rows table saved to HTML: {output_html}"
                    )

                if not output_csv and not output_html:
                    # Display basic info about the failing rows
                    info_table = Table(
                        title=f"Failing Rows - Step {step_number}",
                        show_header=True,
                        header_style="bold red",
                    )
                    info_table.add_column("Property", style="cyan")
                    info_table.add_column("Value", style="white")

                    info_table.add_row("Total Failing Rows", f"{len(step_extract):,}")
                    info_table.add_row(
                        "Columns",
                        f"{len(step_extract.columns) if hasattr(step_extract, 'columns') else 'N/A'}",
                    )

                    console.print(info_table)
                    console.print(
                        "\n[dim]Use --output-csv or --output-html to save the failing rows.[/dim]"
                    )

            except Exception as e:
                console.print(f"[red]Error extracting failing rows:[/red] {e}")
                # Try to provide helpful information
                if hasattr(validation, "validation_info") and validation.validation_info:
                    max_step = len(validation.validation_info)
                    console.print(f"[yellow]Available steps: 1 to {max_step}[/yellow]")

                    # Show step information
                    steps_table = Table(title="Available Validation Steps", show_header=True)
                    steps_table.add_column("Step", style="cyan")
                    steps_table.add_column("Type", style="white")
                    steps_table.add_column("Column", style="green")
                    steps_table.add_column("Has Failures", style="yellow")

                    for i, step in enumerate(validation.validation_info, 1):
                        has_failures = "Yes" if not step.all_passed else "No"
                        steps_table.add_row(
                            str(i),
                            step.assertion_type,
                            str(step.column) if step.column else "—",
                            has_failures,
                        )

                    console.print(steps_table)
                sys.exit(1)

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)


def _format_missing_percentage(value: float) -> str:
    """Format missing value percentages for display.

    Args:
        value: The percentage value (0-100)

    Returns:
        Formatted string with proper percentage display
    """
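    # Mapping examples: 0.0 -> green dot, 0.5 -> "<1%", 42.4 -> "42%",
    # 99.5 -> ">99%", 100.0 -> red dot.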
    if value == 0.0:
        return "[green]●[/green]"  # Large green circle for no missing values
    elif value == 100.0:
        return "[red]●[/red]"  # Large red circle for completely missing values
    elif 0 < value < 1.0:
        return "<1%"  # Less than 1%
    elif 99.0 < value < 100.0:
        return ">99%"  # More than 99%
    else:
        return f"{int(round(value))}%"  # Round to nearest integer with % sign


def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
    """Convert a missing values GT table to Rich table with special formatting.

    Args:
        gt_table: The GT table object for missing values
        original_data: The original data source to extract column types
    """
    try:
        # Extract the underlying data from the GT table
        df = None

        if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
            df = gt_table._tbl_data
        elif hasattr(gt_table, "_data") and gt_table._data is not None:
            df = gt_table._data
        elif hasattr(gt_table, "data") and gt_table.data is not None:
            df = gt_table.data

        if df is not None:
            # Create a Rich table with horizontal lines
            from rich.box import SIMPLE_HEAD

            rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)

            # Get column names
            columns = []
            try:
                if hasattr(df, "columns"):
                    columns = list(df.columns)
                elif hasattr(df, "schema"):
                    columns = list(df.schema.names)
            except Exception as e:
                console.print(f"[red]Error getting columns:[/red] {e}")
                columns = []

            if not columns:
                columns = [f"Column {i + 1}" for i in range(10)]  # Fallback

            # Get original data to extract column types
            column_types = {}
            if original_data is not None:
                try:
                    # Get column types from original data
                    if hasattr(original_data, "columns"):
                        original_columns = list(original_data.columns)
                        column_types = _get_column_dtypes(original_data, original_columns)
                except Exception as e:
                    console.print(f"[red]Error getting column types:[/red] {e}")
                    # Use empty dict as fallback

            # Add columns to Rich table with special formatting for missing values table
            sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
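            # The missing-values GT table names its sector columns "1", "2", ...;
            # each holds the percentage of missing values within that sector of rows.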

            # Two separate columns: Column name (20 chars) and Data type (10 chars)
            rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
            rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)

            # Sector columns: All same width, optimized for "100%" (4 chars + padding)
            for sector in sector_columns:
                rich_table.add_column(
                    sector,
                    style="cyan",
                    justify="center",
                    no_wrap=True,
                    width=5,  # Fixed width optimized for percentage values
                )

            # Convert data to rows with special formatting
            rows = []
            try:
                if hasattr(df, "to_dicts"):
                    data_dict = df.to_dicts()
                elif hasattr(df, "to_dict"):
                    data_dict = df.to_dict("records")
                else:
                    data_dict = []

                for i, row in enumerate(data_dict):
                    try:
                        # Each row should have: [column_name, data_type, sector1, sector2, ...]
                        column_name = str(row.get("columns", ""))

                        # Truncate column name to 20 characters with ellipsis if needed
                        if len(column_name) > 20:
                            truncated_name = column_name[:17] + "…"
                        else:
                            truncated_name = column_name

                        # Get data type for this column
                        if column_name in column_types:
                            dtype = column_types[column_name]
                            if len(dtype) > 10:
                                truncated_dtype = dtype[:9] + "…"
                            else:
                                truncated_dtype = dtype
                        else:
                            truncated_dtype = "?"

                        # Start building the row with column name and type
                        formatted_row = [truncated_name, truncated_dtype]

                        # Add sector values (formatted percentages)
                        for sector in sector_columns:
                            value = row.get(sector, 0.0)
                            if isinstance(value, (int, float)):
                                formatted_row.append(_format_missing_percentage(float(value)))
                            else:
                                formatted_row.append(str(value))

                        rows.append(formatted_row)

                    except Exception as e:
                        console.print(f"[red]Error processing row {i}:[/red] {e}")
                        continue

            except Exception as e:
                console.print(f"[red]Error extracting data:[/red] {e}")
                rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]

            # Add rows to Rich table
            for row in rows:
                try:
                    rich_table.add_row(*row)
                except Exception as e:
                    console.print(f"[red]Error adding row:[/red] {e}")
                    break

            # Show the table with custom spanner header if we have sector columns
            if sector_columns:
                # Create a custom header line that shows the spanner
                header_parts = []
                header_parts.append(" " * 20)  # Space for Column header
                header_parts.append(" " * 10)  # Space for Type header

                # Left-align "Row Sectors" with the first numbered column
                row_sectors_text = "Row Sectors"
                header_parts.append(row_sectors_text)

                # Print the custom spanner header
                console.print("[dim]" + " ".join(header_parts) + "[/dim]")

                # Add a horizontal rule below the spanner
                rule_parts = []
                rule_parts.append(" " * 20)  # Space for Column header
                rule_parts.append(" " * 10)  # Space for Type header

                # Use a fixed width horizontal rule for "Row Sectors"
                horizontal_rule = "─" * 20
                rule_parts.append(horizontal_rule)

                # Print the horizontal rule
                console.print("[dim]" + " ".join(rule_parts) + "[/dim]")

                # Print the Rich table (will handle terminal width automatically)
                console.print(rich_table)
                footer_text = (
                    "[dim]Symbols: [green]●[/green] = no missing values, "
                    "[red]●[/red] = completely missing, "
                    "<1% = less than 1% missing, "
                    ">99% = more than 99% missing[/dim]"
                )
                console.print(footer_text)

            else:
                # Fallback to regular table display
                _rich_print_gt_table(gt_table)

    except Exception as e:
        console.print(f"[red]Error rendering missing values table:[/red] {e}")
        # Fallback to regular table display
        _rich_print_gt_table(gt_table)


def _rich_print_scan_table(
    scan_result: Any,
    data_source: str,
    source_type: str,
    table_type: str,
    total_rows: int | None = None,
) -> None:
    """
    Display scan results as a Rich table in the terminal with statistical measures.

    Args:
        scan_result: The GT object from col_summary_tbl()
        data_source: Name of the data source being scanned
        source_type: Type of data source (e.g., "Pointblank dataset: small_table")
        table_type: Type of table (e.g., "polars.LazyFrame")
        total_rows: Total number of rows in the dataset
    """
    try:
        import re

        import narwhals as nw
        from rich.box import SIMPLE_HEAD

        # Extract the underlying DataFrame from the GT object
        # The GT object has a _tbl_data attribute that contains the DataFrame
        gt_data = scan_result._tbl_data

        # Convert to Narwhals DataFrame for consistent handling
        nw_data = nw.from_native(gt_data)

        # Convert to dictionary for easier access
        data_dict = nw_data.to_dict(as_series=False)

        # Create main scan table with missing data table styling
        # Create a comprehensive title with data source, source type, and table type
        title_text = f"Column Summary / {source_type} / {table_type}"

        scan_table = Table(
            title=title_text,
            show_header=True,
            header_style="bold magenta",
            box=SIMPLE_HEAD,
            title_style="bold cyan",
            title_justify="left",
        )

        # Add columns with specific styling and appropriate widths
        scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
        scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
        scan_table.add_column(
            "NA", style="red", width=6, justify="right"
        )  # Adjusted for better formatting
        scan_table.add_column(
            "UQ", style="green", width=8, justify="right"
        )  # Adjusted for boolean values

        # Add statistical columns if they exist with appropriate widths
        stat_columns = []
        column_mapping = {
            "mean": ("Mean", "blue", 9),
            "std": ("SD", "blue", 9),
            "min": ("Min", "yellow", 9),
            "median": ("Med", "yellow", 9),
            "max": ("Max", "yellow", 9),
            "q_1": ("Q₁", "magenta", 8),
            "q_3": ("Q₃", "magenta", 9),
            "iqr": ("IQR", "magenta", 8),
        }
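        # Each entry maps a stat key in the scan output to (header, Rich style,
        # column width); only stats actually present in data_dict get a column.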

        for col_key, (display_name, color, width) in column_mapping.items():
            if col_key in data_dict:
                scan_table.add_column(display_name, style=color, width=width, justify="right")
                stat_columns.append(col_key)

        # Helper function to extract column name and type from HTML
        def extract_column_info(html_content: str) -> tuple[str, str]:
            """Extract column name and type from HTML formatted content."""
            # Extract column name from first div
            name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
            column_name = name_match.group(1) if name_match else "Unknown"

            # Extract data type from second div (with gray color)
            type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
            if type_match:
                data_type = type_match.group(1)
                # Convert to compact format using the existing function
                compact_type = _format_dtype_compact(data_type)
                data_type = compact_type
            else:
                data_type = "unknown"

            return column_name, data_type

        # Helper function to format values with improved number formatting
        def format_value(
            value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
        ) -> str:
            """Format values for display with smart number formatting and HTML cleanup."""
            if value is None or (isinstance(value, str) and value.strip() == ""):
                return "[dim]—[/dim]"

            # Handle missing values indicator
            if is_missing and str(value) == "0":
                return "[green]●[/green]"  # No missing values

            # Clean up HTML formatting from the raw data
            str_val = str(value)

            # Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
            if "<br>" in str_val:
                str_val = str_val.split("<br>")[0].strip()
                # For unique values, we want just the integer part
                if is_unique:
                    try:
                        # Try to extract just the integer part for unique counts
                        num_val = float(str_val)
                        return str(int(num_val))
                    except (ValueError, TypeError):
                        pass

            # Now handle HTML content (especially from boolean unique values)
            if "<" in str_val and ">" in str_val:
                # Remove HTML tags completely for cleaner display
                str_val = re.sub(r"<[^>]+>", "", str_val).strip()
                # Clean up extra whitespace
                str_val = re.sub(r"\s+", " ", str_val).strip()

            # Handle values like "2<.01" - extract the first number
            if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
                # Extract number before the < symbol
                before_lt = str_val.split("<")[0].strip()
                if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
                    str_val = before_lt

            # Handle boolean unique values like "T0.62F0.38" - extract the more readable format
            if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
                # Extract T and F values
                t_match = re.search(r"T(\d+\.\d+)", str_val)
                f_match = re.search(r"F(\d+\.\d+)", str_val)
                if t_match and f_match:
                    t_val = float(t_match.group(1))
                    f_val = float(f_match.group(1))
                    # Show as "T0.62F0.38" but truncated if needed
                    formatted = f"T{t_val:.2f}F{f_val:.2f}"
                    if len(formatted) > max_width:
                        # Truncate to fit, showing dominant value
                        if t_val > f_val:
                            return f"T{t_val:.1f}"
                        else:
                            return f"F{f_val:.1f}"
                    return formatted

            # Try to parse as a number for better formatting
            try:
                # Try to convert to float first
                num_val = float(str_val)

                # Handle special cases
                if num_val == 0:
                    return "0"
                elif abs(num_val) == int(abs(num_val)) and abs(num_val) < 10000:
                    # Simple integers under 10000
                    return str(int(num_val))
                elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
                    # Likely dates in YYYYMMDD format - format as date-like
                    int_val = int(num_val)
                    if 19000101 <= int_val <= 29991231:  # Reasonable date range
                        str_date = str(int_val)
                        if len(str_date) == 8:
                            return (
                                f"{str_date[:4]}-{str_date[4:6]}-{str_date[6:]}"[: max_width - 1]
                                + "…"
                            )
                    # Otherwise treat as large number
                    return f"{num_val / 1000000:.1f}M"
                elif abs(num_val) >= 1000000:
                    # Large numbers - use scientific notation or M/k notation
                    if abs(num_val) >= 1000000000:
                        return f"{num_val:.1e}"
                    else:
                        return f"{num_val / 1000000:.1f}M"
                elif abs(num_val) >= 10000:
                    # Numbers >= 10k - use compact notation
                    return f"{num_val / 1000:.1f}k"
                elif abs(num_val) >= 100:
                    # Numbers 100-9999 - show with minimal decimals
                    return f"{num_val:.1f}"
                elif abs(num_val) >= 10:
                    # Numbers 10-99 - show with one decimal
                    return f"{num_val:.1f}"
                elif abs(num_val) >= 1:
                    # Numbers 1-9 - show with two decimals
                    return f"{num_val:.2f}"
                elif abs(num_val) >= 0.01:
                    # Small numbers - show with appropriate precision
                    return f"{num_val:.2f}"
                else:
                    # Very small numbers - use scientific notation
                    return f"{num_val:.1e}"

            except (ValueError, TypeError):
                # Not a number, handle as string
                pass

            # Handle date/datetime strings - show abbreviated format
            if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
                # Likely a date/datetime, show abbreviated
                if len(str_val) > max_width:
                    return str_val[: max_width - 1] + "…"

            # General string truncation with ellipsis
            if len(str_val) > max_width:
                return str_val[: max_width - 1] + "…"

            return str_val
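
        # For reference: format_value(12345) -> "12.3k", format_value(0.001) -> "1.0e-03",
        # and format_value(20240115) -> "2024-01…" via the YYYYMMDD heuristic (default width).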

        # Populate table rows
        num_rows = len(data_dict["colname"])
        for i in range(num_rows):
            row_data = []

            # Column name and type from HTML content
            colname_html = data_dict["colname"][i]
            column_name, data_type = extract_column_info(colname_html)
            row_data.append(column_name)
            row_data.append(data_type)

            # Missing values (NA)
            missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
            row_data.append(format_value(missing_val, is_missing=True, max_width=6))

            # Unique values (UQ)
            unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
            row_data.append(format_value(unique_val, is_unique=True, max_width=8))

            # Statistical columns
            for stat_col in stat_columns:
                stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
                # Use appropriate width based on column type
                if stat_col in ["q_1", "iqr"]:
                    width = 8
                elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
                    width = 9
                else:
                    width = 8
                row_data.append(format_value(stat_val, max_width=width))

            scan_table.add_row(*row_data)

        # Display the results
        console.print()
        console.print(scan_table)

        # Add informational footer about the scan scope
        try:
            if total_rows is not None:
                # Full table scan
                footer_text = f"[dim]Scan from all {total_rows:,} rows in the table.[/dim]"

                # Create a simple footer
                footer_table = Table(
                    show_header=False,
                    show_lines=False,
                    box=None,
                    padding=(0, 0),
                )
                footer_table.add_column("", style="dim", width=80)
                footer_table.add_row(footer_text)
                console.print(footer_table)

        except Exception:
            # If we can't determine the scan scope, don't show a footer
            pass

    except Exception as e:
        # Fallback to simple message if table creation fails
        console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
        console.print(f"[red]Error displaying table: {str(e)}[/red]")


@cli.command(name="validate-simple")
@click.argument("data_source", type=str)
@click.option(
    "--check",
    type=click.Choice(
        [
            "rows-distinct",
            "col-vals-not-null",
            "rows-complete",
            "col-exists",
            "col-vals-in-set",
            "col-vals-gt",
            "col-vals-ge",
            "col-vals-lt",
            "col-vals-le",
        ]
    ),
    default="rows-distinct",
    help="Type of validation check to perform",
)
@click.option(
    "--column",
    help="Column name to validate (required for col-vals-not-null, col-exists, col-vals-in-set, col-vals-gt, col-vals-ge, col-vals-lt, and col-vals-le checks)",
)
@click.option("--set", help="Comma-separated allowed values (required for col-vals-in-set check)")
@click.option(
    "--value",
    type=float,
    help="Numeric value for comparison (required for col-vals-gt, col-vals-ge, col-vals-lt, and col-vals-le checks)",
)
@click.option(
    "--show-extract", is_flag=True, help="Show preview of failing rows if validation fails"
)
@click.option(
    "--limit", "-l", default=10, help="Maximum number of failing rows to show (default: 10)"
)
@click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
def validate_simple(
    data_source: str,
    check: str,
    column: str | None,
    set: str | None,
    value: float | None,
    show_extract: bool,
    limit: int,
    exit_code: bool,
):
    """
    Perform simple, single-step validations directly from the command line.

    This command provides a quick way to perform common data validation checks
    without needing to write a validation script.

    DATA_SOURCE can be:

    \b
    - CSV file path (e.g., data.csv)
    - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
    - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
    - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)

    AVAILABLE CHECKS:

    \b
    - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
    - rows-complete: Check if all rows are complete (no missing values in any column)
    - col-exists: Check if a specific column exists in the dataset (requires --column)
    - col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
    - col-vals-gt: Check if all values in a column are greater than a threshold (requires --column and --value)
    - col-vals-ge: Check if all values in a column are greater than or equal to a threshold (requires --column and --value)
    - col-vals-lt: Check if all values in a column are less than a threshold (requires --column and --value)
    - col-vals-le: Check if all values in a column are less than or equal to a threshold (requires --column and --value)
    - col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)

    Examples:

    \b
    pb validate-simple data.csv --check rows-distinct
    pb validate-simple data.csv --check rows-distinct --show-extract
    pb validate-simple data.csv --check rows-distinct --exit-code
    pb validate-simple data.csv --check rows-complete
    pb validate-simple data.csv --check col-exists --column price
    pb validate-simple data.csv --check col-vals-not-null --column email
    pb validate-simple data.csv --check col-vals-gt --column score --value 50
    pb validate-simple data.csv --check col-vals-in-set --column status --set "active,inactive,pending"
    """
    try:
        # Validate required parameters for different check types
        if check == "col-vals-not-null" and not column:
            console.print(f"[red]Error:[/red] --column is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-not-null --column email"
            )
            sys.exit(1)

        if check == "col-exists" and not column:
            console.print(f"[red]Error:[/red] --column is required for {check} check")
            console.print("Example: pb validate-simple data.csv --check col-exists --column price")
            sys.exit(1)

        if check == "col-vals-in-set" and not column:
            console.print(f"[red]Error:[/red] --column is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-in-set --column status --set 'active,inactive'"
            )
            sys.exit(1)

        if check == "col-vals-in-set" and not set:
            console.print(f"[red]Error:[/red] --set is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-in-set --column status --set 'active,inactive,pending'"
            )
            sys.exit(1)

        if check == "col-vals-gt" and not column:
            console.print(f"[red]Error:[/red] --column is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-gt --column score --value 50"
            )
            sys.exit(1)

        if check == "col-vals-gt" and value is None:
            console.print(f"[red]Error:[/red] --value is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-gt --column score --value 50"
            )
            sys.exit(1)

        if check == "col-vals-ge" and not column:
            console.print(f"[red]Error:[/red] --column is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-ge --column age --value 18"
            )
            sys.exit(1)

        if check == "col-vals-ge" and value is None:
            console.print(f"[red]Error:[/red] --value is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-ge --column age --value 18"
            )
            sys.exit(1)

        if check == "col-vals-lt" and not column:
            console.print(f"[red]Error:[/red] --column is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-lt --column age --value 65"
            )
            sys.exit(1)

        if check == "col-vals-lt" and value is None:
            console.print(f"[red]Error:[/red] --value is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-lt --column age --value 65"
            )
            sys.exit(1)

        if check == "col-vals-le" and not column:
            console.print(f"[red]Error:[/red] --column is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-le --column score --value 100"
            )
            sys.exit(1)

        if check == "col-vals-le" and value is None:
            console.print(f"[red]Error:[/red] --value is required for {check} check")
            console.print(
                "Example: pb validate-simple data.csv --check col-vals-le --column score --value 100"
            )
            sys.exit(1)

        with console.status("[bold green]Loading data..."):
            # Try to load as a pointblank dataset first
            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
                data = pb.load_dataset(data_source)
                console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
            else:
                # Assume it's a file path or connection string
                data = data_source
                console.print(f"[green]✓[/green] Loaded data source: {data_source}")

        # Perform the validation based on the check type
        with console.status(f"[bold green]Running {check} validation..."):
            if check == "rows-distinct":
                # Create validation for duplicate rows
                validation = (
                    pb.Validate(
                        data=data,
                        tbl_name=f"Data from {data_source}",
                        label=f"CLI Simple Validation: {check}",
                    )
                    .rows_distinct()
                    .interrogate()
                )

                # Get the result
                all_passed = validation.all_passed()

                console.print(
                    f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
                )
            elif check == "col-vals-not-null":
                # Create validation for not null values in specified column
                validation = (
                    pb.Validate(
                        data=data,
                        tbl_name=f"Data from {data_source}",
                        label=f"CLI Simple Validation: {check} on column '{column}'",
                    )
                    .col_vals_not_null(columns=column)
                    .interrogate()
                )

                # Get the result
                all_passed = validation.all_passed()

                console.print(
                    f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
                )
            elif check == "rows-complete":
                # Create validation for complete rows (no missing values in any column)
                validation = (
                    pb.Validate(
                        data=data,
                        tbl_name=f"Data from {data_source}",
                        label=f"CLI Simple Validation: {check}",
                    )
                    .rows_complete()
                    .interrogate()
                )

                # Get the result
                all_passed = validation.all_passed()

                console.print(
                    f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
                )
            elif check == "col-exists":
                # Create validation for column existence
                validation = (
                    pb.Validate(
                        data=data,
                        tbl_name=f"Data from {data_source}",
                        label=f"CLI Simple Validation: {check} for column '{column}'",
                    )
                    .col_exists(columns=column)
                    .interrogate()
                )

                # Get the result
                all_passed = validation.all_passed()

                console.print(
                    f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
                )
            elif check == "col-vals-in-set":
                # Parse the comma-separated set values
                allowed_values = [v.strip() for v in set.split(",")]
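                # Note: the --set option is bound to the name `set` here,
                # shadowing the built-in; only string methods are called on it.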

                # Create validation for values in set
                validation = (
                    pb.Validate(
                        data=data,
                        tbl_name=f"Data from {data_source}",
                        label=f"CLI Simple Validation: {check} for column '{column}'",
                    )
                    .col_vals_in_set(columns=column, set=allowed_values)
                    .interrogate()
                )

                # Get the result
                all_passed = validation.all_passed()

                console.print(
                    f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
                )
            elif check == "col-vals-gt":
                # Create validation for values greater than threshold
                validation = (
                    pb.Validate(
                        data=data,
                        tbl_name=f"Data from {data_source}",
                        label=f"CLI Simple Validation: {check} for column '{column}' > {value}",
                    )
                    .col_vals_gt(columns=column, value=value)
                    .interrogate()
                )

                # Get the result
                all_passed = validation.all_passed()

                console.print(
                    f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
                )
            elif check == "col-vals-ge":
                # Create validation for values greater than or equal to threshold
                validation = (
                    pb.Validate(
                        data=data,
                        tbl_name=f"Data from {data_source}",
                        label=f"CLI Simple Validation: {check} for column '{column}' >= {value}",
                    )
                    .col_vals_ge(columns=column, value=value)
                    .interrogate()
                )

                # Get the result
                all_passed = validation.all_passed()

                console.print(
                    f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
                )
            elif check == "col-vals-lt":
                # Create validation for values less than threshold
                validation = (
                    pb.Validate(
                        data=data,
                        tbl_name=f"Data from {data_source}",
                        label=f"CLI Simple Validation: {check} for column '{column}' < {value}",
                    )
                    .col_vals_lt(columns=column, value=value)
                    .interrogate()
                )

                # Get the result
                all_passed = validation.all_passed()

                console.print(
                    f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
                )
            elif check == "col-vals-le":
                # Create validation for values less than or equal to threshold
                validation = (
                    pb.Validate(
                        data=data,
                        tbl_name=f"Data from {data_source}",
                        label=f"CLI Simple Validation: {check} for column '{column}' <= {value}",
                    )
                    .col_vals_le(columns=column, value=value)
                    .interrogate()
                )

                # Get the result
                all_passed = validation.all_passed()

                console.print(
                    f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
                )
            else:
                # This shouldn't happen due to click.Choice, but just in case
                console.print(f"[red]Error:[/red] Unknown check type: {check}")
                sys.exit(1)

        # Display results
        from rich.box import SIMPLE_HEAD

        # Create friendly title for table
        if check == "rows-distinct":
            table_title = "Validation Result: Rows Distinct"
        elif check == "col-vals-not-null":
            table_title = "Validation Result: Column Values Not Null"
        elif check == "rows-complete":
            table_title = "Validation Result: Rows Complete"
        elif check == "col-exists":
            table_title = "Validation Result: Column Exists"
        elif check == "col-vals-in-set":
            table_title = "Validation Result: Column Values In Set"
        elif check == "col-vals-gt":
            table_title = "Validation Result: Column Values Greater Than"
        elif check == "col-vals-ge":
            table_title = "Validation Result: Column Values Greater Than Or Equal"
        elif check == "col-vals-lt":
            table_title = "Validation Result: Column Values Less Than"
        elif check == "col-vals-le":
            table_title = "Validation Result: Column Values Less Than Or Equal"
        else:
            table_title = f"Validation Result: {check.replace('-', ' ').title()}"

        result_table = Table(
            title=table_title,
            show_header=True,
            header_style="bold magenta",
            box=SIMPLE_HEAD,
            title_style="bold cyan",
            title_justify="left",
        )
        result_table.add_column("Property", style="cyan", no_wrap=True)
        result_table.add_column("Value", style="white")

        # Add basic info
        result_table.add_row("Data Source", data_source)
        result_table.add_row("Check Type", check)

        # Add column info for column-specific checks
        if check in [
            "col-vals-not-null",
            "col-exists",
            "col-vals-in-set",
            "col-vals-gt",
            "col-vals-ge",
            "col-vals-lt",
            "col-vals-le",
        ]:
            result_table.add_row("Column", column)

        # Add set info for col-vals-in-set check
        if check == "col-vals-in-set":
            allowed_values = [v.strip() for v in set.split(",")]
            result_table.add_row("Allowed Values", ", ".join(allowed_values))

        # Add value info for range checks
        if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]:
            if check == "col-vals-gt":
                operator = ">"
            elif check == "col-vals-ge":
                operator = ">="
            elif check == "col-vals-lt":
                operator = "<"
            elif check == "col-vals-le":
                operator = "<="
            result_table.add_row("Threshold", f"{operator} {value}")

        # Get validation details
        if hasattr(validation, "validation_info") and validation.validation_info:
            step_info = validation.validation_info[0]  # Should only be one step
            result_table.add_row("Total Rows Tested", f"{step_info.n:,}")
            result_table.add_row("Passing Rows", f"{step_info.n_passed:,}")
            result_table.add_row("Failing Rows", f"{step_info.n_failed:,}")

            # Overall result with color coding
            if all_passed:
                result_table.add_row("Result", "[green]✓ PASSED[/green]")
                if check == "rows-distinct":
                    result_table.add_row("Duplicate Rows", "[green]None found[/green]")
                elif check == "col-vals-not-null":
                    result_table.add_row("Null Values", "[green]None found[/green]")
                elif check == "rows-complete":
                    result_table.add_row("Incomplete Rows", "[green]None found[/green]")
                elif check == "col-exists":
                    result_table.add_row("Column Status", "[green]Column exists[/green]")
                elif check == "col-vals-in-set":
                    result_table.add_row(
                        "Values Status", "[green]All values in allowed set[/green]"
                    )
                elif check == "col-vals-gt":
                    result_table.add_row("Values Status", f"[green]All values > {value}[/green]")
                elif check == "col-vals-ge":
                    result_table.add_row("Values Status", f"[green]All values >= {value}[/green]")
            else:
                result_table.add_row("Result", "[red]✗ FAILED[/red]")
                if check == "rows-distinct":
                    result_table.add_row(
                        "Duplicate Rows", f"[red]{step_info.n_failed:,} found[/red]"
                    )
                elif check == "col-vals-not-null":
                    result_table.add_row("Null Values", f"[red]{step_info.n_failed:,} found[/red]")
                elif check == "rows-complete":
                    result_table.add_row(
                        "Incomplete Rows", f"[red]{step_info.n_failed:,} found[/red]"
                    )
                elif check == "col-exists":
                    result_table.add_row("Column Status", "[red]Column does not exist[/red]")
                elif check == "col-vals-in-set":
                    result_table.add_row(
                        "Invalid Values", f"[red]{step_info.n_failed:,} found[/red]"
                    )
                elif check == "col-vals-gt":
                    result_table.add_row(
                        "Invalid Values", f"[red]{step_info.n_failed:,} values <= {value}[/red]"
                    )
                elif check == "col-vals-ge":
                    result_table.add_row(
                        "Invalid Values", f"[red]{step_info.n_failed:,} values < {value}[/red]"
                    )

        console.print()
        console.print(result_table)

        # Show extract if requested and validation failed
        if show_extract and not all_passed:
            console.print()

            # Dynamic message based on check type
            if check == "rows-distinct":
                extract_message = "[yellow]Preview of failing rows (duplicates):[/yellow]"
                row_type = "duplicate rows"
            elif check == "rows-complete":
                extract_message = "[yellow]Preview of failing rows (incomplete rows):[/yellow]"
                row_type = "incomplete rows"
            elif check == "col-exists":
                extract_message = (
                    f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
                )
                row_type = "missing column"
            elif check == "col-vals-in-set":
                extract_message = (
                    f"[yellow]Preview of failing rows (invalid values in '{column}'):[/yellow]"
                )
                row_type = "rows with invalid values"
            elif check == "col-vals-gt":
                extract_message = (
                    f"[yellow]Preview of failing rows (values in '{column}' <= {value}):[/yellow]"
                )
                row_type = f"rows with values <= {value}"
            elif check == "col-vals-ge":
                extract_message = (
                    f"[yellow]Preview of failing rows (values in '{column}' < {value}):[/yellow]"
                )
                row_type = f"rows with values < {value}"
            else:
                extract_message = "[yellow]Preview of failing rows:[/yellow]"
                row_type = "failing rows"

            console.print(extract_message)

            # Special handling for col-exists check - no rows to show when column doesn't exist
            if check == "col-exists" and not all_passed:
                console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
                console.print(
                    "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
                )
            else:
                try:
                    # Get failing rows extract
                    failing_rows = validation.get_data_extracts(i=1, frame=True)
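                    # i=1 is always the right step here: every branch above
                    # builds a single-step validation.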

                    if failing_rows is not None and len(failing_rows) > 0:
                        # Limit the number of rows shown
                        if len(failing_rows) > limit:
                            display_rows = failing_rows.head(limit)
                            console.print(
                                f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
                            )
                        else:
                            display_rows = failing_rows
                            console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")

                        # Create a preview table using pointblank's preview function
                        preview_table = pb.preview(
                            data=display_rows,
                            n_head=min(limit, len(display_rows)),
                            n_tail=0,
                            limit=limit,
                            show_row_numbers=True,
                        )

                        # Display using our Rich table function
                        _rich_print_gt_table(preview_table)
                    else:
                        console.print("[yellow]No failing rows could be extracted[/yellow]")
                except Exception as e:
                    console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")

        # Summary message
        console.print()
        if all_passed:
            if check == "rows-distinct":
                success_message = (
                    f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
                )
            elif check == "col-vals-not-null":
                success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
            elif check == "rows-complete":
                success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
            elif check == "col-exists":
                success_message = (
                    f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
                )
            elif check == "col-vals-in-set":
                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
            elif check == "col-vals-gt":
                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
            elif check == "col-vals-ge":
                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
            elif check == "col-vals-lt":
                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
            elif check == "col-vals-le":
                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
            else:
                success_message = (
                    f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
                )

            console.print(
                Panel(
                    success_message,
                    border_style="green",
                )
            )
        else:
            if hasattr(validation, "validation_info") and validation.validation_info:
                step_info = validation.validation_info[0]

                if check == "rows-distinct":
                    failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
                elif check == "col-vals-not-null":
                    failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
                elif check == "rows-complete":
                    failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
                elif check == "col-exists":
                    failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
                elif check == "col-vals-in-set":
                    failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
                elif check == "col-vals-gt":
                    failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
                elif check == "col-vals-ge":
                    failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
                elif check == "col-vals-lt":
                    failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
                elif check == "col-vals-le":
                    failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
                else:
                    failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"

                # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
                if not show_extract and check != "col-exists":
                    failure_message += (
                        "\n[dim]💡 Tip: Use --show-extract to see the failing rows[/dim]"
|
|
2691
|
+
)
|
|
2692
|
+
|
|
2693
|
+
console.print(
|
|
2694
|
+
Panel(
|
|
2695
|
+
failure_message,
|
|
2696
|
+
border_style="red",
|
|
2697
|
+
)
|
|
2698
|
+
)
|
|
2699
|
+
else:
|
|
2700
|
+
if check == "rows-distinct":
|
|
2701
|
+
failure_message = (
|
|
2702
|
+
f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
|
|
2703
|
+
)
|
|
2704
|
+
elif check == "rows-complete":
|
|
2705
|
+
failure_message = (
|
|
2706
|
+
f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
|
|
2707
|
+
)
|
|
2708
|
+
else:
|
|
2709
|
+
failure_message = (
|
|
2710
|
+
f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
|
|
2711
|
+
)
|
|
2712
|
+
|
|
2713
|
+
# Add hint about --show-extract if not already used
|
|
2714
|
+
if not show_extract:
|
|
2715
|
+
failure_message += (
|
|
2716
|
+
"\n[dim]💡 Tip: Use --show-extract to see the failing rows[/dim]"
|
|
2717
|
+
)
|
|
2718
|
+
|
|
2719
|
+
console.print(
|
|
2720
|
+
Panel(
|
|
2721
|
+
failure_message,
|
|
2722
|
+
border_style="red",
|
|
2723
|
+
)
|
|
2724
|
+
)
|
|
2725
|
+
|
|
2726
|
+
# Exit with appropriate code if requested
|
|
2727
|
+
if exit_code and not all_passed:
|
|
2728
|
+
console.print("[dim]Exiting with non-zero code due to validation failure[/dim]")
|
|
2729
|
+
sys.exit(1)
|
|
2730
|
+
|
|
2731
|
+
except Exception as e:
|
|
2732
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
2733
|
+
sys.exit(1)
|
|
2734
|
+
|
|
2735
|
+
|
|
2736
|
+
if __name__ == "__main__": # pragma: no cover
|
|
2737
|
+
cli()
|
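
Since cli.py ends by invoking cli() and the wheel now ships an entry_points.txt, the messages above are reachable from a console script. A minimal usage sketch, assuming the script is installed as `pb` and that this handler backs a `validate` subcommand whose options mirror the handler's check, column, value, limit, show_extract, and exit_code parameters; `--show-extract` appears verbatim in the tips above, but the other spellings and the file name are inferred, not confirmed by this diff:

# Duplicate-row check on a hypothetical CSV; print failing rows and
# return a non-zero exit status on failure (useful in CI)
pb validate data.csv --check rows-distinct --show-extract --exit-code

# Column range check; 'score' and the flag names are illustrative
pb validate data.csv --check col-vals-gt --column score --value 0 --limit 10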
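The extract-and-preview path in the handler composes two public pointblank calls that appear in this diff, get_data_extracts() and preview(). A standalone sketch of that same pattern, assuming Polars is installed and using a small made-up frame in which one row fails a col_vals_gt step:

import polars as pl

import pointblank as pb

# Hypothetical data: the -1 row fails the a > 0 check
tbl = pl.DataFrame({"a": [1, 2, -1]})

validation = pb.Validate(data=tbl).col_vals_gt(columns="a", value=0).interrogate()

# i=1 selects the first validation step; frame=True returns the extract
# as a dataframe rather than a dict keyed by step number
failing_rows = validation.get_data_extracts(i=1, frame=True)

if failing_rows is not None and len(failing_rows) > 0:
    pb.preview(data=failing_rows, n_head=len(failing_rows), n_tail=0, show_row_numbers=True)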