pointblank 0.10.0__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/assistant.py +14 -3
- pointblank/cli.py +3644 -0
- pointblank/compare.py +9 -0
- pointblank/datascan.py +25 -3
- pointblank/validate.py +346 -37
- {pointblank-0.10.0.dist-info → pointblank-0.11.1.dist-info}/METADATA +52 -1
- {pointblank-0.10.0.dist-info → pointblank-0.11.1.dist-info}/RECORD +11 -9
- pointblank-0.11.1.dist-info/entry_points.txt +2 -0
- {pointblank-0.10.0.dist-info → pointblank-0.11.1.dist-info}/WHEEL +0 -0
- {pointblank-0.10.0.dist-info → pointblank-0.11.1.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.10.0.dist-info → pointblank-0.11.1.dist-info}/top_level.txt +0 -0
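
Notably, this release adds a command-line interface: the new pointblank/cli.py module (shown in full below) plus a new entry_points.txt, which presumably registers the console script (the prog_name="pb" in the code below suggests it is exposed as `pb`). As a minimal, hypothetical sketch of exercising the new `cli` group from Python — using Click's standard CliRunner test harness, which is not part of this diff — one might write:

    from click.testing import CliRunner

    from pointblank.cli import cli  # the click.Group added in 0.11.1

    runner = CliRunner()
    # Invoke the new `preview` command against the bundled small_table dataset;
    # `preview`, `--head`, and the dataset name all appear in the code below.
    result = runner.invoke(cli, ["preview", "small_table", "--head", "3"])
    print(result.output)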
pointblank/cli.py
ADDED
|
@@ -0,0 +1,3644 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
import pointblank as pb
|
|
13
|
+
from pointblank._utils import _get_tbl_type, _is_lib_present
|
|
14
|
+
|
|
15
|
+
console = Console()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class OrderedGroup(click.Group):
|
|
19
|
+
"""A Click Group that displays commands in a custom order."""
|
|
20
|
+
|
|
21
|
+
def list_commands(self, ctx):
|
|
22
|
+
"""Return commands in the desired logical order."""
|
|
23
|
+
# Define the desired order
|
|
24
|
+
desired_order = [
|
|
25
|
+
# Data Discovery/Exploration
|
|
26
|
+
"info",
|
|
27
|
+
"preview",
|
|
28
|
+
"scan",
|
|
29
|
+
"missing",
|
|
30
|
+
# Validation
|
|
31
|
+
"validate",
|
|
32
|
+
"run",
|
|
33
|
+
"make-template",
|
|
34
|
+
# Utilities
|
|
35
|
+
"datasets",
|
|
36
|
+
"requirements",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
# Get all available commands
|
|
40
|
+
available_commands = super().list_commands(ctx)
|
|
41
|
+
|
|
42
|
+
# Return commands in desired order, followed by any not in the list
|
|
43
|
+
ordered = []
|
|
44
|
+
for cmd in desired_order:
|
|
45
|
+
if cmd in available_commands:
|
|
46
|
+
ordered.append(cmd)
|
|
47
|
+
|
|
48
|
+
# Add any commands not in our desired order (safety fallback)
|
|
49
|
+
for cmd in available_commands:
|
|
50
|
+
if cmd not in ordered:
|
|
51
|
+
ordered.append(cmd)
|
|
52
|
+
|
|
53
|
+
return ordered
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _load_data_source(data_source: str) -> Any:
|
|
57
|
+
"""
|
|
58
|
+
Centralized data loading function for CLI that handles all supported data source types.
|
|
59
|
+
|
|
60
|
+
This function provides a consistent way to load data across all CLI commands by leveraging
|
|
61
|
+
the _process_data() utility function and adding support for pointblank dataset names.
|
|
62
|
+
|
|
63
|
+
Parameters
|
|
64
|
+
----------
|
|
65
|
+
data_source : str
|
|
66
|
+
The data source which could be:
|
|
67
|
+
- A pointblank dataset name (small_table, game_revenue, nycflights, global_sales)
|
|
68
|
+
- A GitHub URL pointing to a CSV or Parquet file
|
|
69
|
+
- A database connection string (e.g., "duckdb:///path/to/file.ddb::table_name")
|
|
70
|
+
- A CSV file path (string or Path object with .csv extension)
|
|
71
|
+
- A Parquet file path, glob pattern, directory, or partitioned dataset
|
|
72
|
+
|
|
73
|
+
Returns
|
|
74
|
+
-------
|
|
75
|
+
Any
|
|
76
|
+
Loaded data as a DataFrame or other data object
|
|
77
|
+
|
|
78
|
+
Raises
|
|
79
|
+
------
|
|
80
|
+
ValueError
|
|
81
|
+
If the pointblank dataset name is not recognized
|
|
82
|
+
"""
|
|
83
|
+
# Check if it's a pointblank dataset name first
|
|
84
|
+
if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
85
|
+
return pb.load_dataset(data_source)
|
|
86
|
+
|
|
87
|
+
# Otherwise, use the centralized _process_data() function for all other data sources
|
|
88
|
+
from pointblank.validate import _process_data
|
|
89
|
+
|
|
90
|
+
return _process_data(data_source)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _format_cell_value(
|
|
94
|
+
value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
|
|
95
|
+
) -> str:
|
|
96
|
+
"""Format a cell value for Rich table display, highlighting None/NA values in red.
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
value: The raw cell value from the dataframe
|
|
100
|
+
is_row_number: Whether this is a row number column value
|
|
101
|
+
max_width: Maximum character width for text truncation
|
|
102
|
+
num_columns: Number of columns in the table (affects truncation aggressiveness)
|
|
103
|
+
|
|
104
|
+
Returns:
|
|
105
|
+
Formatted string with Rich markup for None/NA values or row numbers
|
|
106
|
+
"""
|
|
107
|
+
# Special formatting for row numbers: never truncate them
|
|
108
|
+
if is_row_number:
|
|
109
|
+
return f"[dim]{value}[/dim]"
|
|
110
|
+
|
|
111
|
+
# Check for actual None/null values (not string representations)
|
|
112
|
+
if value is None:
|
|
113
|
+
return "[red]None[/red]"
|
|
114
|
+
|
|
115
|
+
# Check for pandas/numpy specific NA values
|
|
116
|
+
try:
|
|
117
|
+
import numpy as np
|
|
118
|
+
import pandas as pd
|
|
119
|
+
|
|
120
|
+
# Check for pandas NA
|
|
121
|
+
if pd.isna(value):
|
|
122
|
+
# If it's specifically numpy.nan, show as NaN
|
|
123
|
+
if isinstance(value, float) and np.isnan(value):
|
|
124
|
+
return "[red]NaN[/red]"
|
|
125
|
+
# If it's pandas NA, show as NA
|
|
126
|
+
elif str(type(value)).find("pandas") != -1:
|
|
127
|
+
return "[red]NA[/red]"
|
|
128
|
+
# Generic NA for other pandas missing values
|
|
129
|
+
else:
|
|
130
|
+
return "[red]NA[/red]"
|
|
131
|
+
|
|
132
|
+
except (ImportError, TypeError, ValueError): # pragma: no cover
|
|
133
|
+
# If pandas/numpy not available, value not compatible, or ambiguous array
|
|
134
|
+
pass
|
|
135
|
+
|
|
136
|
+
# Check for empty strings (but only actual empty strings, not whitespace)
|
|
137
|
+
if isinstance(value, str) and value == "":
|
|
138
|
+
return "[red][/red]" # Empty string shown as red empty space
|
|
139
|
+
|
|
140
|
+
# Convert to string and apply intelligent truncation
|
|
141
|
+
str_value = str(value)
|
|
142
|
+
|
|
143
|
+
# Adjust max_width based on number of columns to prevent overly wide tables
|
|
144
|
+
if num_columns > 15:
|
|
145
|
+
adjusted_max_width = min(max_width, 30) # Be more aggressive with many columns
|
|
146
|
+
elif num_columns > 10:
|
|
147
|
+
adjusted_max_width = min(max_width, 40)
|
|
148
|
+
else:
|
|
149
|
+
adjusted_max_width = max_width
|
|
150
|
+
|
|
151
|
+
# Apply truncation if the string is too long
|
|
152
|
+
if len(str_value) > adjusted_max_width:
|
|
153
|
+
# For very long text, truncate more aggressively
|
|
154
|
+
if len(str_value) > adjusted_max_width * 2:
|
|
155
|
+
# For extremely long text, use a shorter truncation
|
|
156
|
+
truncated = str_value[: adjusted_max_width // 2] + "…"
|
|
157
|
+
else:
|
|
158
|
+
# For moderately long text, use a more generous truncation
|
|
159
|
+
truncated = str_value[: adjusted_max_width - 1] + "…"
|
|
160
|
+
|
|
161
|
+
return truncated
|
|
162
|
+
|
|
163
|
+
return str_value
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _get_column_dtypes(df: Any, columns: list[str]) -> dict[str, str]:
|
|
167
|
+
"""Extract data types for columns and format them in a compact way.
|
|
168
|
+
|
|
169
|
+
Args:
|
|
170
|
+
df: The dataframe object
|
|
171
|
+
columns: List of column names
|
|
172
|
+
|
|
173
|
+
Returns:
|
|
174
|
+
Dictionary mapping column names to formatted data type strings
|
|
175
|
+
"""
|
|
176
|
+
dtypes_dict = {}
|
|
177
|
+
|
|
178
|
+
try:
|
|
179
|
+
if hasattr(df, "dtypes"):
|
|
180
|
+
# Polars/Pandas style
|
|
181
|
+
if hasattr(df.dtypes, "to_dict"):
|
|
182
|
+
# Polars DataFrame dtypes
|
|
183
|
+
raw_dtypes = df.dtypes.to_dict() if hasattr(df.dtypes, "to_dict") else {}
|
|
184
|
+
for col in columns:
|
|
185
|
+
if col in raw_dtypes:
|
|
186
|
+
dtype_str = str(raw_dtypes[col])
|
|
187
|
+
# Convert to compact format similar to Polars glimpse()
|
|
188
|
+
dtypes_dict[col] = _format_dtype_compact(dtype_str)
|
|
189
|
+
else:
|
|
190
|
+
dtypes_dict[col] = "?"
|
|
191
|
+
else:
|
|
192
|
+
# Pandas DataFrame dtypes (Series-like)
|
|
193
|
+
for i, col in enumerate(columns):
|
|
194
|
+
if i < len(df.dtypes):
|
|
195
|
+
dtype_str = str(
|
|
196
|
+
df.dtypes.iloc[i] if hasattr(df.dtypes, "iloc") else df.dtypes[i]
|
|
197
|
+
)
|
|
198
|
+
dtypes_dict[col] = _format_dtype_compact(dtype_str)
|
|
199
|
+
else:
|
|
200
|
+
dtypes_dict[col] = "?"
|
|
201
|
+
elif hasattr(df, "schema"):
|
|
202
|
+
# Other schema-based systems (e.g., Ibis)
|
|
203
|
+
schema = df.schema
|
|
204
|
+
if hasattr(schema, "to_dict"): # pragma: no cover
|
|
205
|
+
raw_dtypes = schema.to_dict()
|
|
206
|
+
for col in columns:
|
|
207
|
+
if col in raw_dtypes:
|
|
208
|
+
dtypes_dict[col] = _format_dtype_compact(str(raw_dtypes[col]))
|
|
209
|
+
else: # pragma: no cover
|
|
210
|
+
dtypes_dict[col] = "?"
|
|
211
|
+
else: # pragma: no cover
|
|
212
|
+
for col in columns:
|
|
213
|
+
try:
|
|
214
|
+
dtype_str = str(getattr(schema, col, "Unknown"))
|
|
215
|
+
dtypes_dict[col] = _format_dtype_compact(dtype_str)
|
|
216
|
+
except Exception: # pragma: no cover
|
|
217
|
+
dtypes_dict[col] = "?"
|
|
218
|
+
else:
|
|
219
|
+
# Fallback: no type information available
|
|
220
|
+
for col in columns:
|
|
221
|
+
dtypes_dict[col] = "?"
|
|
222
|
+
|
|
223
|
+
except Exception: # pragma: no cover
|
|
224
|
+
# If any error occurs, fall back to unknown types
|
|
225
|
+
for col in columns:
|
|
226
|
+
dtypes_dict[col] = "?"
|
|
227
|
+
|
|
228
|
+
return dtypes_dict
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _format_dtype_compact(dtype_str: str) -> str:
|
|
232
|
+
"""Format a data type string to a compact representation.
|
|
233
|
+
|
|
234
|
+
Args:
|
|
235
|
+
dtype_str: The raw data type string
|
|
236
|
+
|
|
237
|
+
Returns:
|
|
238
|
+
Compact formatted data type string
|
|
239
|
+
"""
|
|
240
|
+
# Remove common prefixes and make compact
|
|
241
|
+
dtype_str = dtype_str.lower()
|
|
242
|
+
|
|
243
|
+
# Polars types
|
|
244
|
+
if "utf8" in dtype_str or "string" in dtype_str:
|
|
245
|
+
return "str"
|
|
246
|
+
elif "int64" in dtype_str:
|
|
247
|
+
return "i64"
|
|
248
|
+
elif "int32" in dtype_str:
|
|
249
|
+
return "i32"
|
|
250
|
+
elif "float64" in dtype_str:
|
|
251
|
+
return "f64"
|
|
252
|
+
elif "float32" in dtype_str:
|
|
253
|
+
return "f32"
|
|
254
|
+
elif "boolean" in dtype_str or "bool" in dtype_str:
|
|
255
|
+
return "bool"
|
|
256
|
+
elif "datetime" in dtype_str:
|
|
257
|
+
return "datetime"
|
|
258
|
+
elif "date" in dtype_str and "datetime" not in dtype_str:
|
|
259
|
+
return "date"
|
|
260
|
+
elif "time" in dtype_str:
|
|
261
|
+
return "time"
|
|
262
|
+
|
|
263
|
+
# Pandas types
|
|
264
|
+
elif "object" in dtype_str:
|
|
265
|
+
return "obj"
|
|
266
|
+
elif "category" in dtype_str:
|
|
267
|
+
return "cat"
|
|
268
|
+
|
|
269
|
+
# Generic fallbacks
|
|
270
|
+
elif "int" in dtype_str:
|
|
271
|
+
return "int"
|
|
272
|
+
elif "float" in dtype_str:
|
|
273
|
+
return "float"
|
|
274
|
+
elif "str" in dtype_str:
|
|
275
|
+
return "str"
|
|
276
|
+
|
|
277
|
+
# Unknown or complex types - truncate if too long
|
|
278
|
+
elif len(dtype_str) > 8:
|
|
279
|
+
return dtype_str[:8] + "…"
|
|
280
|
+
else:
|
|
281
|
+
return dtype_str
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _rich_print_scan_table(
|
|
285
|
+
scan_result: Any,
|
|
286
|
+
data_source: str,
|
|
287
|
+
source_type: str,
|
|
288
|
+
table_type: str,
|
|
289
|
+
total_rows: int | None = None,
|
|
290
|
+
total_columns: int | None = None,
|
|
291
|
+
) -> None:
|
|
292
|
+
"""
|
|
293
|
+
Display scan results as a Rich table in the terminal with statistical measures.
|
|
294
|
+
|
|
295
|
+
Args:
|
|
296
|
+
scan_result: The GT object from col_summary_tbl()
|
|
297
|
+
data_source: Name of the data source being scanned
|
|
298
|
+
source_type: Type of data source (e.g., "Pointblank dataset: small_table")
|
|
299
|
+
table_type: Type of table (e.g., "polars.LazyFrame")
|
|
300
|
+
total_rows: Total number of rows in the dataset
|
|
301
|
+
total_columns: Total number of columns in the dataset
|
|
302
|
+
"""
|
|
303
|
+
try:
|
|
304
|
+
import re
|
|
305
|
+
|
|
306
|
+
import narwhals as nw
|
|
307
|
+
from rich.box import SIMPLE_HEAD
|
|
308
|
+
|
|
309
|
+
# Extract the underlying DataFrame from the GT object
|
|
310
|
+
# The GT object has a _tbl_data attribute that contains the DataFrame
|
|
311
|
+
gt_data = scan_result._tbl_data
|
|
312
|
+
|
|
313
|
+
# Convert to Narwhals DataFrame for consistent handling
|
|
314
|
+
nw_data = nw.from_native(gt_data)
|
|
315
|
+
|
|
316
|
+
# Convert to dictionary for easier access
|
|
317
|
+
data_dict = nw_data.to_dict(as_series=False)
|
|
318
|
+
|
|
319
|
+
# Create main scan table with missing data table styling
|
|
320
|
+
# Create a comprehensive title with data source, source type, and table type
|
|
321
|
+
title_text = f"Column Summary / {source_type} / {table_type}"
|
|
322
|
+
|
|
323
|
+
# Add dimensions subtitle in gray if available
|
|
324
|
+
if total_rows is not None and total_columns is not None:
|
|
325
|
+
title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
326
|
+
|
|
327
|
+
scan_table = Table(
|
|
328
|
+
title=title_text,
|
|
329
|
+
show_header=True,
|
|
330
|
+
header_style="bold magenta",
|
|
331
|
+
box=SIMPLE_HEAD,
|
|
332
|
+
title_style="bold cyan",
|
|
333
|
+
title_justify="left",
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
# Add columns with specific styling and appropriate widths
|
|
337
|
+
scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
|
|
338
|
+
scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
|
|
339
|
+
scan_table.add_column(
|
|
340
|
+
"NA", style="red", width=6, justify="right"
|
|
341
|
+
) # Adjusted for better formatting
|
|
342
|
+
scan_table.add_column(
|
|
343
|
+
"UQ", style="green", width=8, justify="right"
|
|
344
|
+
) # Adjusted for boolean values
|
|
345
|
+
|
|
346
|
+
# Add statistical columns if they exist with appropriate widths
|
|
347
|
+
stat_columns = []
|
|
348
|
+
column_mapping = {
|
|
349
|
+
"mean": ("Mean", "blue", 9),
|
|
350
|
+
"std": ("SD", "blue", 9),
|
|
351
|
+
"min": ("Min", "yellow", 9),
|
|
352
|
+
"median": ("Med", "yellow", 9),
|
|
353
|
+
"max": ("Max", "yellow", 9),
|
|
354
|
+
"q_1": ("Q₁", "magenta", 8),
|
|
355
|
+
"q_3": ("Q₃", "magenta", 9),
|
|
356
|
+
"iqr": ("IQR", "magenta", 8),
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
for col_key, (display_name, color, width) in column_mapping.items():
|
|
360
|
+
if col_key in data_dict:
|
|
361
|
+
scan_table.add_column(display_name, style=color, width=width, justify="right")
|
|
362
|
+
stat_columns.append(col_key)
|
|
363
|
+
|
|
364
|
+
# Helper function to extract column name and type from HTML
|
|
365
|
+
def extract_column_info(html_content: str) -> tuple[str, str]:
|
|
366
|
+
"""Extract column name and type from HTML formatted content."""
|
|
367
|
+
# Extract column name from first div
|
|
368
|
+
name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
|
|
369
|
+
column_name = name_match.group(1) if name_match else "Unknown"
|
|
370
|
+
|
|
371
|
+
# Extract data type from second div (with gray color)
|
|
372
|
+
type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
|
|
373
|
+
if type_match:
|
|
374
|
+
data_type = type_match.group(1)
|
|
375
|
+
# Convert to compact format using the existing function
|
|
376
|
+
compact_type = _format_dtype_compact(data_type)
|
|
377
|
+
data_type = compact_type
|
|
378
|
+
else:
|
|
379
|
+
data_type = "unknown"
|
|
380
|
+
|
|
381
|
+
return column_name, data_type
|
|
382
|
+
|
|
383
|
+
# Helper function to format values with improved number formatting
|
|
384
|
+
def format_value(
|
|
385
|
+
value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
|
|
386
|
+
) -> str:
|
|
387
|
+
"""Format values for display with smart number formatting and HTML cleanup."""
|
|
388
|
+
if value is None or (isinstance(value, str) and value.strip() == ""):
|
|
389
|
+
return "[dim]—[/dim]"
|
|
390
|
+
|
|
391
|
+
# Handle missing values indicator
|
|
392
|
+
if is_missing and str(value) == "0":
|
|
393
|
+
return "[green]●[/green]" # No missing values
|
|
394
|
+
|
|
395
|
+
# Clean up HTML formatting from the raw data
|
|
396
|
+
str_val = str(value)
|
|
397
|
+
|
|
398
|
+
# Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
|
|
399
|
+
if "<br>" in str_val:
|
|
400
|
+
str_val = str_val.split("<br>")[0].strip()
|
|
401
|
+
# For unique values, we want just the integer part
|
|
402
|
+
if is_unique:
|
|
403
|
+
try:
|
|
404
|
+
# Try to extract just the integer part for unique counts
|
|
405
|
+
num_val = float(str_val)
|
|
406
|
+
return str(int(num_val))
|
|
407
|
+
except (ValueError, TypeError):
|
|
408
|
+
pass
|
|
409
|
+
|
|
410
|
+
# Now handle HTML content (especially from boolean unique values)
|
|
411
|
+
if "<" in str_val and ">" in str_val:
|
|
412
|
+
# Remove HTML tags completely for cleaner display
|
|
413
|
+
str_val = re.sub(r"<[^>]+>", "", str_val).strip()
|
|
414
|
+
# Clean up extra whitespace
|
|
415
|
+
str_val = re.sub(r"\s+", " ", str_val).strip()
|
|
416
|
+
|
|
417
|
+
# Handle values like "2<.01" - extract the first number
|
|
418
|
+
if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
|
|
419
|
+
# Extract number before the < symbol
|
|
420
|
+
before_lt = str_val.split("<")[0].strip()
|
|
421
|
+
if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
|
|
422
|
+
str_val = before_lt
|
|
423
|
+
|
|
424
|
+
# Handle boolean unique values like "T0.62F0.38" - extract the more readable format
|
|
425
|
+
if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
|
|
426
|
+
# Extract T and F values
|
|
427
|
+
t_match = re.search(r"T(\d+\.\d+)", str_val)
|
|
428
|
+
f_match = re.search(r"F(\d+\.\d+)", str_val)
|
|
429
|
+
if t_match and f_match:
|
|
430
|
+
t_val = float(t_match.group(1))
|
|
431
|
+
f_val = float(f_match.group(1))
|
|
432
|
+
# Show as "T0.62F0.38" but truncated if needed
|
|
433
|
+
formatted = f"T{t_val:.2f}F{f_val:.2f}"
|
|
434
|
+
if len(formatted) > max_width:
|
|
435
|
+
# Truncate to fit, showing dominant value
|
|
436
|
+
if t_val > f_val:
|
|
437
|
+
return f"T{t_val:.1f}"
|
|
438
|
+
else:
|
|
439
|
+
return f"F{f_val:.1f}"
|
|
440
|
+
return formatted
|
|
441
|
+
|
|
442
|
+
# Try to parse as a number for better formatting
|
|
443
|
+
try:
|
|
444
|
+
# Try to convert to float first
|
|
445
|
+
num_val = float(str_val)
|
|
446
|
+
|
|
447
|
+
# Handle special cases
|
|
448
|
+
if num_val == 0:
|
|
449
|
+
return "0"
|
|
450
|
+
elif abs(num_val) == int(abs(num_val)) and abs(num_val) < 10000:
|
|
451
|
+
# Simple integers under 10000
|
|
452
|
+
return str(int(num_val))
|
|
453
|
+
elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
|
|
454
|
+
# Likely dates in YYYYMMDD format - format as date-like
|
|
455
|
+
int_val = int(num_val)
|
|
456
|
+
if 19000101 <= int_val <= 29991231: # Reasonable date range
|
|
457
|
+
str_date = str(int_val)
|
|
458
|
+
if len(str_date) == 8:
|
|
459
|
+
return (
|
|
460
|
+
f"{str_date[:4]}-{str_date[4:6]}-{str_date[6:]}"[: max_width - 1]
|
|
461
|
+
+ "…"
|
|
462
|
+
)
|
|
463
|
+
# Otherwise treat as large number
|
|
464
|
+
return f"{num_val / 1000000:.1f}M"
|
|
465
|
+
elif abs(num_val) >= 1000000:
|
|
466
|
+
# Large numbers - use scientific notation or M/k notation
|
|
467
|
+
|
|
468
|
+
if abs(num_val) >= 1000000000:
|
|
469
|
+
return f"{num_val:.1e}"
|
|
470
|
+
else:
|
|
471
|
+
return f"{num_val / 1000000:.1f}M"
|
|
472
|
+
elif abs(num_val) >= 10000:
|
|
473
|
+
# Numbers >= 10k - use compact notation
|
|
474
|
+
return f"{num_val / 1000:.1f}k"
|
|
475
|
+
elif abs(num_val) >= 100:
|
|
476
|
+
# Numbers 100-9999 - show with minimal decimals
|
|
477
|
+
return f"{num_val:.1f}"
|
|
478
|
+
elif abs(num_val) >= 10:
|
|
479
|
+
# Numbers 10-99 - show with one decimal
|
|
480
|
+
return f"{num_val:.1f}"
|
|
481
|
+
elif abs(num_val) >= 1:
|
|
482
|
+
# Numbers 1-9 - show with two decimals
|
|
483
|
+
return f"{num_val:.2f}"
|
|
484
|
+
elif abs(num_val) >= 0.01:
|
|
485
|
+
# Small numbers - show with appropriate precision
|
|
486
|
+
return f"{num_val:.2f}"
|
|
487
|
+
else:
|
|
488
|
+
# Very small numbers - use scientific notation
|
|
489
|
+
|
|
490
|
+
return f"{num_val:.1e}"
|
|
491
|
+
|
|
492
|
+
except (ValueError, TypeError):
|
|
493
|
+
# Not a number, handle as string
|
|
494
|
+
pass
|
|
495
|
+
|
|
496
|
+
# Handle date/datetime strings - show abbreviated format
|
|
497
|
+
if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
|
|
498
|
+
# Likely a date/datetime, show abbreviated
|
|
499
|
+
if len(str_val) > max_width:
|
|
500
|
+
return str_val[: max_width - 1] + "…"
|
|
501
|
+
|
|
502
|
+
# General string truncation with ellipsis
|
|
503
|
+
if len(str_val) > max_width:
|
|
504
|
+
return str_val[: max_width - 1] + "…"
|
|
505
|
+
|
|
506
|
+
return str_val
|
|
507
|
+
|
|
508
|
+
# Populate table rows
|
|
509
|
+
num_rows = len(data_dict["colname"])
|
|
510
|
+
for i in range(num_rows):
|
|
511
|
+
row_data = []
|
|
512
|
+
|
|
513
|
+
# Column name and type from HTML content
|
|
514
|
+
colname_html = data_dict["colname"][i]
|
|
515
|
+
column_name, data_type = extract_column_info(colname_html)
|
|
516
|
+
row_data.append(column_name)
|
|
517
|
+
row_data.append(data_type)
|
|
518
|
+
|
|
519
|
+
# Missing values (NA)
|
|
520
|
+
missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
|
|
521
|
+
row_data.append(format_value(missing_val, is_missing=True, max_width=6))
|
|
522
|
+
|
|
523
|
+
# Unique values (UQ)
|
|
524
|
+
unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
|
|
525
|
+
row_data.append(format_value(unique_val, is_unique=True, max_width=8))
|
|
526
|
+
|
|
527
|
+
# Statistical columns
|
|
528
|
+
for stat_col in stat_columns:
|
|
529
|
+
stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
|
|
530
|
+
# Use appropriate width based on column type
|
|
531
|
+
if stat_col in ["q_1", "iqr"]:
|
|
532
|
+
width = 8
|
|
533
|
+
elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
|
|
534
|
+
width = 9
|
|
535
|
+
else:
|
|
536
|
+
width = 8
|
|
537
|
+
row_data.append(format_value(stat_val, max_width=width))
|
|
538
|
+
|
|
539
|
+
scan_table.add_row(*row_data)
|
|
540
|
+
|
|
541
|
+
# Display the results
|
|
542
|
+
console.print()
|
|
543
|
+
console.print(scan_table)
|
|
544
|
+
|
|
545
|
+
except Exception as e:
|
|
546
|
+
# Fallback to simple message if table creation fails
|
|
547
|
+
console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
|
|
548
|
+
console.print(f"[red]Error displaying table: {str(e)}[/red]")
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def _rich_print_gt_table(
|
|
552
|
+
gt_table: Any, preview_info: dict | None = None, show_summary: bool = True
|
|
553
|
+
) -> None:
|
|
554
|
+
"""Convert a GT table to Rich table and display it in the terminal.
|
|
555
|
+
|
|
556
|
+
Args:
|
|
557
|
+
gt_table: The GT table object to display
|
|
558
|
+
preview_info: Optional dict with preview context info:
|
|
559
|
+
- total_rows: Total rows in the dataset
|
|
560
|
+
- head_rows: Number of head rows shown
|
|
561
|
+
- tail_rows: Number of tail rows shown
|
|
562
|
+
- is_complete: Whether the entire dataset is shown
|
|
563
|
+
show_summary: Whether to show the row count summary at the bottom
|
|
564
|
+
"""
|
|
565
|
+
try:
|
|
566
|
+
# Try to extract the underlying data from the GT table
|
|
567
|
+
df = None
|
|
568
|
+
|
|
569
|
+
# Great Tables stores the original data in different places depending on how it was created
|
|
570
|
+
# Let's try multiple approaches to get the data
|
|
571
|
+
if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
|
|
572
|
+
df = gt_table._tbl_data
|
|
573
|
+
elif (
|
|
574
|
+
hasattr(gt_table, "_body")
|
|
575
|
+
and hasattr(gt_table._body, "body")
|
|
576
|
+
and gt_table._body.body is not None
|
|
577
|
+
):
|
|
578
|
+
df = gt_table._body.body
|
|
579
|
+
elif hasattr(gt_table, "_data") and gt_table._data is not None:
|
|
580
|
+
df = gt_table._data
|
|
581
|
+
elif hasattr(gt_table, "data") and gt_table.data is not None:
|
|
582
|
+
df = gt_table.data
|
|
583
|
+
|
|
584
|
+
if df is not None:
|
|
585
|
+
# Create a Rich table with horizontal lines
|
|
586
|
+
from rich.box import SIMPLE_HEAD
|
|
587
|
+
|
|
588
|
+
# Create enhanced title if preview_info contains metadata
|
|
589
|
+
table_title = None
|
|
590
|
+
if preview_info and "source_type" in preview_info and "table_type" in preview_info:
|
|
591
|
+
source_type = preview_info["source_type"]
|
|
592
|
+
table_type = preview_info["table_type"]
|
|
593
|
+
table_title = f"Data Preview / {source_type} / {table_type}"
|
|
594
|
+
|
|
595
|
+
rich_table = Table(
|
|
596
|
+
title=table_title,
|
|
597
|
+
show_header=True,
|
|
598
|
+
header_style="bold magenta",
|
|
599
|
+
box=SIMPLE_HEAD,
|
|
600
|
+
title_style="bold cyan",
|
|
601
|
+
title_justify="left",
|
|
602
|
+
)
|
|
603
|
+
|
|
604
|
+
# Get column names
|
|
605
|
+
columns = []
|
|
606
|
+
if hasattr(df, "columns"):
|
|
607
|
+
columns = list(df.columns)
|
|
608
|
+
elif hasattr(df, "schema"): # pragma: no cover
|
|
609
|
+
columns = list(df.schema.names)
|
|
610
|
+
elif hasattr(df, "column_names"): # pragma: no cover
|
|
611
|
+
columns = list(df.column_names)
|
|
612
|
+
|
|
613
|
+
if not columns: # pragma: no cover
|
|
614
|
+
# Fallback: try to determine columns from first row
|
|
615
|
+
try:
|
|
616
|
+
if hasattr(df, "to_dicts") and len(df) > 0:
|
|
617
|
+
first_dict = df.to_dicts()[0]
|
|
618
|
+
columns = list(first_dict.keys())
|
|
619
|
+
elif hasattr(df, "to_dict") and len(df) > 0:
|
|
620
|
+
first_dict = df.to_dict("records")[0]
|
|
621
|
+
columns = list(first_dict.keys())
|
|
622
|
+
except Exception: # pragma: no cover
|
|
623
|
+
columns = [f"Column {i + 1}" for i in range(10)] # Default fallback
|
|
624
|
+
|
|
625
|
+
# Add columns to Rich table
|
|
626
|
+
# Handle wide tables by limiting columns displayed
|
|
627
|
+
max_terminal_cols = 15 # Reasonable limit for terminal display
|
|
628
|
+
|
|
629
|
+
# Get terminal width to adjust column behavior
|
|
630
|
+
try:
|
|
631
|
+
terminal_width = console.size.width
|
|
632
|
+
# Estimate max column width based on terminal size and number of columns
|
|
633
|
+
if len(columns) <= 5:
|
|
634
|
+
max_col_width = min(60, terminal_width // 4)
|
|
635
|
+
elif len(columns) <= 10:
|
|
636
|
+
max_col_width = min(40, terminal_width // 6)
|
|
637
|
+
else:
|
|
638
|
+
max_col_width = min(30, terminal_width // 8)
|
|
639
|
+
except Exception: # pragma: no cover
|
|
640
|
+
# Fallback if we can't get terminal width
|
|
641
|
+
max_col_width = 40 if len(columns) <= 10 else 25
|
|
642
|
+
|
|
643
|
+
if len(columns) > max_terminal_cols:
|
|
644
|
+
# For wide tables, show first few, middle indicator, and last few columns
|
|
645
|
+
first_cols = 7
|
|
646
|
+
last_cols = 7
|
|
647
|
+
|
|
648
|
+
display_columns = columns[:first_cols] + ["...more..."] + columns[-last_cols:]
|
|
649
|
+
|
|
650
|
+
console.print(
|
|
651
|
+
f"\n[yellow]⚠ Table has {len(columns)} columns. Showing first {first_cols} and last {last_cols} columns.[/yellow]"
|
|
652
|
+
)
|
|
653
|
+
console.print("[dim]Use --columns to specify which columns to display.[/dim]")
|
|
654
|
+
console.print(
|
|
655
|
+
f"[dim]Full column list: {', '.join(columns[:5])}...{', '.join(columns[-5:])}[/dim]\n"
|
|
656
|
+
)
|
|
657
|
+
else:
|
|
658
|
+
display_columns = columns
|
|
659
|
+
|
|
660
|
+
# Get data types for columns
|
|
661
|
+
dtypes_dict = _get_column_dtypes(df, columns)
|
|
662
|
+
|
|
663
|
+
# Calculate row number column width if needed
|
|
664
|
+
row_num_width = 6 # Default width
|
|
665
|
+
if "_row_num_" in columns:
|
|
666
|
+
try:
|
|
667
|
+
# Get the maximum row number to calculate appropriate width
|
|
668
|
+
if hasattr(df, "to_dicts"):
|
|
669
|
+
data_dict = df.to_dicts()
|
|
670
|
+
if data_dict:
|
|
671
|
+
row_nums = [row.get("_row_num_", 0) for row in data_dict]
|
|
672
|
+
max_row_num = max(row_nums) if row_nums else 0
|
|
673
|
+
row_num_width = max(len(str(max_row_num)) + 1, 6) # +1 for padding
|
|
674
|
+
elif hasattr(df, "to_dict"):
|
|
675
|
+
data_dict = df.to_dict("records")
|
|
676
|
+
if data_dict:
|
|
677
|
+
row_nums = [row.get("_row_num_", 0) for row in data_dict]
|
|
678
|
+
max_row_num = max(row_nums) if row_nums else 0
|
|
679
|
+
row_num_width = max(len(str(max_row_num)) + 1, 6) # +1 for padding
|
|
680
|
+
except Exception: # pragma: no cover
|
|
681
|
+
# If we can't determine max row number, use default
|
|
682
|
+
row_num_width = 8 # Slightly larger default for safety
|
|
683
|
+
|
|
684
|
+
for i, col in enumerate(display_columns):
|
|
685
|
+
if col == "...more...":
|
|
686
|
+
# Add a special indicator column
|
|
687
|
+
rich_table.add_column("···", style="dim", width=3, no_wrap=True)
|
|
688
|
+
else:
|
|
689
|
+
# Handle row number column specially
|
|
690
|
+
if col == "_row_num_":
|
|
691
|
+
# Row numbers get no header, right alignment, and dim gray style
|
|
692
|
+
# Use dynamic width to prevent truncation
|
|
693
|
+
rich_table.add_column(
|
|
694
|
+
"", style="dim", justify="right", no_wrap=True, width=row_num_width
|
|
695
|
+
)
|
|
696
|
+
else:
|
|
697
|
+
display_col = str(col)
|
|
698
|
+
|
|
699
|
+
# Get data type for this column (if available)
|
|
700
|
+
if col in dtypes_dict:
|
|
701
|
+
dtype_display = f"<{dtypes_dict[col]}>"
|
|
702
|
+
# Create header with column name and data type
|
|
703
|
+
header_text = f"{display_col}\n[dim yellow]{dtype_display}[/dim yellow]"
|
|
704
|
+
else:
|
|
705
|
+
header_text = display_col
|
|
706
|
+
|
|
707
|
+
rich_table.add_column(
|
|
708
|
+
header_text,
|
|
709
|
+
style="cyan",
|
|
710
|
+
no_wrap=False,
|
|
711
|
+
overflow="ellipsis",
|
|
712
|
+
max_width=max_col_width,
|
|
713
|
+
)
|
|
714
|
+
|
|
715
|
+
# Convert data to list of rows
|
|
716
|
+
rows = []
|
|
717
|
+
try:
|
|
718
|
+
if hasattr(df, "to_dicts"):
|
|
719
|
+
# Polars interface
|
|
720
|
+
data_dict = df.to_dicts()
|
|
721
|
+
if len(columns) > max_terminal_cols:
|
|
722
|
+
# For wide tables, extract only the displayed columns
|
|
723
|
+
display_data_columns = (
|
|
724
|
+
columns[:7] + columns[-7:]
|
|
725
|
+
) # Skip the "...more..." placeholder
|
|
726
|
+
rows = [
|
|
727
|
+
[
|
|
728
|
+
_format_cell_value(
|
|
729
|
+
row.get(col, ""),
|
|
730
|
+
is_row_number=(col == "_row_num_"),
|
|
731
|
+
max_width=max_col_width,
|
|
732
|
+
num_columns=len(columns),
|
|
733
|
+
)
|
|
734
|
+
for col in display_data_columns
|
|
735
|
+
]
|
|
736
|
+
for row in data_dict
|
|
737
|
+
]
|
|
738
|
+
# Add the "..." column in the middle
|
|
739
|
+
for i, row in enumerate(rows):
|
|
740
|
+
rows[i] = row[:7] + ["···"] + row[7:]
|
|
741
|
+
else:
|
|
742
|
+
rows = [
|
|
743
|
+
[
|
|
744
|
+
_format_cell_value(
|
|
745
|
+
row.get(col, ""),
|
|
746
|
+
is_row_number=(col == "_row_num_"),
|
|
747
|
+
max_width=max_col_width,
|
|
748
|
+
num_columns=len(columns),
|
|
749
|
+
)
|
|
750
|
+
for col in columns
|
|
751
|
+
]
|
|
752
|
+
for row in data_dict
|
|
753
|
+
]
|
|
754
|
+
elif hasattr(df, "to_dict"):
|
|
755
|
+
# Pandas-like interface
|
|
756
|
+
data_dict = df.to_dict("records")
|
|
757
|
+
if len(columns) > max_terminal_cols:
|
|
758
|
+
# For wide tables, extract only the displayed columns
|
|
759
|
+
display_data_columns = columns[:7] + columns[-7:]
|
|
760
|
+
rows = [
|
|
761
|
+
[
|
|
762
|
+
_format_cell_value(
|
|
763
|
+
row.get(col, ""),
|
|
764
|
+
is_row_number=(col == "_row_num_"),
|
|
765
|
+
max_width=max_col_width,
|
|
766
|
+
num_columns=len(columns),
|
|
767
|
+
)
|
|
768
|
+
for col in display_data_columns
|
|
769
|
+
]
|
|
770
|
+
for row in data_dict
|
|
771
|
+
]
|
|
772
|
+
# Add the "..." column in the middle
|
|
773
|
+
for i, row in enumerate(rows):
|
|
774
|
+
rows[i] = row[:7] + ["···"] + row[7:]
|
|
775
|
+
else:
|
|
776
|
+
rows = [
|
|
777
|
+
[
|
|
778
|
+
_format_cell_value(
|
|
779
|
+
row.get(col, ""),
|
|
780
|
+
is_row_number=(col == "_row_num_"),
|
|
781
|
+
max_width=max_col_width,
|
|
782
|
+
num_columns=len(columns),
|
|
783
|
+
)
|
|
784
|
+
for col in columns
|
|
785
|
+
]
|
|
786
|
+
for row in data_dict
|
|
787
|
+
]
|
|
788
|
+
elif hasattr(df, "iter_rows"):
|
|
789
|
+
# Polars lazy frame
|
|
790
|
+
rows = [
|
|
791
|
+
[
|
|
792
|
+
_format_cell_value(
|
|
793
|
+
val,
|
|
794
|
+
is_row_number=(i == 0 and columns[0] == "_row_num_"),
|
|
795
|
+
max_width=max_col_width,
|
|
796
|
+
num_columns=len(columns),
|
|
797
|
+
)
|
|
798
|
+
for i, val in enumerate(row)
|
|
799
|
+
]
|
|
800
|
+
for row in df.iter_rows()
|
|
801
|
+
]
|
|
802
|
+
elif hasattr(df, "__iter__"):
|
|
803
|
+
# Try to iterate directly
|
|
804
|
+
rows = [
|
|
805
|
+
[
|
|
806
|
+
_format_cell_value(
|
|
807
|
+
val,
|
|
808
|
+
is_row_number=(i == 0 and columns[0] == "_row_num_"),
|
|
809
|
+
max_width=max_col_width,
|
|
810
|
+
num_columns=len(columns),
|
|
811
|
+
)
|
|
812
|
+
for i, val in enumerate(row)
|
|
813
|
+
]
|
|
814
|
+
for row in df
|
|
815
|
+
]
|
|
816
|
+
else:
|
|
817
|
+
rows = [["Could not extract data from this format"]] # pragma: no cover
|
|
818
|
+
except Exception as e:
|
|
819
|
+
rows = [[f"Error extracting data: {e}"]] # pragma: no cover
|
|
820
|
+
|
|
821
|
+
# Add rows to Rich table with separator between head and tail
|
|
822
|
+
max_rows = 50 # Reasonable limit for terminal display
|
|
823
|
+
|
|
824
|
+
# Get preview info to determine head/tail separation
|
|
825
|
+
head_rows_count = 0
|
|
826
|
+
tail_rows_count = 0
|
|
827
|
+
total_dataset_rows = 0
|
|
828
|
+
|
|
829
|
+
if preview_info:
|
|
830
|
+
head_rows_count = preview_info.get("head_rows", 0)
|
|
831
|
+
tail_rows_count = preview_info.get("tail_rows", 0)
|
|
832
|
+
total_dataset_rows = preview_info.get("total_rows", len(rows))
|
|
833
|
+
is_complete = preview_info.get("is_complete", False)
|
|
834
|
+
else:
|
|
835
|
+
# Fallback: assume all rows are shown
|
|
836
|
+
is_complete = True
|
|
837
|
+
|
|
838
|
+
# Add rows with optional separator
|
|
839
|
+
for i, row in enumerate(rows[:max_rows]):
|
|
840
|
+
try:
|
|
841
|
+
# Add separator between head and tail rows
|
|
842
|
+
if (
|
|
843
|
+
not is_complete
|
|
844
|
+
and head_rows_count > 0
|
|
845
|
+
and tail_rows_count > 0
|
|
846
|
+
and i == head_rows_count
|
|
847
|
+
):
|
|
848
|
+
# Add a visual separator row with dashes
|
|
849
|
+
separator_row = [
|
|
850
|
+
"─" * 3 if col != "_row_num_" else "⋮"
|
|
851
|
+
for col in (
|
|
852
|
+
display_columns if "display_columns" in locals() else columns
|
|
853
|
+
)
|
|
854
|
+
]
|
|
855
|
+
rich_table.add_row(*separator_row, style="dim")
|
|
856
|
+
|
|
857
|
+
rich_table.add_row(*row)
|
|
858
|
+
except Exception as e: # pragma: no cover
|
|
859
|
+
# If there's an issue with row data, show error
|
|
860
|
+
rich_table.add_row(*[f"Error: {e}" for _ in columns]) # pragma: no cover
|
|
861
|
+
break # pragma: no cover
|
|
862
|
+
|
|
863
|
+
# Show the table
|
|
864
|
+
console.print()
|
|
865
|
+
console.print(rich_table)
|
|
866
|
+
|
|
867
|
+
# Show summary info (conditionally)
|
|
868
|
+
if show_summary:
|
|
869
|
+
total_rows = len(rows)
|
|
870
|
+
|
|
871
|
+
# Use preview info if available, otherwise fall back to old logic
|
|
872
|
+
if preview_info:
|
|
873
|
+
total_dataset_rows = preview_info.get("total_rows", total_rows)
|
|
874
|
+
head_rows = preview_info.get("head_rows", 0)
|
|
875
|
+
tail_rows = preview_info.get("tail_rows", 0)
|
|
876
|
+
is_complete = preview_info.get("is_complete", False)
|
|
877
|
+
|
|
878
|
+
if is_complete:
|
|
879
|
+
console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
|
|
880
|
+
elif head_rows > 0 and tail_rows > 0:
|
|
881
|
+
console.print(
|
|
882
|
+
f"\n[dim]Showing first {head_rows} and last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
|
|
883
|
+
)
|
|
884
|
+
elif head_rows > 0:
|
|
885
|
+
console.print(
|
|
886
|
+
f"\n[dim]Showing first {head_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
|
|
887
|
+
)
|
|
888
|
+
elif tail_rows > 0:
|
|
889
|
+
console.print(
|
|
890
|
+
f"\n[dim]Showing last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
|
|
891
|
+
)
|
|
892
|
+
else:
|
|
893
|
+
# Fallback for other cases
|
|
894
|
+
console.print(
|
|
895
|
+
f"\n[dim]Showing {total_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
|
|
896
|
+
)
|
|
897
|
+
else:
|
|
898
|
+
# Original logic as fallback
|
|
899
|
+
max_rows = 50 # This should match the limit used above
|
|
900
|
+
if total_rows > max_rows:
|
|
901
|
+
console.print(
|
|
902
|
+
f"\n[dim]Showing first {max_rows} of {total_rows} rows. Use --output-html to see all data.[/dim]"
|
|
903
|
+
)
|
|
904
|
+
else:
|
|
905
|
+
console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
|
|
906
|
+
|
|
907
|
+
else:
|
|
908
|
+
# If we can't extract data, show the success message
|
|
909
|
+
console.print(
|
|
910
|
+
Panel(
|
|
911
|
+
"[green]✓[/green] Table rendered successfully. "
|
|
912
|
+
"Use --output-html to save the full interactive report.",
|
|
913
|
+
title="Table Preview",
|
|
914
|
+
border_style="green",
|
|
915
|
+
)
|
|
916
|
+
)
|
|
917
|
+
|
|
918
|
+
except Exception as e: # pragma: no cover
|
|
919
|
+
console.print(f"[red]Error rendering table:[/red] {e}")
|
|
920
|
+
console.print(
|
|
921
|
+
f"[dim]GT table type: {type(gt_table) if 'gt_table' in locals() else 'undefined'}[/dim]"
|
|
922
|
+
)
|
|
923
|
+
|
|
924
|
+
# Fallback: show the success message
|
|
925
|
+
console.print(
|
|
926
|
+
Panel(
|
|
927
|
+
"[green]✓[/green] Table rendered successfully. "
|
|
928
|
+
"Use --output-html to save the full interactive report.",
|
|
929
|
+
title="Table Preview",
|
|
930
|
+
border_style="green",
|
|
931
|
+
)
|
|
932
|
+
)
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
def _display_validation_summary(validation: Any) -> None:
|
|
936
|
+
"""Display a validation summary in a Rich table format."""
|
|
937
|
+
try:
|
|
938
|
+
# Try to get the summary from the validation report
|
|
939
|
+
if hasattr(validation, "validation_info") and validation.validation_info is not None:
|
|
940
|
+
# Use the validation_info to create a summary
|
|
941
|
+
info = validation.validation_info
|
|
942
|
+
n_steps = len(info)
|
|
943
|
+
n_passed = sum(1 for step in info if step.all_passed)
|
|
944
|
+
n_failed = n_steps - n_passed
|
|
945
|
+
|
|
946
|
+
# Calculate severity counts
|
|
947
|
+
n_warning = sum(1 for step in info if step.warning)
|
|
948
|
+
n_error = sum(1 for step in info if step.error)
|
|
949
|
+
n_critical = sum(1 for step in info if step.critical)
|
|
950
|
+
|
|
951
|
+
all_passed = n_failed == 0
|
|
952
|
+
|
|
953
|
+
# Determine highest severity
|
|
954
|
+
if n_critical > 0:
|
|
955
|
+
highest_severity = "critical"
|
|
956
|
+
elif n_error > 0:
|
|
957
|
+
highest_severity = "error"
|
|
958
|
+
elif n_warning > 0:
|
|
959
|
+
highest_severity = "warning"
|
|
960
|
+
elif n_failed > 0:
|
|
961
|
+
highest_severity = "some failing"
|
|
962
|
+
else:
|
|
963
|
+
highest_severity = "all passed"
|
|
964
|
+
|
|
965
|
+
# Create a summary table
|
|
966
|
+
table = Table(title="Validation Summary", show_header=True, header_style="bold magenta")
|
|
967
|
+
table.add_column("Metric", style="cyan", no_wrap=True)
|
|
968
|
+
table.add_column("Value", style="green")
|
|
969
|
+
|
|
970
|
+
# Add summary statistics
|
|
971
|
+
table.add_row("Total Steps", str(n_steps))
|
|
972
|
+
table.add_row("Passing Steps", str(n_passed))
|
|
973
|
+
table.add_row("Failing Steps", str(n_failed))
|
|
974
|
+
table.add_row("Warning Steps", str(n_warning))
|
|
975
|
+
table.add_row("Error Steps", str(n_error))
|
|
976
|
+
table.add_row("Critical Steps", str(n_critical))
|
|
977
|
+
table.add_row("All Passed", str(all_passed))
|
|
978
|
+
table.add_row("Highest Severity", highest_severity)
|
|
979
|
+
|
|
980
|
+
console.print(table)
|
|
981
|
+
|
|
982
|
+
# Display step details
|
|
983
|
+
if n_steps > 0:
|
|
984
|
+
steps_table = Table(
|
|
985
|
+
title="Validation Steps", show_header=True, header_style="bold cyan"
|
|
986
|
+
)
|
|
987
|
+
steps_table.add_column("Step", style="dim")
|
|
988
|
+
steps_table.add_column("Type", style="white")
|
|
989
|
+
steps_table.add_column("Column", style="cyan")
|
|
990
|
+
steps_table.add_column("Status", style="white")
|
|
991
|
+
steps_table.add_column("Passed/Total", style="green")
|
|
992
|
+
|
|
993
|
+
for step in info:
|
|
994
|
+
status_icon = "✓" if step.all_passed else "✗"
|
|
995
|
+
status_color = "green" if step.all_passed else "red"
|
|
996
|
+
|
|
997
|
+
severity = ""
|
|
998
|
+
if step.critical:
|
|
999
|
+
severity = " [red](CRITICAL)[/red]"
|
|
1000
|
+
elif step.error:
|
|
1001
|
+
severity = " [red](ERROR)[/red]"
|
|
1002
|
+
elif step.warning:
|
|
1003
|
+
severity = " [yellow](WARNING)[/yellow]"
|
|
1004
|
+
|
|
1005
|
+
steps_table.add_row(
|
|
1006
|
+
str(step.i),
|
|
1007
|
+
step.assertion_type,
|
|
1008
|
+
str(step.column) if step.column else "—",
|
|
1009
|
+
f"[{status_color}]{status_icon}[/{status_color}]{severity}",
|
|
1010
|
+
f"{step.n_passed}/{step.n}",
|
|
1011
|
+
)
|
|
1012
|
+
|
|
1013
|
+
console.print(steps_table)
|
|
1014
|
+
|
|
1015
|
+
# Display status with appropriate color
|
|
1016
|
+
if highest_severity == "all passed":
|
|
1017
|
+
console.print(
|
|
1018
|
+
Panel("[green]✓ All validations passed![/green]", border_style="green")
|
|
1019
|
+
)
|
|
1020
|
+
elif highest_severity == "some failing":
|
|
1021
|
+
console.print(
|
|
1022
|
+
Panel("[yellow]⚠ Some validations failed[/yellow]", border_style="yellow")
|
|
1023
|
+
)
|
|
1024
|
+
elif highest_severity in ["warning", "error", "critical"]:
|
|
1025
|
+
color = "yellow" if highest_severity == "warning" else "red"
|
|
1026
|
+
console.print(
|
|
1027
|
+
Panel(
|
|
1028
|
+
f"[{color}]✗ Validation failed with {highest_severity} severity[/{color}]",
|
|
1029
|
+
border_style=color,
|
|
1030
|
+
)
|
|
1031
|
+
)
|
|
1032
|
+
else:
|
|
1033
|
+
console.print("[yellow]Validation object does not contain validation results.[/yellow]")
|
|
1034
|
+
|
|
1035
|
+
except Exception as e: # pragma: no cover
|
|
1036
|
+
console.print(f"[red]Error displaying validation summary:[/red] {e}")
|
|
1037
|
+
import traceback # pragma: no cover
|
|
1038
|
+
|
|
1039
|
+
console.print(f"[dim]{traceback.format_exc()}[/dim]") # pragma: no cover
|
|
1040
|
+
|
|
1041
|
+
|
|
1042
|
+
@click.group(cls=OrderedGroup)
|
|
1043
|
+
@click.version_option(version=pb.__version__, prog_name="pb")
|
|
1044
|
+
def cli():
|
|
1045
|
+
"""
|
|
1046
|
+
Pointblank CLI - Data validation and quality tools for data engineers.
|
|
1047
|
+
|
|
1048
|
+
Use this CLI to run validation scripts, preview tables, and generate reports
|
|
1049
|
+
directly from the command line.
|
|
1050
|
+
"""
|
|
1051
|
+
pass
|
|
1052
|
+
|
|
1053
|
+
|
|
1054
|
+
@cli.command()
|
|
1055
|
+
@click.argument("data_source", type=str)
|
|
1056
|
+
def info(data_source: str):
|
|
1057
|
+
"""
|
|
1058
|
+
Display information about a data source.
|
|
1059
|
+
|
|
1060
|
+
Shows table type, dimensions, column names, and data types.
|
|
1061
|
+
|
|
1062
|
+
DATA_SOURCE can be:
|
|
1063
|
+
|
|
1064
|
+
\b
|
|
1065
|
+
- CSV file path (e.g., data.csv)
|
|
1066
|
+
- Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
|
|
1067
|
+
- GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
|
|
1068
|
+
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1069
|
+
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1070
|
+
"""
|
|
1071
|
+
try:
|
|
1072
|
+
with console.status("[bold green]Loading data..."):
|
|
1073
|
+
# Load the data source using the centralized function
|
|
1074
|
+
data = _load_data_source(data_source)
|
|
1075
|
+
|
|
1076
|
+
# Get table information
|
|
1077
|
+
tbl_type = _get_tbl_type(data)
|
|
1078
|
+
row_count = pb.get_row_count(data)
|
|
1079
|
+
col_count = pb.get_column_count(data)
|
|
1080
|
+
|
|
1081
|
+
# Import the box style
|
|
1082
|
+
from rich.box import SIMPLE_HEAD
|
|
1083
|
+
|
|
1084
|
+
# Create info table
|
|
1085
|
+
info_table = Table(
|
|
1086
|
+
title="Data Source Information",
|
|
1087
|
+
show_header=True,
|
|
1088
|
+
header_style="bold magenta",
|
|
1089
|
+
box=SIMPLE_HEAD,
|
|
1090
|
+
title_style="bold cyan",
|
|
1091
|
+
title_justify="left",
|
|
1092
|
+
)
|
|
1093
|
+
info_table.add_column("Property", style="cyan", no_wrap=True)
|
|
1094
|
+
info_table.add_column("Value", style="green")
|
|
1095
|
+
|
|
1096
|
+
info_table.add_row("Source", data_source)
|
|
1097
|
+
info_table.add_row("Table Type", tbl_type)
|
|
1098
|
+
info_table.add_row("Rows", f"{row_count:,}")
|
|
1099
|
+
info_table.add_row("Columns", f"{col_count:,}")
|
|
1100
|
+
|
|
1101
|
+
console.print()
|
|
1102
|
+
console.print(info_table)
|
|
1103
|
+
|
|
1104
|
+
except Exception as e:
|
|
1105
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1106
|
+
sys.exit(1)
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
@cli.command()
|
|
1110
|
+
@click.argument("data_source", type=str)
|
|
1111
|
+
@click.option("--columns", "-c", help="Comma-separated list of columns to display")
|
|
1112
|
+
@click.option("--col-range", help="Column range like '1:10' or '5:' or ':15' (1-based indexing)")
|
|
1113
|
+
@click.option("--col-first", type=int, help="Show first N columns")
|
|
1114
|
+
@click.option("--col-last", type=int, help="Show last N columns")
|
|
1115
|
+
@click.option("--head", "-h", default=5, help="Number of rows from the top (default: 5)")
|
|
1116
|
+
@click.option("--tail", "-t", default=5, help="Number of rows from the bottom (default: 5)")
|
|
1117
|
+
@click.option("--limit", "-l", default=50, help="Maximum total rows to display (default: 50)")
|
|
1118
|
+
@click.option("--no-row-numbers", is_flag=True, help="Hide row numbers")
|
|
1119
|
+
@click.option("--max-col-width", default=250, help="Maximum column width in pixels (default: 250)")
|
|
1120
|
+
@click.option("--min-table-width", default=500, help="Minimum table width in pixels (default: 500)")
|
|
1121
|
+
@click.option("--no-header", is_flag=True, help="Hide table header")
|
|
1122
|
+
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
|
|
1123
|
+
def preview(
|
|
1124
|
+
data_source: str,
|
|
1125
|
+
columns: str | None,
|
|
1126
|
+
col_range: str | None,
|
|
1127
|
+
col_first: int | None,
|
|
1128
|
+
col_last: int | None,
|
|
1129
|
+
head: int,
|
|
1130
|
+
tail: int,
|
|
1131
|
+
limit: int,
|
|
1132
|
+
no_row_numbers: bool,
|
|
1133
|
+
max_col_width: int,
|
|
1134
|
+
min_table_width: int,
|
|
1135
|
+
no_header: bool,
|
|
1136
|
+
output_html: str | None,
|
|
1137
|
+
):
|
|
1138
|
+
"""
|
|
1139
|
+
Preview a data table showing head and tail rows.
|
|
1140
|
+
|
|
1141
|
+
DATA_SOURCE can be:
|
|
1142
|
+
|
|
1143
|
+
\b
|
|
1144
|
+
- CSV file path (e.g., data.csv)
|
|
1145
|
+
- Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
|
|
1146
|
+
- GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
|
|
1147
|
+
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1148
|
+
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1149
|
+
|
|
1150
|
+
COLUMN SELECTION OPTIONS:
|
|
1151
|
+
|
|
1152
|
+
For tables with many columns, use these options to control which columns are displayed:
|
|
1153
|
+
|
|
1154
|
+
\b
|
|
1155
|
+
- --columns: Specify exact columns (e.g., --columns "name,age,email")
|
|
1156
|
+
- --col-range: Select column range (e.g., --col-range "1:10", --col-range "5:", --col-range ":15")
|
|
1157
|
+
- --col-first: Show first N columns (e.g., --col-first 5)
|
|
1158
|
+
- --col-last: Show last N columns (e.g., --col-last 3)
|
|
1159
|
+
|
|
1160
|
+
Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
|
|
1161
|
+
"""
|
|
1162
|
+
try:
|
|
1163
|
+
with console.status("[bold green]Loading data..."):
|
|
1164
|
+
# Load the data source using the centralized function
|
|
1165
|
+
data = _load_data_source(data_source)
|
|
1166
|
+
|
|
1167
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1168
|
+
|
|
1169
|
+
# Parse columns if provided
|
|
1170
|
+
columns_list = None
|
|
1171
|
+
if columns:
|
|
1172
|
+
columns_list = [col.strip() for col in columns.split(",")]
|
|
1173
|
+
|
|
1174
|
+
# If data has _row_num_ and it's not explicitly included, add it at the beginning
|
|
1175
|
+
try:
|
|
1176
|
+
# Data is already processed, just use it directly
|
|
1177
|
+
processed_data = data
|
|
1178
|
+
|
|
1179
|
+
# Get column names from the processed data
|
|
1180
|
+
all_columns = []
|
|
1181
|
+
if hasattr(processed_data, "columns"):
|
|
1182
|
+
all_columns = list(processed_data.columns)
|
|
1183
|
+
elif hasattr(processed_data, "schema"):
|
|
1184
|
+
all_columns = list(processed_data.schema.names)
|
|
1185
|
+
|
|
1186
|
+
# If _row_num_ exists in data but not in user selection, add it at beginning
|
|
1187
|
+
if all_columns and "_row_num_" in all_columns and "_row_num_" not in columns_list:
|
|
1188
|
+
columns_list = ["_row_num_"] + columns_list
|
|
1189
|
+
except Exception: # pragma: no cover
|
|
1190
|
+
# If we can't process the data, just use the user's column list as-is
|
|
1191
|
+
pass
|
|
1192
|
+
elif col_range or col_first or col_last:
|
|
1193
|
+
# Need to get column names to apply range/first/last selection
|
|
1194
|
+
# Data is already processed, just use it directly
|
|
1195
|
+
processed_data = data
|
|
1196
|
+
|
|
1197
|
+
# Get column names from the processed data
|
|
1198
|
+
all_columns = []
|
|
1199
|
+
if hasattr(processed_data, "columns"):
|
|
1200
|
+
all_columns = list(processed_data.columns)
|
|
1201
|
+
elif hasattr(processed_data, "schema"):
|
|
1202
|
+
all_columns = list(processed_data.schema.names)
|
|
1203
|
+
else:
|
|
1204
|
+
console.print(
|
|
1205
|
+
"[yellow]Warning: Could not determine column names for range selection[/yellow]"
|
|
1206
|
+
)
|
|
1207
|
+
|
|
1208
|
+
if all_columns:
|
|
1209
|
+
# Check if _row_num_ exists and preserve it
|
|
1210
|
+
has_row_num = "_row_num_" in all_columns
|
|
1211
|
+
|
|
1212
|
+
if col_range:
|
|
1213
|
+
# Parse range like "1:10", "5:", ":15"
|
|
1214
|
+
if ":" in col_range:
|
|
1215
|
+
parts = col_range.split(":")
|
|
1216
|
+
start_idx = int(parts[0]) - 1 if parts[0] else 0 # Convert to 0-based
|
|
1217
|
+
end_idx = int(parts[1]) if parts[1] else len(all_columns)
|
|
1218
|
+
|
|
1219
|
+
# Filter out _row_num_ from the range selection, we'll add it back later
|
|
1220
|
+
columns_for_range = [col for col in all_columns if col != "_row_num_"]
|
|
1221
|
+
selected_columns = columns_for_range[start_idx:end_idx]
|
|
1222
|
+
|
|
1223
|
+
# Always include _row_num_ at the beginning if it exists
|
|
1224
|
+
if has_row_num:
|
|
1225
|
+
columns_list = ["_row_num_"] + selected_columns
|
|
1226
|
+
else:
|
|
1227
|
+
columns_list = selected_columns
|
|
1228
|
+
else:
|
|
1229
|
+
console.print(
|
|
1230
|
+
"[yellow]Warning: Invalid range format. Use 'start:end' format[/yellow]"
|
|
1231
|
+
)
|
|
1232
|
+
elif col_first:
|
|
1233
|
+
# Filter out _row_num_ from the first N selection, we'll add it back later
|
|
1234
|
+
columns_for_first = [col for col in all_columns if col != "_row_num_"]
|
|
1235
|
+
selected_columns = columns_for_first[:col_first]
|
|
1236
|
+
|
|
1237
|
+
# Always include _row_num_ at the beginning if it exists
|
|
1238
|
+
if has_row_num:
|
|
1239
|
+
columns_list = ["_row_num_"] + selected_columns
|
|
1240
|
+
else:
|
|
1241
|
+
columns_list = selected_columns
|
|
1242
|
+
elif col_last:
|
|
1243
|
+
# Filter out _row_num_ from the last N selection, we'll add it back later
|
|
1244
|
+
columns_for_last = [col for col in all_columns if col != "_row_num_"]
|
|
1245
|
+
selected_columns = columns_for_last[-col_last:]
|
|
1246
|
+
|
|
1247
|
+
# Always include _row_num_ at the beginning if it exists
|
|
1248
|
+
if has_row_num:
|
|
1249
|
+
columns_list = ["_row_num_"] + selected_columns
|
|
1250
|
+
else:
|
|
1251
|
+
columns_list = selected_columns
|
|
1252
|
+
|
|
1253
|
+
# Generate preview
|
|
1254
|
+
with console.status("[bold green]Generating preview..."):
|
|
1255
|
+
# Get total dataset size before preview and gather metadata
|
|
1256
|
+
try:
|
|
1257
|
+
# Data is already processed, just use it directly
|
|
1258
|
+
processed_data = data
|
|
1259
|
+
|
|
1260
|
+
total_dataset_rows = pb.get_row_count(processed_data)
|
|
1261
|
+
total_dataset_columns = pb.get_column_count(processed_data)
|
|
1262
|
+
|
|
1263
|
+
# Determine source type and table type for enhanced preview title
|
|
1264
|
+
if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
1265
|
+
source_type = f"Pointblank dataset: {data_source}"
|
|
1266
|
+
else:
|
|
1267
|
+
source_type = f"External source: {data_source}"
|
|
1268
|
+
|
|
1269
|
+
table_type = _get_tbl_type(processed_data)
|
|
1270
|
+
except Exception:
|
|
1271
|
+
# If we can't get metadata, set defaults
|
|
1272
|
+
total_dataset_rows = None
|
|
1273
|
+
total_dataset_columns = None
|
|
1274
|
+
source_type = f"Data source: {data_source}"
|
|
1275
|
+
table_type = "unknown"
|
|
1276
|
+
|
|
1277
|
+
gt_table = pb.preview(
|
|
1278
|
+
data=data,
|
|
1279
|
+
columns_subset=columns_list,
|
|
1280
|
+
n_head=head,
|
|
1281
|
+
n_tail=tail,
|
|
1282
|
+
limit=limit,
|
|
1283
|
+
show_row_numbers=not no_row_numbers,
|
|
1284
|
+
max_col_width=max_col_width,
|
|
1285
|
+
min_tbl_width=min_table_width,
|
|
1286
|
+
incl_header=not no_header,
|
|
1287
|
+
)
|
|
1288
|
+
|
|
1289
|
+
if output_html:
|
|
1290
|
+
# Save HTML to file
|
|
1291
|
+
html_content = gt_table.as_raw_html()
|
|
1292
|
+
Path(output_html).write_text(html_content, encoding="utf-8")
|
|
1293
|
+
console.print(f"[green]✓[/green] HTML saved to: {output_html}")
|
|
1294
|
+
else:
|
|
1295
|
+
# Display in terminal with preview context info
|
|
1296
|
+
preview_info = None
|
|
1297
|
+
if total_dataset_rows is not None:
|
|
1298
|
+
# Determine if we're showing the complete dataset
|
|
1299
|
+
expected_rows = min(head + tail, limit, total_dataset_rows)
|
|
1300
|
+
is_complete = total_dataset_rows <= expected_rows
|
|
1301
|
+
|
|
1302
|
+
preview_info = {
|
|
1303
|
+
"total_rows": total_dataset_rows,
|
|
1304
|
+
"total_columns": total_dataset_columns,
|
|
1305
|
+
"head_rows": head,
|
|
1306
|
+
"tail_rows": tail,
|
|
1307
|
+
"is_complete": is_complete,
|
|
1308
|
+
"source_type": source_type,
|
|
1309
|
+
"table_type": table_type,
|
|
1310
|
+
}
|
|
1311
|
+
|
|
1312
|
+
_rich_print_gt_table(gt_table, preview_info)
|
|
1313
|
+
|
|
1314
|
+
except Exception as e: # pragma: no cover
|
|
1315
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1316
|
+
sys.exit(1) # pragma: no cover
|
|
1317
|
+
|
|
1318
|
+
|
|
1319
|
+
@cli.command()
@click.argument("data_source", type=str)
@click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
@click.option("--columns", "-c", help="Comma-separated list of columns to scan")
def scan(
    data_source: str,
    output_html: str | None,
    columns: str | None,
):
    """
    Generate a data scan profile report.

    Produces a comprehensive data profile including:

    \b
    - Column types and distributions
    - Missing value patterns
    - Basic statistics
    - Data quality indicators

    DATA_SOURCE can be:

    \b
    - CSV file path (e.g., data.csv)
    - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
    - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
    - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
    - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
    """
    try:
        import time

        start_time = time.time()

        with console.status("[bold green]Loading data..."):
            # Load the data source using the centralized function
            data = _load_data_source(data_source)

        console.print(f"[green]✓[/green] Loaded data source: {data_source}")

        # Parse columns if provided
        columns_list = None
        if columns:
            columns_list = [col.strip() for col in columns.split(",")]

        # Generate data scan
        with console.status("[bold green]Generating data scan..."):
            # Use col_summary_tbl for comprehensive column scanning
            # Data is already processed by _load_data_source
            scan_result = pb.col_summary_tbl(data=data)

        if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
            source_type = f"Pointblank dataset: {data_source}"
        else:
            source_type = f"External source: {data_source}"

        table_type = _get_tbl_type(data)
        # Get row count and column count for header
        try:
            total_rows = pb.get_row_count(data)
            total_columns = pb.get_column_count(data)
        except Exception:
            total_rows = None
            total_columns = None

        scan_time = time.time() - start_time

        if output_html:
            # Save HTML to file
            try:
                html_content = scan_result.as_raw_html()
                Path(output_html).write_text(html_content, encoding="utf-8")
                console.print(f"[green]✓[/green] Data scan report saved to: {output_html}")
            except Exception as e:
                console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
        else:
            # Display rich scan table in terminal
            console.print(f"[green]✓[/green] Data scan completed in {scan_time:.2f}s")
            console.print("Use --output-html to save the full interactive scan report.")

            # Display detailed column summary using rich formatting
            try:
                _rich_print_scan_table(
                    scan_result, data_source, source_type, table_type, total_rows, total_columns
                )

            except Exception as e:
                console.print(f"[yellow]Could not display scan summary: {e}[/yellow]")

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)


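# Editorial note, not part of the pointblank source: illustrative shell usage
# of the scan command above (the flags are the ones declared in its options;
# `pb` is the console entry point installed with this package).
#
#   pb scan small_table
#   pb scan data.csv --columns "a,b" --output-html scan_report.html
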
@cli.command()
@click.argument("data_source", type=str)
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
def missing(data_source: str, output_html: str | None):
    """
    Generate a missing values report for a data table.

    DATA_SOURCE can be:

    \b
    - CSV file path (e.g., data.csv)
    - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
    - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
    - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
    - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
    """
    try:
        with console.status("[bold green]Loading data..."):
            # Load the data source using the centralized function
            data = _load_data_source(data_source)

        console.print(f"[green]✓[/green] Loaded data source: {data_source}")

        # Generate missing values table
        with console.status("[bold green]Analyzing missing values..."):
            gt_table = pb.missing_vals_tbl(data)

        # Data is already processed, just use it directly
        original_data = data

        if output_html:
            # Save HTML to file
            html_content = gt_table.as_raw_html()
            Path(output_html).write_text(html_content, encoding="utf-8")
            console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
        else:
            # Display in terminal with special missing values formatting
            _rich_print_missing_table(gt_table, original_data)

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)


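# Editorial note, not part of the pointblank source: illustrative shell usage
# of the missing command above.
#
#   pb missing small_table
#   pb missing data.parquet --output-html missing_report.html
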
@cli.command(name="validate")
|
|
1458
|
+
@click.argument("data_source", type=str)
|
|
1459
|
+
@click.option(
|
|
1460
|
+
"--check",
|
|
1461
|
+
"checks", # Changed to collect multiple values
|
|
1462
|
+
type=click.Choice(
|
|
1463
|
+
[
|
|
1464
|
+
"rows-distinct",
|
|
1465
|
+
"col-vals-not-null",
|
|
1466
|
+
"rows-complete",
|
|
1467
|
+
"col-exists",
|
|
1468
|
+
"col-vals-in-set",
|
|
1469
|
+
"col-vals-gt",
|
|
1470
|
+
"col-vals-ge",
|
|
1471
|
+
"col-vals-lt",
|
|
1472
|
+
"col-vals-le",
|
|
1473
|
+
]
|
|
1474
|
+
),
|
|
1475
|
+
multiple=True, # Allow multiple --check options
|
|
1476
|
+
help="Type of validation check to perform. Can be used multiple times for multiple checks.",
|
|
1477
|
+
)
|
|
1478
|
+
@click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
|
|
1479
|
+
@click.option(
|
|
1480
|
+
"--column",
|
|
1481
|
+
"columns", # Changed to collect multiple values
|
|
1482
|
+
multiple=True, # Allow multiple --column options
|
|
1483
|
+
help="Column name or integer position as #N (1-based index) for validation.",
|
|
1484
|
+
)
|
|
1485
|
+
@click.option(
|
|
1486
|
+
"--set",
|
|
1487
|
+
"sets", # Changed to collect multiple values
|
|
1488
|
+
multiple=True, # Allow multiple --set options
|
|
1489
|
+
help="Comma-separated allowed values for col-vals-in-set checks.",
|
|
1490
|
+
)
|
|
1491
|
+
@click.option(
|
|
1492
|
+
"--value",
|
|
1493
|
+
"values", # Changed to collect multiple values
|
|
1494
|
+
type=float,
|
|
1495
|
+
multiple=True, # Allow multiple --value options
|
|
1496
|
+
help="Numeric value for comparison checks.",
|
|
1497
|
+
)
|
|
1498
|
+
@click.option(
|
|
1499
|
+
"--show-extract", is_flag=True, help="Show extract of failing rows if validation fails"
|
|
1500
|
+
)
|
|
1501
|
+
@click.option(
|
|
1502
|
+
"--write-extract", type=str, help="Save failing rows to folder. Provide base name for folder."
|
|
1503
|
+
)
|
|
1504
|
+
@click.option(
|
|
1505
|
+
"--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
|
|
1506
|
+
)
|
|
1507
|
+
@click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
|
|
1508
|
+
@click.pass_context
|
|
1509
|
+
def validate(
|
|
1510
|
+
ctx: click.Context,
|
|
1511
|
+
data_source: str,
|
|
1512
|
+
checks: tuple[str, ...], # Changed to tuple
|
|
1513
|
+
columns: tuple[str, ...], # Changed to tuple
|
|
1514
|
+
sets: tuple[str, ...], # Changed to tuple
|
|
1515
|
+
values: tuple[float, ...], # Changed to tuple
|
|
1516
|
+
show_extract: bool,
|
|
1517
|
+
write_extract: str | None,
|
|
1518
|
+
limit: int,
|
|
1519
|
+
exit_code: bool,
|
|
1520
|
+
list_checks: bool,
|
|
1521
|
+
):
|
|
1522
|
+
"""
|
|
1523
|
+
Perform single or multiple data validations.
|
|
1524
|
+
|
|
1525
|
+
Run one or more validation checks on your data in a single command.
|
|
1526
|
+
Use multiple --check options to perform multiple validations.
|
|
1527
|
+
|
|
1528
|
+
DATA_SOURCE can be:
|
|
1529
|
+
|
|
1530
|
+
\b
|
|
1531
|
+
- CSV file path (e.g., data.csv)
|
|
1532
|
+
- Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
|
|
1533
|
+
- GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
|
|
1534
|
+
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1535
|
+
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1536
|
+
|
|
1537
|
+
AVAILABLE CHECKS:
|
|
1538
|
+
|
|
1539
|
+
Use --list-checks to see all available validation methods with examples.
|
|
1540
|
+
|
|
1541
|
+
The default check is 'rows-distinct' which checks for duplicate rows.
|
|
1542
|
+
|
|
1543
|
+
\b
|
|
1544
|
+
- rows-distinct: Check if all rows in the dataset are unique (no duplicates)
|
|
1545
|
+
- rows-complete: Check if all rows are complete (no missing values in any column)
|
|
1546
|
+
- col-exists: Check if a specific column exists in the dataset (requires --column)
|
|
1547
|
+
- col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
|
|
1548
|
+
- col-vals-gt: Check if all values in a column are greater than a threshold (requires --column and --value)
|
|
1549
|
+
- col-vals-ge: Check if all values in a column are greater than or equal to a threshold (requires --column and --value)
|
|
1550
|
+
- col-vals-lt: Check if all values in a column are less than a threshold (requires --column and --value)
|
|
1551
|
+
- col-vals-le: Check if all values in a column are less than or equal to a threshold (requires --column and --value)
|
|
1552
|
+
- col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
|
|
1553
|
+
|
|
1554
|
+
Examples:
|
|
1555
|
+
|
|
1556
|
+
\b
|
|
1557
|
+
pb validate data.csv # Uses default validation (rows-distinct)
|
|
1558
|
+
pb validate data.csv --list-checks # Show all available checks
|
|
1559
|
+
pb validate data.csv --check rows-distinct
|
|
1560
|
+
pb validate data.csv --check rows-distinct --show-extract
|
|
1561
|
+
pb validate data.csv --check rows-distinct --write-extract failing_rows_folder
|
|
1562
|
+
pb validate data.csv --check rows-distinct --exit-code
|
|
1563
|
+
pb validate data.csv --check rows-complete
|
|
1564
|
+
pb validate data.csv --check col-exists --column price
|
|
1565
|
+
pb validate data.csv --check col-vals-not-null --column email
|
|
1566
|
+
pb validate data.csv --check col-vals-gt --column score --value 50
|
|
1567
|
+
pb validate data.csv --check col-vals-in-set --column status --set "active,inactive,pending"
|
|
1568
|
+
|
|
1569
|
+
Multiple validations in one command:
|
|
1570
|
+
pb validate data.csv --check rows-distinct --check rows-complete
|
|
1571
|
+
pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
|
|
1572
|
+
"""
|
|
1573
|
+
try:
|
|
1574
|
+
# Handle backward compatibility and parameter conversion
|
|
1575
|
+
import sys
|
|
1576
|
+
|
|
1577
|
+
# Convert parameter tuples to lists, handling default case
|
|
1578
|
+
if not checks:
|
|
1579
|
+
# No --check options provided, use default
|
|
1580
|
+
checks_list = ["rows-distinct"]
|
|
1581
|
+
is_using_default_check = True
|
|
1582
|
+
else:
|
|
1583
|
+
checks_list = list(checks)
|
|
1584
|
+
is_using_default_check = False
|
|
1585
|
+
|
|
1586
|
+
columns_list = list(columns) if columns else []
|
|
1587
|
+
sets_list = list(sets) if sets else []
|
|
1588
|
+
values_list = list(values) if values else []
|
|
1589
|
+
|
|
1590
|
+
# Map parameters to checks intelligently
|
|
1591
|
+
mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
|
|
1592
|
+
checks_list, columns_list, sets_list, values_list
|
|
1593
|
+
)
|
|
1594
|
+
|
|
1595
|
+
# Handle --list-checks option
|
|
1596
|
+
if list_checks:
|
|
1597
|
+
console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
|
|
1598
|
+
console.print()
|
|
1599
|
+
console.print("[bold magenta]Basic checks:[/bold magenta]")
|
|
1600
|
+
console.print(
|
|
1601
|
+
" • [bold cyan]rows-distinct[/bold cyan] Check for duplicate rows [yellow](default)[/yellow]"
|
|
1602
|
+
)
|
|
1603
|
+
console.print(
|
|
1604
|
+
" • [bold cyan]rows-complete[/bold cyan] Check for missing values in any column"
|
|
1605
|
+
)
|
|
1606
|
+
console.print()
|
|
1607
|
+
console.print(
|
|
1608
|
+
"[bold magenta]Column-specific checks [bright_black](require --column)[/bright_black]:[/bold magenta]"
|
|
1609
|
+
)
|
|
1610
|
+
console.print(" • [bold cyan]col-exists[/bold cyan] Check if a column exists")
|
|
1611
|
+
console.print(
|
|
1612
|
+
" • [bold cyan]col-vals-not-null[/bold cyan] Check for null values in a column"
|
|
1613
|
+
)
|
|
1614
|
+
console.print()
|
|
1615
|
+
console.print(
|
|
1616
|
+
"[bold magenta]Value comparison checks [bright_black](require --column and --value)[/bright_black]:[/bold magenta]"
|
|
1617
|
+
)
|
|
1618
|
+
console.print(
|
|
1619
|
+
" • [bold cyan]col-vals-gt[/bold cyan] Values greater than threshold"
|
|
1620
|
+
)
|
|
1621
|
+
console.print(
|
|
1622
|
+
" • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to threshold"
|
|
1623
|
+
)
|
|
1624
|
+
console.print(" • [bold cyan]col-vals-lt[/bold cyan] Values less than threshold")
|
|
1625
|
+
console.print(
|
|
1626
|
+
" • [bold cyan]col-vals-le[/bold cyan] Values less than or equal to threshold"
|
|
1627
|
+
)
|
|
1628
|
+
console.print()
|
|
1629
|
+
console.print(
|
|
1630
|
+
"[bold magenta]Set validation check [bright_black](requires --column and --set)[/bright_black]:[/bold magenta]"
|
|
1631
|
+
)
|
|
1632
|
+
console.print(
|
|
1633
|
+
" • [bold cyan]col-vals-in-set[/bold cyan] Values must be in allowed set"
|
|
1634
|
+
)
|
|
1635
|
+
console.print()
|
|
1636
|
+
console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
|
|
1637
|
+
console.print(
|
|
1638
|
+
f" [bright_blue]pb validate {data_source} --check rows-distinct[/bright_blue]"
|
|
1639
|
+
)
|
|
1640
|
+
console.print(
|
|
1641
|
+
f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
|
|
1642
|
+
)
|
|
1643
|
+
console.print(
|
|
1644
|
+
f" [bright_blue]pb validate {data_source} --check col-vals-gt --column age --value 18[/bright_blue]"
|
|
1645
|
+
)
|
|
1646
|
+
import sys
|
|
1647
|
+
|
|
1648
|
+
sys.exit(0)
|
|
1649
|
+
|
|
1650
|
+
# Validate required parameters for different check types
|
|
1651
|
+
# Check parameters for each check in the list using mapped parameters
|
|
1652
|
+
for i, check in enumerate(checks_list):
|
|
1653
|
+
# Get corresponding mapped parameters for this check
|
|
1654
|
+
column = mapped_columns[i] if i < len(mapped_columns) else None
|
|
1655
|
+
set_val = mapped_sets[i] if i < len(mapped_sets) else None
|
|
1656
|
+
value = mapped_values[i] if i < len(mapped_values) else None
|
|
1657
|
+
|
|
1658
|
+
if check == "col-vals-not-null" and not column:
|
|
1659
|
+
console.print(f"[red]Error:[/red] --column is required for {check} check")
|
|
1660
|
+
console.print(
|
|
1661
|
+
"Example: pb validate data.csv --check col-vals-not-null --column email"
|
|
1662
|
+
)
|
|
1663
|
+
sys.exit(1)
|
|
1664
|
+
|
|
1665
|
+
if check == "col-exists" and not column:
|
|
1666
|
+
console.print(f"[red]Error:[/red] --column is required for {check} check")
|
|
1667
|
+
console.print("Example: pb validate data.csv --check col-exists --column price")
|
|
1668
|
+
sys.exit(1)
|
|
1669
|
+
|
|
1670
|
+
if check == "col-vals-in-set" and not column:
|
|
1671
|
+
console.print(f"[red]Error:[/red] --column is required for {check} check")
|
|
1672
|
+
console.print(
|
|
1673
|
+
"Example: pb validate data.csv --check col-vals-in-set --column status --set 'active,inactive'"
|
|
1674
|
+
)
|
|
1675
|
+
sys.exit(1)
|
|
1676
|
+
|
|
1677
|
+
if check == "col-vals-in-set" and not set_val:
|
|
1678
|
+
console.print(f"[red]Error:[/red] --set is required for {check} check")
|
|
1679
|
+
console.print(
|
|
1680
|
+
"Example: pb validate data.csv --check col-vals-in-set --column status --set 'active,inactive'"
|
|
1681
|
+
)
|
|
1682
|
+
sys.exit(1)
|
|
1683
|
+
|
|
1684
|
+
if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"] and not column:
|
|
1685
|
+
console.print(f"[red]Error:[/red] --column is required for {check} check")
|
|
1686
|
+
console.print(
|
|
1687
|
+
f"Example: pb validate data.csv --check {check} --column score --value 50"
|
|
1688
|
+
)
|
|
1689
|
+
sys.exit(1)
|
|
1690
|
+
|
|
1691
|
+
if (
|
|
1692
|
+
check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]
|
|
1693
|
+
and value is None
|
|
1694
|
+
):
|
|
1695
|
+
console.print(f"[red]Error:[/red] --value is required for {check} check")
|
|
1696
|
+
console.print(
|
|
1697
|
+
f"Example: pb validate data.csv --check {check} --column score --value 50"
|
|
1698
|
+
)
|
|
1699
|
+
sys.exit(1)
|
|
1700
|
+
|
|
1701
|
+
with console.status("[bold green]Loading data..."):
|
|
1702
|
+
# Load the data source using the centralized function
|
|
1703
|
+
data = _load_data_source(data_source)
|
|
1704
|
+
|
|
1705
|
+
# Get all column names for error reporting
|
|
1706
|
+
if hasattr(data, "columns"):
|
|
1707
|
+
all_columns = list(data.columns)
|
|
1708
|
+
elif hasattr(data, "schema"):
|
|
1709
|
+
all_columns = list(data.schema.names)
|
|
1710
|
+
else:
|
|
1711
|
+
all_columns = []
|
|
1712
|
+
|
|
1713
|
+
# Resolve any '#N' column references to actual column names
|
|
1714
|
+
columns_list = _resolve_column_indices(columns_list, data)
|
|
1715
|
+
|
|
1716
|
+
# Check for out-of-range #N columns and provide a helpful error
|
|
1717
|
+
for col in columns_list:
|
|
1718
|
+
if isinstance(col, str) and col.startswith("#"):
|
|
1719
|
+
try:
|
|
1720
|
+
idx = int(col[1:])
|
|
1721
|
+
if idx < 1 or idx > len(all_columns):
|
|
1722
|
+
console.print(
|
|
1723
|
+
f"[red]Error:[/red] There is no column {idx} (the column position "
|
|
1724
|
+
f"range is 1 to {len(all_columns)})"
|
|
1725
|
+
)
|
|
1726
|
+
sys.exit(1)
|
|
1727
|
+
except Exception:
|
|
1728
|
+
pass # Let later validation handle other errors
|
|
1729
|
+
|
|
1730
|
+
# Update mapped_columns to use resolved column names
|
|
1731
|
+
mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
|
|
1732
|
+
checks_list, columns_list, sets_list, values_list
|
|
1733
|
+
)
|
|
1734
|
+
|
|
1735
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1736
|
+
|
|
1737
|
+
# Build a single validation object with chained checks
|
|
1738
|
+
with console.status(f"[bold green]Running {len(checks_list)} validation check(s)..."):
|
|
1739
|
+
# Initialize validation object
|
|
1740
|
+
validation = pb.Validate(
|
|
1741
|
+
data=data,
|
|
1742
|
+
tbl_name=f"Data from {data_source}",
|
|
1743
|
+
label=f"CLI Validation: {', '.join(checks_list)}",
|
|
1744
|
+
)
|
|
1745
|
+
|
|
1746
|
+
# Add each check to the validation chain
|
|
1747
|
+
for i, check in enumerate(checks_list):
|
|
1748
|
+
# Get corresponding mapped parameters for this check
|
|
1749
|
+
column = mapped_columns[i] if i < len(mapped_columns) else None
|
|
1750
|
+
set_val = mapped_sets[i] if i < len(mapped_sets) else None
|
|
1751
|
+
value = mapped_values[i] if i < len(mapped_values) else None
|
|
1752
|
+
|
|
1753
|
+
if check == "rows-distinct":
|
|
1754
|
+
validation = validation.rows_distinct()
|
|
1755
|
+
elif check == "col-vals-not-null":
|
|
1756
|
+
validation = validation.col_vals_not_null(columns=column)
|
|
1757
|
+
elif check == "rows-complete":
|
|
1758
|
+
validation = validation.rows_complete()
|
|
1759
|
+
elif check == "col-exists":
|
|
1760
|
+
validation = validation.col_exists(columns=column)
|
|
1761
|
+
elif check == "col-vals-in-set":
|
|
1762
|
+
# Parse the comma-separated set values
|
|
1763
|
+
allowed_values = [v.strip() for v in set_val.split(",")]
|
|
1764
|
+
validation = validation.col_vals_in_set(columns=column, set=allowed_values)
|
|
1765
|
+
elif check == "col-vals-gt":
|
|
1766
|
+
validation = validation.col_vals_gt(columns=column, value=value)
|
|
1767
|
+
elif check == "col-vals-ge":
|
|
1768
|
+
validation = validation.col_vals_ge(columns=column, value=value)
|
|
1769
|
+
elif check == "col-vals-lt":
|
|
1770
|
+
validation = validation.col_vals_lt(columns=column, value=value)
|
|
1771
|
+
elif check == "col-vals-le":
|
|
1772
|
+
validation = validation.col_vals_le(columns=column, value=value)
|
|
1773
|
+
else:
|
|
1774
|
+
console.print(f"[red]Error:[/red] Unknown check type: {check}")
|
|
1775
|
+
sys.exit(1)
|
|
1776
|
+
|
|
1777
|
+
# Execute all validations
|
|
1778
|
+
validation = validation.interrogate()
|
|
1779
|
+
all_passed = validation.all_passed()
|
|
1780
|
+
|
|
1781
|
+
# Display completion message
|
|
1782
|
+
if len(checks_list) == 1:
|
|
1783
|
+
if is_using_default_check:
|
|
1784
|
+
console.print(
|
|
1785
|
+
f"[green]✓[/green] {checks_list[0]} validation completed [dim](default validation)[/dim]"
|
|
1786
|
+
)
|
|
1787
|
+
else:
|
|
1788
|
+
console.print(f"[green]✓[/green] {checks_list[0]} validation completed")
|
|
1789
|
+
else:
|
|
1790
|
+
console.print(f"[green]✓[/green] {len(checks_list)} validations completed")
|
|
1791
|
+
|
|
1792
|
+
# Display results based on whether we have single or multiple checks
|
|
1793
|
+
if len(checks_list) == 1:
|
|
1794
|
+
# Single check - use current display format
|
|
1795
|
+
_display_validation_result(
|
|
1796
|
+
validation,
|
|
1797
|
+
checks_list,
|
|
1798
|
+
mapped_columns,
|
|
1799
|
+
mapped_sets,
|
|
1800
|
+
mapped_values,
|
|
1801
|
+
data_source,
|
|
1802
|
+
0,
|
|
1803
|
+
1,
|
|
1804
|
+
show_extract,
|
|
1805
|
+
write_extract,
|
|
1806
|
+
limit,
|
|
1807
|
+
)
|
|
1808
|
+
else:
|
|
1809
|
+
# Multiple checks - use stacked display format
|
|
1810
|
+
any_failed = False
|
|
1811
|
+
for i in range(len(checks_list)):
|
|
1812
|
+
console.print() # Add spacing between results
|
|
1813
|
+
_display_validation_result(
|
|
1814
|
+
validation,
|
|
1815
|
+
checks_list,
|
|
1816
|
+
mapped_columns,
|
|
1817
|
+
mapped_sets,
|
|
1818
|
+
mapped_values,
|
|
1819
|
+
data_source,
|
|
1820
|
+
i,
|
|
1821
|
+
len(checks_list),
|
|
1822
|
+
show_extract,
|
|
1823
|
+
write_extract,
|
|
1824
|
+
limit,
|
|
1825
|
+
)
|
|
1826
|
+
|
|
1827
|
+
# Check if this validation failed
|
|
1828
|
+
if hasattr(validation, "validation_info") and len(validation.validation_info) > i:
|
|
1829
|
+
step_info = validation.validation_info[i]
|
|
1830
|
+
if step_info.n_failed > 0:
|
|
1831
|
+
any_failed = True
|
|
1832
|
+
|
|
1833
|
+
# Show tip about --show-extract if any failed and not already used
|
|
1834
|
+
if any_failed and not show_extract:
|
|
1835
|
+
console.print()
|
|
1836
|
+
console.print(
|
|
1837
|
+
"[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
|
|
1838
|
+
)
|
|
1839
|
+
|
|
1840
|
+
# Add informational hints when using default validation (only for single check)
|
|
1841
|
+
if len(checks_list) == 1 and is_using_default_check:
|
|
1842
|
+
console.print()
|
|
1843
|
+
console.print("[bold blue]ℹ️ Information:[/bold blue] Using default validation method")
|
|
1844
|
+
console.print("To specify a different validation, use the --check option.")
|
|
1845
|
+
console.print()
|
|
1846
|
+
console.print("[bold magenta]Common validation options:[/bold magenta]")
|
|
1847
|
+
console.print(
|
|
1848
|
+
" • [bold cyan]--check rows-complete[/bold cyan] Check for rows with missing values"
|
|
1849
|
+
)
|
|
1850
|
+
console.print(
|
|
1851
|
+
" • [bold cyan]--check col-vals-not-null[/bold cyan] Check for null values in a column [bright_black](requires --column)[/bright_black]"
|
|
1852
|
+
)
|
|
1853
|
+
console.print(
|
|
1854
|
+
" • [bold cyan]--check col-exists[/bold cyan] Check if a column exists [bright_black](requires --column)[/bright_black]"
|
|
1855
|
+
)
|
|
1856
|
+
console.print()
|
|
1857
|
+
console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
|
|
1858
|
+
console.print(
|
|
1859
|
+
f" [bright_blue]pb validate {data_source} --check rows-complete[/bright_blue]"
|
|
1860
|
+
)
|
|
1861
|
+
console.print(
|
|
1862
|
+
f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
|
|
1863
|
+
)
|
|
1864
|
+
|
|
1865
|
+
# Exit with appropriate code if requested
|
|
1866
|
+
if exit_code and not all_passed:
|
|
1867
|
+
console.print("[dim]Exiting with non-zero code due to validation failure[/dim]")
|
|
1868
|
+
import sys
|
|
1869
|
+
|
|
1870
|
+
sys.exit(1)
|
|
1871
|
+
|
|
1872
|
+
except Exception as e:
|
|
1873
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1874
|
+
sys.exit(1)
|
|
1875
|
+
|
|
1876
|
+
|
|
1877
|
+
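# Editorial note, not part of the pointblank source: the chained calls built
# up in the command above are roughly equivalent to this direct API sketch,
# assuming pb.load_dataset() and a "date" column in the built-in small_table
# dataset.
#
#   import pointblank as pb
#   validation = (
#       pb.Validate(data=pb.load_dataset("small_table"), tbl_name="small_table")
#       .rows_distinct()
#       .col_vals_not_null(columns="date")
#       .interrogate()
#   )
#   print(validation.all_passed())
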
@cli.command()
def datasets():
    """
    List available built-in datasets.
    """
    from rich.box import SIMPLE_HEAD

    datasets_info = [
        ("small_table", "13 rows × 8 columns", "Small demo dataset for testing"),
        ("game_revenue", "2,000 rows × 11 columns", "Game development company revenue data"),
        ("nycflights", "336,776 rows × 18 columns", "NYC airport flights data from 2013"),
        ("global_sales", "50,000 rows × 20 columns", "Global sales data across regions"),
    ]

    # Create the datasets table
    table = Table(
        title="Available Pointblank Datasets",
        show_header=True,
        header_style="bold magenta",
        box=SIMPLE_HEAD,
        title_style="bold cyan",
        title_justify="left",
    )

    table.add_column("Dataset Name", style="cyan", no_wrap=True)
    table.add_column("Dimensions", style="green")
    table.add_column("Description", style="white")

    for name, dims, desc in datasets_info:
        table.add_row(name, dims, desc)

    console.print(table)
    console.print("\n[dim]Use these dataset names directly with any pb CLI command.[/dim]")
    console.print("[dim]Example: pb preview small_table[/dim]")


@cli.command()
def requirements():
    """
    Check installed dependencies and their availability.
    """
    from rich.box import SIMPLE_HEAD

    dependencies = [
        ("polars", "Polars DataFrame support"),
        ("pandas", "Pandas DataFrame support"),
        ("ibis", "Ibis backend support (DuckDB, etc.)"),
        ("duckdb", "DuckDB database support"),
        ("pyarrow", "Parquet file support"),
    ]

    # Create requirements table
    table = Table(
        title="Dependency Status",
        show_header=True,
        header_style="bold magenta",
        box=SIMPLE_HEAD,
        title_style="bold cyan",
        title_justify="left",
    )

    table.add_column("Package", style="cyan", no_wrap=True)
    table.add_column("Status", style="white")
    table.add_column("Description", style="dim")

    for package, description in dependencies:
        if _is_lib_present(package):
            status = "[green]✓ Installed[/green]"
        else:
            status = "[red]✗ Not installed[/red]"

        table.add_row(package, status, description)

    console.print(table)
    console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")


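# Editorial note, not part of the pointblank source: illustrative shell usage
# of the requirements command above; it prints one row per optional dependency,
# with a green check when _is_lib_present() finds the package importable.
#
#   pb requirements
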
def _rich_print_scan_table(
    scan_result: Any,
    data_source: str,
    source_type: str,
    table_type: str,
    total_rows: int | None = None,
    total_columns: int | None = None,
) -> None:
    """
    Display scan results as a Rich table in the terminal with statistical measures.

    Args:
        scan_result: The GT object from col_summary_tbl()
        data_source: Name of the data source being scanned
        source_type: Type of data source (e.g., "Pointblank dataset: small_table")
        table_type: Type of table (e.g., "polars.LazyFrame")
        total_rows: Total number of rows in the dataset
        total_columns: Total number of columns in the dataset
    """
    try:
        import re

        import narwhals as nw
        from rich.box import SIMPLE_HEAD

        # Extract the underlying DataFrame from the GT object
        # The GT object has a _tbl_data attribute that contains the DataFrame
        gt_data = scan_result._tbl_data

        # Convert to Narwhals DataFrame for consistent handling
        nw_data = nw.from_native(gt_data)

        # Convert to dictionary for easier access
        data_dict = nw_data.to_dict(as_series=False)

        # Create main scan table with missing data table styling
        # Create a comprehensive title with data source, source type, and table type
        title_text = f"Column Summary / {source_type} / {table_type}"

        # Add dimensions subtitle in gray if available
        if total_rows is not None and total_columns is not None:
            title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"

        # Create the scan table
        scan_table = Table(
            title=title_text,
            show_header=True,
            header_style="bold magenta",
            box=SIMPLE_HEAD,
            title_style="bold cyan",
            title_justify="left",
        )

        # Add columns with specific styling and appropriate widths
        scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
        scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
        scan_table.add_column(
            "NA", style="red", width=6, justify="right"
        )  # Adjusted for better formatting
        scan_table.add_column(
            "UQ", style="green", width=8, justify="right"
        )  # Adjusted for boolean values

        # Add statistical columns if they exist with appropriate widths
        stat_columns = []
        column_mapping = {
            "mean": ("Mean", "blue", 9),
            "std": ("SD", "blue", 9),
            "min": ("Min", "yellow", 9),
            "median": ("Med", "yellow", 9),
            "max": ("Max", "yellow", 9),
            "q_1": ("Q₁", "magenta", 8),
            "q_3": ("Q₃", "magenta", 9),
            "iqr": ("IQR", "magenta", 8),
        }

        for col_key, (display_name, color, width) in column_mapping.items():
            if col_key in data_dict:
                scan_table.add_column(display_name, style=color, width=width, justify="right")
                stat_columns.append(col_key)

        # Helper function to extract column name and type from HTML
        def extract_column_info(html_content: str) -> tuple[str, str]:
            """Extract column name and type from HTML formatted content."""
            # Extract column name from first div
            name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
            column_name = name_match.group(1) if name_match else "Unknown"

            # Extract data type from second div (with gray color)
            type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
            if type_match:
                data_type = type_match.group(1)
                # Convert to compact format using the existing function
                compact_type = _format_dtype_compact(data_type)
                data_type = compact_type
            else:
                data_type = "unknown"

            return column_name, data_type

        # Helper function to format values with improved number formatting
        def format_value(
            value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
        ) -> str:
            """Format values for display with smart number formatting and HTML cleanup."""
            if value is None or (isinstance(value, str) and value.strip() == ""):
                return "[dim]—[/dim]"

            # Handle missing values indicator
            if is_missing and str(value) == "0":
                return "[green]●[/green]"  # No missing values

            # Clean up HTML formatting from the raw data
            str_val = str(value)

            # Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
            if "<br>" in str_val:
                str_val = str_val.split("<br>")[0].strip()
                # For unique values, we want just the integer part
                if is_unique:
                    try:
                        # Try to extract just the integer part for unique counts
                        num_val = float(str_val)
                        return str(int(num_val))
                    except (ValueError, TypeError):
                        pass

            # Now handle HTML content (especially from boolean unique values)
            if "<" in str_val and ">" in str_val:
                # Remove HTML tags completely for cleaner display
                str_val = re.sub(r"<[^>]+>", "", str_val).strip()
                # Clean up extra whitespace
                str_val = re.sub(r"\s+", " ", str_val).strip()

            # Handle values like "2<.01" - extract the first number
            if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
                # Extract number before the < symbol
                before_lt = str_val.split("<")[0].strip()
                if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
                    str_val = before_lt

            # Handle boolean unique values like "T0.62F0.38" - extract the more readable format
            if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
                # Extract T and F values
                t_match = re.search(r"T(\d+\.\d+)", str_val)
                f_match = re.search(r"F(\d+\.\d+)", str_val)
                if t_match and f_match:
                    t_val = float(t_match.group(1))
                    f_val = float(f_match.group(1))
                    # Show as "T0.62F0.38" but truncated if needed
                    formatted = f"T{t_val:.2f}F{f_val:.2f}"
                    if len(formatted) > max_width:
                        # Truncate to fit, showing dominant value
                        if t_val > f_val:
                            return f"T{t_val:.1f}"
                        else:
                            return f"F{f_val:.1f}"
                    return formatted

            # Try to parse as a number for better formatting
            try:
                # Try to convert to float first
                num_val = float(str_val)

                # Handle special cases
                if num_val == 0:
                    return "0"
                elif abs(num_val) == int(abs(num_val)) and abs(num_val) < 10000:
                    # Simple integers under 10000
                    return str(int(num_val))
                elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
                    # Likely dates in YYYYMMDD format - format as date-like
                    int_val = int(num_val)
                    if 19000101 <= int_val <= 29991231:  # Reasonable date range
                        str_date = str(int_val)
                        if len(str_date) == 8:
                            return (
                                f"{str_date[:4]}-{str_date[4:6]}-{str_date[6:]}"[: max_width - 1]
                                + "…"
                            )
                    # Otherwise treat as large number
                    return f"{num_val / 1000000:.1f}M"
                elif abs(num_val) >= 1000000:
                    # Large numbers - use scientific notation or M/k notation
                    if abs(num_val) >= 1000000000:
                        return f"{num_val:.1e}"
                    else:
                        return f"{num_val / 1000000:.1f}M"
                elif abs(num_val) >= 10000:
                    # Numbers >= 10k - use compact notation
                    return f"{num_val / 1000:.1f}k"
                elif abs(num_val) >= 100:
                    # Numbers 100-9999 - show with minimal decimals
                    return f"{num_val:.1f}"
                elif abs(num_val) >= 10:
                    # Numbers 10-99 - show with one decimal
                    return f"{num_val:.1f}"
                elif abs(num_val) >= 1:
                    # Numbers 1-9 - show with two decimals
                    return f"{num_val:.2f}"
                elif abs(num_val) >= 0.01:
                    # Small numbers - show with appropriate precision
                    return f"{num_val:.2f}"
                else:
                    # Very small numbers - use scientific notation
                    return f"{num_val:.1e}"

            except (ValueError, TypeError):
                # Not a number, handle as string
                pass

            # Handle date/datetime strings - show abbreviated format
            if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
                # Likely a date/datetime, show abbreviated
                if len(str_val) > max_width:
                    return str_val[: max_width - 1] + "…"

            # General string truncation with ellipsis
            if len(str_val) > max_width:
                return str_val[: max_width - 1] + "…"

            return str_val

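        # Worked examples (editorial, not part of the package source) of what
        # format_value() returns under the rules above:
        #   format_value(0)          -> "0"
        #   format_value(1234)       -> "1234"     (integer under 10000)
        #   format_value(56789)      -> "56.8k"    (>= 10k compact notation)
        #   format_value(2500000)    -> "2.5M"
        #   format_value(0.004)      -> "4.0e-03"  (scientific for tiny values)
        #   format_value("0", is_missing=True) -> "[green]●[/green]"
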
        # Populate table rows
        num_rows = len(data_dict["colname"])
        for i in range(num_rows):
            row_data = []

            # Column name and type from HTML content
            colname_html = data_dict["colname"][i]
            column_name, data_type = extract_column_info(colname_html)
            row_data.append(column_name)
            row_data.append(data_type)

            # Missing values (NA)
            missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
            row_data.append(format_value(missing_val, is_missing=True, max_width=6))

            # Unique values (UQ)
            unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
            row_data.append(format_value(unique_val, is_unique=True, max_width=8))

            # Statistical columns
            for stat_col in stat_columns:
                stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
                # Use appropriate width based on column type
                if stat_col in ["q_1", "iqr"]:
                    width = 8
                elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
                    width = 9
                else:
                    width = 8
                row_data.append(format_value(stat_val, max_width=width))

            scan_table.add_row(*row_data)

        # Display the results
        console.print()
        console.print(scan_table)

    except Exception as e:
        # Fallback to simple message if table creation fails
        console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
        console.print(f"[red]Error displaying table: {str(e)}[/red]")


def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
|
|
2227
|
+
"""Convert a missing values GT table to Rich table with special formatting.
|
|
2228
|
+
|
|
2229
|
+
Args:
|
|
2230
|
+
gt_table: The GT table object for missing values
|
|
2231
|
+
original_data: The original data source to extract column types
|
|
2232
|
+
"""
|
|
2233
|
+
try:
|
|
2234
|
+
# Extract the underlying data from the GT table
|
|
2235
|
+
df = None
|
|
2236
|
+
|
|
2237
|
+
if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
|
|
2238
|
+
df = gt_table._tbl_data
|
|
2239
|
+
elif hasattr(gt_table, "_data") and gt_table._data is not None:
|
|
2240
|
+
df = gt_table._data
|
|
2241
|
+
elif hasattr(gt_table, "data") and gt_table.data is not None:
|
|
2242
|
+
df = gt_table.data
|
|
2243
|
+
|
|
2244
|
+
if df is not None:
|
|
2245
|
+
from rich.box import SIMPLE_HEAD
|
|
2246
|
+
|
|
2247
|
+
# Create the missing values table
|
|
2248
|
+
rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
|
|
2249
|
+
|
|
2250
|
+
# Get column names
|
|
2251
|
+
columns = []
|
|
2252
|
+
try:
|
|
2253
|
+
if hasattr(df, "columns"):
|
|
2254
|
+
columns = list(df.columns)
|
|
2255
|
+
elif hasattr(df, "schema"):
|
|
2256
|
+
columns = list(df.schema.names)
|
|
2257
|
+
except Exception as e:
|
|
2258
|
+
console.print(f"[red]Error getting columns:[/red] {e}")
|
|
2259
|
+
columns = []
|
|
2260
|
+
|
|
2261
|
+
if not columns:
|
|
2262
|
+
columns = [f"Column {i + 1}" for i in range(10)] # Fallback
|
|
2263
|
+
|
|
2264
|
+
# Get original data to extract column types
|
|
2265
|
+
column_types = {}
|
|
2266
|
+
if original_data is not None:
|
|
2267
|
+
try:
|
|
2268
|
+
# Get column types from original data
|
|
2269
|
+
if hasattr(original_data, "columns"):
|
|
2270
|
+
original_columns = list(original_data.columns)
|
|
2271
|
+
column_types = _get_column_dtypes(original_data, original_columns)
|
|
2272
|
+
except Exception as e:
|
|
2273
|
+
console.print(f"[red]Error getting column types:[/red] {e}")
|
|
2274
|
+
pass # Use empty dict as fallback
|
|
2275
|
+
|
|
2276
|
+
# Add columns to Rich table with special formatting for missing values table
|
|
2277
|
+
sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
|
|
2278
|
+
|
|
2279
|
+
# Two separate columns: Column name (20 chars) and Data type (10 chars)
|
|
2280
|
+
rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
|
|
2281
|
+
rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
|
|
2282
|
+
|
|
2283
|
+
# Sector columns: All same width, optimized for "100%" (4 chars + padding)
|
|
2284
|
+
for sector in sector_columns:
|
|
2285
|
+
rich_table.add_column(
|
|
2286
|
+
sector,
|
|
2287
|
+
style="cyan",
|
|
2288
|
+
justify="center",
|
|
2289
|
+
no_wrap=True,
|
|
2290
|
+
width=5, # Fixed width optimized for percentage values
|
|
2291
|
+
)
|
|
2292
|
+
|
|
2293
|
+
# Convert data to rows with special formatting
|
|
2294
|
+
rows = []
|
|
2295
|
+
try:
|
|
2296
|
+
if hasattr(df, "to_dicts"):
|
|
2297
|
+
data_dict = df.to_dicts()
|
|
2298
|
+
elif hasattr(df, "to_dict"):
|
|
2299
|
+
data_dict = df.to_dict("records")
|
|
2300
|
+
else:
|
|
2301
|
+
data_dict = []
|
|
2302
|
+
|
|
2303
|
+
for i, row in enumerate(data_dict):
|
|
2304
|
+
try:
|
|
2305
|
+
# Each row should have: [column_name, data_type, sector1, sector2, ...]
|
|
2306
|
+
column_name = str(row.get("columns", ""))
|
|
2307
|
+
|
|
2308
|
+
# Truncate column name to 20 characters with ellipsis if needed
|
|
2309
|
+
if len(column_name) > 20:
|
|
2310
|
+
truncated_name = column_name[:17] + "…"
|
|
2311
|
+
else:
|
|
2312
|
+
truncated_name = column_name
|
|
2313
|
+
|
|
2314
|
+
# Get data type for this column
|
|
2315
|
+
if column_name in column_types:
|
|
2316
|
+
dtype = column_types[column_name]
|
|
2317
|
+
if len(dtype) > 10:
|
|
2318
|
+
truncated_dtype = dtype[:9] + "…"
|
|
2319
|
+
else:
|
|
2320
|
+
truncated_dtype = dtype
|
|
2321
|
+
else:
|
|
2322
|
+
truncated_dtype = "?"
|
|
2323
|
+
|
|
2324
|
+
# Start building the row with column name and type
|
|
2325
|
+
formatted_row = [truncated_name, truncated_dtype]
|
|
2326
|
+
|
|
2327
|
+
# Add sector values (formatted percentages)
|
|
2328
|
+
for sector in sector_columns:
|
|
2329
|
+
value = row.get(sector, 0.0)
|
|
2330
|
+
if isinstance(value, (int, float)):
|
|
2331
|
+
formatted_row.append(_format_missing_percentage(float(value)))
|
|
2332
|
+
else:
|
|
2333
|
+
formatted_row.append(str(value))
|
|
2334
|
+
|
|
2335
|
+
rows.append(formatted_row)
|
|
2336
|
+
|
|
2337
|
+
except Exception as e:
|
|
2338
|
+
console.print(f"[red]Error processing row {i}:[/red] {e}")
|
|
2339
|
+
continue
|
|
2340
|
+
|
|
2341
|
+
except Exception as e:
|
|
2342
|
+
console.print(f"[red]Error extracting data:[/red] {e}")
|
|
2343
|
+
rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
|
|
2344
|
+
|
|
2345
|
+
# Add rows to Rich table
|
|
2346
|
+
for row in rows:
|
|
2347
|
+
try:
|
|
2348
|
+
rich_table.add_row(*row)
|
|
2349
|
+
except Exception as e:
|
|
2350
|
+
console.print(f"[red]Error adding row:[/red] {e}")
|
|
2351
|
+
break
|
|
2352
|
+
|
|
2353
|
+
# Show the table with custom spanner header if we have sector columns
|
|
2354
|
+
if sector_columns:
|
|
2355
|
+
# Create a custom header line that shows the spanner
|
|
2356
|
+
header_parts = []
|
|
2357
|
+
header_parts.append(" " * 20) # Space for Column header
|
|
2358
|
+
header_parts.append(" " * 10) # Space for Type header
|
|
2359
|
+
|
|
2360
|
+
# Left-align "Row Sectors" with the first numbered column
|
|
2361
|
+
row_sectors_text = "Row Sectors"
|
|
2362
|
+
header_parts.append(row_sectors_text)
|
|
2363
|
+
|
|
2364
|
+
# Print the custom spanner header
|
|
2365
|
+
console.print("[dim]" + " ".join(header_parts) + "[/dim]")
|
|
2366
|
+
|
|
2367
|
+
# Add a horizontal rule below the spanner
|
|
2368
|
+
rule_parts = []
|
|
2369
|
+
rule_parts.append(" " * 20) # Space for Column header
|
|
2370
|
+
rule_parts.append(" " * 10) # Space for Type header
|
|
2371
|
+
|
|
2372
|
+
# Use a fixed width horizontal rule for "Row Sectors"
|
|
2373
|
+
horizontal_rule = "─" * 20
|
|
2374
|
+
rule_parts.append(horizontal_rule)
|
|
2375
|
+
|
|
2376
|
+
# Print the horizontal rule
|
|
2377
|
+
console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
|
|
2378
|
+
|
|
2379
|
+
# Print the Rich table (will handle terminal width automatically)
|
|
2380
|
+
console.print(rich_table)
|
|
2381
|
+
footer_text = (
|
|
2382
|
+
"[dim]Symbols: [green]●[/green] = no missing values, "
|
|
2383
|
+
"[red]●[/red] = completely missing, "
|
|
2384
|
+
"<1% = less than 1% missing, "
|
|
2385
|
+
">99% = more than 99% missing[/dim]"
|
|
2386
|
+
)
|
|
2387
|
+
console.print(footer_text)
|
|
2388
|
+
|
|
2389
|
+
else:
|
|
2390
|
+
# Fallback to regular table display
|
|
2391
|
+
_rich_print_gt_table(gt_table)
|
|
2392
|
+
|
|
2393
|
+
except Exception as e:
|
|
2394
|
+
console.print(f"[red]Error rendering missing values table:[/red] {e}")
|
|
2395
|
+
# Fallback to regular table display
|
|
2396
|
+
_rich_print_gt_table(gt_table)
|
|
2397
|
+
|
|
2398
|
+
|
|
2399
|
+
def _map_parameters_to_checks(
    checks_list: list[str], columns_list: list[str], sets_list: list[str], values_list: list[float]
) -> tuple[list[str], list[str], list[float]]:
    """
    Map parameters to checks intelligently, handling flexible parameter ordering.

    This function distributes the provided parameters across checks based on what each check needs.
    For checks that don't need certain parameters, None/empty values are assigned.

    Args:
        checks_list: List of validation check types
        columns_list: List of column names provided by user
        sets_list: List of set values provided by user
        values_list: List of numeric values provided by user

    Returns:
        Tuple of (mapped_columns, mapped_sets, mapped_values) where each list
        has the same length as checks_list
    """
    mapped_columns = []
    mapped_sets = []
    mapped_values = []

    # Keep track of which parameters we've used
    column_index = 0
    set_index = 0
    value_index = 0

    for check in checks_list:
        # Determine what parameters this check needs
        needs_column = check in [
            "col-vals-not-null",
            "col-exists",
            "col-vals-in-set",
            "col-vals-gt",
            "col-vals-ge",
            "col-vals-lt",
            "col-vals-le",
        ]
        needs_set = check == "col-vals-in-set"
        needs_value = check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]

        # Assign column parameter if needed
        if needs_column:
            if column_index < len(columns_list):
                mapped_columns.append(columns_list[column_index])
                column_index += 1
            else:
                mapped_columns.append(None)  # Will cause validation error later
        else:
            mapped_columns.append(None)

        # Assign set parameter if needed
        if needs_set:
            if set_index < len(sets_list):
                mapped_sets.append(sets_list[set_index])
                set_index += 1
            else:
                mapped_sets.append(None)  # Will cause validation error later
        else:
            mapped_sets.append(None)

        # Assign value parameter if needed
        if needs_value:
            if value_index < len(values_list):
                mapped_values.append(values_list[value_index])
                value_index += 1
            else:
                mapped_values.append(None)  # Will cause validation error later
        else:
            mapped_values.append(None)

    return mapped_columns, mapped_sets, mapped_values


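# Worked example (editorial, not part of the package source): for
#   checks  = ["rows-distinct", "col-vals-gt", "col-vals-in-set"]
#   columns = ["age", "status"]
#   sets    = ["active,inactive"]
#   values  = [18.0]
# the mapping yields
#   mapped_columns = [None, "age", "status"]
#   mapped_sets    = [None, None, "active,inactive"]
#   mapped_values  = [None, 18.0, None]
# i.e. column/set/value options are consumed left-to-right by the checks
# that actually need them.
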
def _resolve_column_indices(columns_list, data):
    """
    Replace any '#N' entries in columns_list with the actual column name from data (1-based).
    """
    # Get column names from the data
    if hasattr(data, "columns"):
        all_columns = list(data.columns)
    elif hasattr(data, "schema"):
        all_columns = list(data.schema.names)
    else:
        return columns_list  # Can't resolve, return as-is

    resolved = []
    for col in columns_list:
        if isinstance(col, str) and col.startswith("#"):
            try:
                idx = int(col[1:]) - 1  # 1-based to 0-based
                if 0 <= idx < len(all_columns):
                    resolved.append(all_columns[idx])
                else:
                    resolved.append(col)  # Out of range, keep as-is
            except Exception:
                resolved.append(col)  # Not a valid number, keep as-is
        else:
            resolved.append(col)
    return resolved


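# Worked example (editorial, not part of the package source): with a table
# whose columns are ["id", "email", "score"],
#   _resolve_column_indices(["#2", "score", "#9"], data)
# returns ["email", "score", "#9"]: "#2" resolves to the second column, while
# the out-of-range "#9" is kept as-is for the caller to report.
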
def _display_validation_result(
    validation: Any,
    checks_list: list[str],
    columns_list: list[str],
    sets_list: list[str],
    values_list: list[float],
    data_source: str,
    step_index: int,
    total_checks: int,
    show_extract: bool,
    write_extract: str | None,
    limit: int,
) -> None:
    """Display a single validation result with proper formatting for single or multiple checks."""
    from rich.box import SIMPLE_HEAD

    # Get parameters for this specific check
    check = checks_list[step_index]
    column = columns_list[step_index] if step_index < len(columns_list) else None
    set_val = sets_list[step_index] if step_index < len(sets_list) else None
    value = values_list[step_index] if step_index < len(values_list) else None

    # Get validation step info
    step_info = None
    if hasattr(validation, "validation_info") and len(validation.validation_info) > step_index:
        step_info = validation.validation_info[step_index]

    # Create friendly title for table
    if total_checks == 1:
        # Single check - use original title format
        if check == "rows-distinct":
            table_title = "Validation Result: Rows Distinct"
        elif check == "col-vals-not-null":
            table_title = "Validation Result: Column Values Not Null"
        elif check == "rows-complete":
            table_title = "Validation Result: Rows Complete"
        elif check == "col-exists":
            table_title = "Validation Result: Column Exists"
        elif check == "col-vals-in-set":
            table_title = "Validation Result: Column Values In Set"
        elif check == "col-vals-gt":
            table_title = "Validation Result: Column Values Greater Than"
        elif check == "col-vals-ge":
            table_title = "Validation Result: Column Values Greater Than Or Equal"
        elif check == "col-vals-lt":
            table_title = "Validation Result: Column Values Less Than"
        elif check == "col-vals-le":
            table_title = "Validation Result: Column Values Less Than Or Equal"
        else:
            table_title = f"Validation Result: {check.replace('-', ' ').title()}"
    else:
        # Multiple checks - add numbering
        if check == "rows-distinct":
            base_title = "Rows Distinct"
        elif check == "col-vals-not-null":
            base_title = "Column Values Not Null"
        elif check == "rows-complete":
            base_title = "Rows Complete"
        elif check == "col-exists":
            base_title = "Column Exists"
        elif check == "col-vals-in-set":
            base_title = "Column Values In Set"
        elif check == "col-vals-gt":
            base_title = "Column Values Greater Than"
        elif check == "col-vals-ge":
            base_title = "Column Values Greater Than Or Equal"
        elif check == "col-vals-lt":
            base_title = "Column Values Less Than"
        elif check == "col-vals-le":
            base_title = "Column Values Less Than Or Equal"
        else:
            base_title = check.replace("-", " ").title()

        table_title = f"Validation Result ({step_index + 1} of {total_checks}): {base_title}"

    # Create the validation results table
    result_table = Table(
        title=table_title,
        show_header=True,
        header_style="bold magenta",
        box=SIMPLE_HEAD,
        title_style="bold cyan",
        title_justify="left",
    )
    result_table.add_column("Property", style="cyan", no_wrap=True)
    result_table.add_column("Value", style="white")

    # Add basic info
    result_table.add_row("Data Source", data_source)
    result_table.add_row("Check Type", check)

    # Add column info for column-specific checks
    if check in [
        "col-vals-not-null",
        "col-exists",
        "col-vals-in-set",
        "col-vals-gt",
        "col-vals-ge",
        "col-vals-lt",
        "col-vals-le",
    ]:
        result_table.add_row("Column", column)

    # Add set info for col-vals-in-set check
    if check == "col-vals-in-set" and set_val:
        allowed_values = [v.strip() for v in set_val.split(",")]
        result_table.add_row("Allowed Values", ", ".join(allowed_values))

    # Add value info for range checks
    if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"] and value is not None:
        if check == "col-vals-gt":
            operator = ">"
        elif check == "col-vals-ge":
            operator = ">="
        elif check == "col-vals-lt":
            operator = "<"
        elif check == "col-vals-le":
            operator = "<="
        result_table.add_row("Threshold", f"{operator} {value}")

    # Get validation details
    if step_info:
        result_table.add_row("Total Rows Tested", f"{step_info.n:,}")
        result_table.add_row("Passing Rows", f"{step_info.n_passed:,}")
        result_table.add_row("Failing Rows", f"{step_info.n_failed:,}")

        # Check if this step passed
        step_passed = step_info.n_failed == 0

        # Overall result with color coding
        if step_passed:
            result_table.add_row("Result", "[green]✓ PASSED[/green]")
            if check == "rows-distinct":
                result_table.add_row("Duplicate Rows", "[green]None found[/green]")
            elif check == "col-vals-not-null":
                result_table.add_row("Null Values", "[green]None found[/green]")
            elif check == "rows-complete":
                result_table.add_row("Incomplete Rows", "[green]None found[/green]")
            elif check == "col-exists":
                result_table.add_row("Column Status", "[green]Column exists[/green]")
            elif check == "col-vals-in-set":
                result_table.add_row("Values Status", "[green]All values in allowed set[/green]")
            elif check == "col-vals-gt":
                result_table.add_row("Values Status", f"[green]All values > {value}[/green]")
            elif check == "col-vals-ge":
                result_table.add_row("Values Status", f"[green]All values >= {value}[/green]")
            elif check == "col-vals-lt":
                result_table.add_row("Values Status", f"[green]All values < {value}[/green]")
            elif check == "col-vals-le":
                result_table.add_row("Values Status", f"[green]All values <= {value}[/green]")
        else:
            result_table.add_row("Result", "[red]✗ FAILED[/red]")
            if check == "rows-distinct":
                result_table.add_row("Duplicate Rows", f"[red]{step_info.n_failed:,} found[/red]")
            elif check == "col-vals-not-null":
                result_table.add_row("Null Values", f"[red]{step_info.n_failed:,} found[/red]")
            elif check == "rows-complete":
                result_table.add_row("Incomplete Rows", f"[red]{step_info.n_failed:,} found[/red]")
            elif check == "col-exists":
                result_table.add_row("Column Status", "[red]Column does not exist[/red]")
            elif check == "col-vals-in-set":
                result_table.add_row("Invalid Values", f"[red]{step_info.n_failed:,} found[/red]")
            elif check == "col-vals-gt":
                result_table.add_row(
                    "Invalid Values", f"[red]{step_info.n_failed:,} values <= {value}[/red]"
                )
            elif check == "col-vals-ge":
                result_table.add_row(
                    "Invalid Values", f"[red]{step_info.n_failed:,} values < {value}[/red]"
                )
            elif check == "col-vals-lt":
                result_table.add_row(
                    "Invalid Values", f"[red]{step_info.n_failed:,} values >= {value}[/red]"
                )
            elif check == "col-vals-le":
                result_table.add_row(
                    "Invalid Values", f"[red]{step_info.n_failed:,} values > {value}[/red]"
                )

    console.print()
    console.print(result_table)

    # Show extract and summary for single check only, or if this is a failed step in multiple checks
    if total_checks == 1:
        # For single check, show extract and summary as before
        _show_extract_and_summary(
            validation,
            check,
            column,
            set_val,
            value,
            data_source,
            step_index,
            step_info,
            show_extract,
            write_extract,
            limit,
        )
    else:
        # For multiple checks, show summary panel and handle extract if needed
        if step_info:
            step_passed = step_info.n_failed == 0
            if step_passed:
                # Create success message for this step
+
if check == "rows-distinct":
|
|
2707
|
+
success_message = f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
|
|
2708
|
+
elif check == "col-vals-not-null":
|
|
2709
|
+
success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
|
|
2710
|
+
elif check == "rows-complete":
|
|
2711
|
+
success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
|
|
2712
|
+
elif check == "col-exists":
|
|
2713
|
+
success_message = f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
|
|
2714
|
+
elif check == "col-vals-in-set":
|
|
2715
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
|
|
2716
|
+
elif check == "col-vals-gt":
|
|
2717
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
|
|
2718
|
+
elif check == "col-vals-ge":
|
|
2719
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
|
|
2720
|
+
elif check == "col-vals-lt":
|
|
2721
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
|
|
2722
|
+
elif check == "col-vals-le":
|
|
2723
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
|
|
2724
|
+
else:
|
|
2725
|
+
success_message = f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
|
|
2726
|
+
|
|
2727
|
+
console.print(
|
|
2728
|
+
Panel(
|
|
2729
|
+
success_message,
|
|
2730
|
+
border_style="green",
|
|
2731
|
+
)
|
|
2732
|
+
)
|
|
2733
|
+
else:
|
|
2734
|
+
# Create failure message for this step (without tip)
|
|
2735
|
+
if check == "rows-distinct":
|
|
2736
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
|
|
2737
|
+
elif check == "col-vals-not-null":
|
|
2738
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
|
|
2739
|
+
elif check == "rows-complete":
|
|
2740
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
|
|
2741
|
+
elif check == "col-exists":
|
|
2742
|
+
failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
|
|
2743
|
+
elif check == "col-vals-in-set":
|
|
2744
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
|
|
2745
|
+
elif check == "col-vals-gt":
|
|
2746
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
|
|
2747
|
+
elif check == "col-vals-ge":
|
|
2748
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
|
|
2749
|
+
elif check == "col-vals-lt":
|
|
2750
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
|
|
2751
|
+
elif check == "col-vals-le":
|
|
2752
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
|
|
2753
|
+
else:
|
|
2754
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
|
|
2755
|
+
|
|
2756
|
+
console.print(
|
|
2757
|
+
Panel(
|
|
2758
|
+
failure_message,
|
|
2759
|
+
border_style="red",
|
|
2760
|
+
)
|
|
2761
|
+
)
|
|
2762
|
+
|
|
2763
|
+
# For multiple checks, show extract if requested and this step failed
|
|
2764
|
+
if (show_extract or write_extract) and not step_passed:
|
|
2765
|
+
_show_extract_for_multi_check(
|
|
2766
|
+
validation,
|
|
2767
|
+
check,
|
|
2768
|
+
column,
|
|
2769
|
+
set_val,
|
|
2770
|
+
value,
|
|
2771
|
+
data_source,
|
|
2772
|
+
step_index,
|
|
2773
|
+
step_info,
|
|
2774
|
+
show_extract,
|
|
2775
|
+
write_extract,
|
|
2776
|
+
limit,
|
|
2777
|
+
)
|
|
2778
|
+
|
|
2779
|
+
|
|
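Note: the check-name-to-title mapping above is one of several long if/elif chains in this file keyed on the same CLI check names. A dict-driven lookup would be behaviorally equivalent; a minimal sketch of that alternative (the `_CHECK_TITLES` and `_base_title` names are hypothetical, not part of pointblank):

# Hypothetical refactor, not part of the diff: table-driven title lookup.
_CHECK_TITLES = {
    "rows-distinct": "Rows Distinct",
    "col-vals-not-null": "Column Values Not Null",
    "rows-complete": "Rows Complete",
    "col-exists": "Column Exists",
    "col-vals-in-set": "Column Values In Set",
    "col-vals-gt": "Column Values Greater Than",
    "col-vals-ge": "Column Values Greater Than Or Equal",
    "col-vals-lt": "Column Values Less Than",
    "col-vals-le": "Column Values Less Than Or Equal",
}


def _base_title(check: str) -> str:
    # Fall back to title-casing the hyphenated name, exactly as the elif chain does
    return _CHECK_TITLES.get(check, check.replace("-", " ").title())
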
def _show_extract_for_multi_check(
    validation: Any,
    check: str,
    column: str | None,
    set_val: str | None,
    value: float | None,
    data_source: str,
    step_index: int,
    step_info: Any,
    show_extract: bool,
    write_extract: str | None,
    limit: int,
) -> None:
    """Show extract for a single validation step in multiple checks scenario."""
    # Dynamic message based on check type
    if check == "rows-distinct":
        extract_message = "[yellow]Extract of failing rows (duplicates):[/yellow]"
        row_type = "duplicate rows"
    elif check == "rows-complete":
        extract_message = "[yellow]Extract of failing rows (incomplete rows):[/yellow]"
        row_type = "incomplete rows"
    elif check == "col-exists":
        extract_message = f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
        row_type = "missing column"
    elif check == "col-vals-not-null":
        extract_message = f"[yellow]Extract of failing rows (null values in '{column}'):[/yellow]"
        row_type = "rows with null values"
    elif check == "col-vals-in-set":
        extract_message = (
            f"[yellow]Extract of failing rows (invalid values in '{column}'):[/yellow]"
        )
        row_type = "rows with invalid values"
    elif check == "col-vals-gt":
        extract_message = (
            f"[yellow]Extract of failing rows (values in '{column}' <= {value}):[/yellow]"
        )
        row_type = f"rows with values <= {value}"
    elif check == "col-vals-ge":
        extract_message = (
            f"[yellow]Extract of failing rows (values in '{column}' < {value}):[/yellow]"
        )
        row_type = f"rows with values < {value}"
    elif check == "col-vals-lt":
        extract_message = (
            f"[yellow]Extract of failing rows (values in '{column}' >= {value}):[/yellow]"
        )
        row_type = f"rows with values >= {value}"
    elif check == "col-vals-le":
        extract_message = (
            f"[yellow]Extract of failing rows (values in '{column}' > {value}):[/yellow]"
        )
        row_type = f"rows with values > {value}"
    else:
        extract_message = "[yellow]Extract of failing rows:[/yellow]"
        row_type = "failing rows"

    if show_extract:
        console.print()
        console.print(extract_message)

    # Special handling for col-exists check - no rows to show when column doesn't exist
    if check == "col-exists":
        if show_extract:
            console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
            console.print(
                "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
            )
        if write_extract:
            console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
    else:
        try:
            # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
            failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)

            if failing_rows is not None and len(failing_rows) > 0:
                if show_extract:
                    # Limit the number of rows shown
                    if len(failing_rows) > limit:
                        display_rows = failing_rows.head(limit)
                        console.print(
                            f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
                        )
                    else:
                        display_rows = failing_rows
                        console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")

                    # Create a preview table using pointblank's preview function
                    import pointblank as pb

                    preview_table = pb.preview(
                        data=display_rows,
                        n_head=min(limit, len(display_rows)),
                        n_tail=0,
                        limit=limit,
                        show_row_numbers=True,
                    )

                    # Display using our Rich table function
                    _rich_print_gt_table(preview_table, show_summary=False)

                if write_extract:
                    try:
                        from pathlib import Path

                        folder_name = write_extract

                        # Create the output folder
                        output_folder = Path(folder_name)
                        output_folder.mkdir(parents=True, exist_ok=True)

                        # Create safe filename from check type
                        safe_check_type = check.replace("-", "_")
                        filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
                        filepath = output_folder / filename

                        # Limit the output if needed
                        write_rows = failing_rows
                        if len(failing_rows) > limit:
                            write_rows = failing_rows.head(limit)

                        # Save to CSV
                        if hasattr(write_rows, "write_csv"):
                            # Polars
                            write_rows.write_csv(str(filepath))
                        elif hasattr(write_rows, "to_csv"):
                            # Pandas
                            write_rows.to_csv(str(filepath), index=False)
                        else:
                            # Try converting to pandas as fallback
                            import pandas as pd

                            pd_data = pd.DataFrame(write_rows)
                            pd_data.to_csv(str(filepath), index=False)

                        rows_saved = len(write_rows) if hasattr(write_rows, "__len__") else limit
                        console.print(
                            f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
                        )
                        console.print(f"[dim] - {filename}: {rows_saved} rows[/dim]")
                    except Exception as e:
                        console.print(f"[yellow]Warning: Could not save failing rows: {e}[/yellow]")
            else:
                if show_extract:
                    console.print("[yellow]No failing rows could be extracted[/yellow]")
                if write_extract:
                    console.print("[yellow]No failing rows could be extracted to save[/yellow]")
        except Exception as e:
            if show_extract:
                console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
            if write_extract:
                console.print(f"[yellow]Could not extract failing rows to save: {e}[/yellow]")

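Note the off-by-one handling in `_show_extract_for_multi_check`: steps are tracked zero-based as `step_index`, while `Validate.get_data_extracts()` numbers steps from 1, hence `i=step_index + 1`. A minimal usage sketch of that extract call against a built-in dataset (illustrative only; assumes the validation has already been interrogated, as the CLI guarantees):

import pointblank as pb

validation = (
    pb.Validate(data=pb.load_dataset("small_table"))
    .col_vals_gt(columns="d", value=1000)  # step 1
    .interrogate()
)

# Step numbers start at 1; frame=True returns the failing rows as a DataFrame
# rather than a dict keyed by step number.
failing = validation.get_data_extracts(i=1, frame=True)
print(len(failing))
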
def _show_extract_and_summary(
    validation: Any,
    check: str,
    column: str | None,
    set_val: str | None,
    value: float | None,
    data_source: str,
    step_index: int,
    step_info: Any,
    show_extract: bool,
    write_extract: str | None,
    limit: int,
) -> None:
    """Show extract and summary for a validation step (used for single checks)."""
    step_passed = step_info.n_failed == 0 if step_info else True

    # Show extract if requested and validation failed
    if (show_extract or write_extract) and not step_passed:
        console.print()

        # Dynamic message based on check type
        if check == "rows-distinct":
            extract_message = "[yellow]Extract of failing rows (duplicates):[/yellow]"
            row_type = "duplicate rows"
        elif check == "rows-complete":
            extract_message = "[yellow]Extract of failing rows (incomplete rows):[/yellow]"
            row_type = "incomplete rows"
        elif check == "col-exists":
            extract_message = f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
            row_type = "missing column"
        elif check == "col-vals-not-null":
            extract_message = (
                f"[yellow]Extract of failing rows (null values in '{column}'):[/yellow]"
            )
            row_type = "rows with null values"
        elif check == "col-vals-in-set":
            extract_message = (
                f"[yellow]Extract of failing rows (invalid values in '{column}'):[/yellow]"
            )
            row_type = "rows with invalid values"
        elif check == "col-vals-gt":
            extract_message = (
                f"[yellow]Extract of failing rows (values in '{column}' <= {value}):[/yellow]"
            )
            row_type = f"rows with values <= {value}"
        elif check == "col-vals-ge":
            extract_message = (
                f"[yellow]Extract of failing rows (values in '{column}' < {value}):[/yellow]"
            )
            row_type = f"rows with values < {value}"
        elif check == "col-vals-lt":
            extract_message = (
                f"[yellow]Extract of failing rows (values in '{column}' >= {value}):[/yellow]"
            )
            row_type = f"rows with values >= {value}"
        elif check == "col-vals-le":
            extract_message = (
                f"[yellow]Extract of failing rows (values in '{column}' > {value}):[/yellow]"
            )
            row_type = f"rows with values > {value}"
        else:
            extract_message = "[yellow]Extract of failing rows:[/yellow]"
            row_type = "failing rows"

        if show_extract:
            console.print(extract_message)

        # Special handling for col-exists check - no rows to show when column doesn't exist
        if check == "col-exists" and not step_passed:
            if show_extract:
                console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
                console.print(
                    "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
                )
            if write_extract:
                console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
        else:
            try:
                # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
                failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)

                if failing_rows is not None and len(failing_rows) > 0:
                    if show_extract:
                        # Limit the number of rows shown
                        if len(failing_rows) > limit:
                            display_rows = failing_rows.head(limit)
                            console.print(
                                f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
                            )
                        else:
                            display_rows = failing_rows
                            console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")

                        # Create a preview table using pointblank's preview function
                        import pointblank as pb

                        preview_table = pb.preview(
                            data=display_rows,
                            n_head=min(limit, len(display_rows)),
                            n_tail=0,
                            limit=limit,
                            show_row_numbers=True,
                        )

                        # Display using our Rich table function
                        _rich_print_gt_table(preview_table, show_summary=False)

                    if write_extract:
                        try:
                            from pathlib import Path

                            folder_name = write_extract

                            # Create the output folder
                            output_folder = Path(folder_name)
                            output_folder.mkdir(parents=True, exist_ok=True)

                            # Create safe filename from check type
                            safe_check_type = check.replace("-", "_")
                            filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
                            filepath = output_folder / filename

                            # Limit the output if needed
                            write_rows = failing_rows
                            if len(failing_rows) > limit:
                                write_rows = failing_rows.head(limit)

                            # Save to CSV
                            if hasattr(write_rows, "write_csv"):
                                # Polars
                                write_rows.write_csv(str(filepath))
                            elif hasattr(write_rows, "to_csv"):
                                # Pandas
                                write_rows.to_csv(str(filepath), index=False)
                            else:
                                # Try converting to pandas as fallback
                                import pandas as pd

                                pd_data = pd.DataFrame(write_rows)
                                pd_data.to_csv(str(filepath), index=False)

                            rows_saved = (
                                len(write_rows) if hasattr(write_rows, "__len__") else limit
                            )
                            console.print(
                                f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
                            )
                            console.print(f"[dim] - {filename}: {rows_saved} rows[/dim]")
                        except Exception as e:
                            console.print(
                                f"[yellow]Warning: Could not save failing rows: {e}[/yellow]"
                            )
                else:
                    if show_extract:
                        console.print("[yellow]No failing rows could be extracted[/yellow]")
                    if write_extract:
                        console.print("[yellow]No failing rows could be extracted to save[/yellow]")
            except Exception as e:
                if show_extract:
                    console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
                if write_extract:
                    console.print(f"[yellow]Could not extract failing rows to save: {e}[/yellow]")

    # Summary message
    console.print()
    if step_passed:
        if check == "rows-distinct":
            success_message = (
                f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
            )
        elif check == "col-vals-not-null":
            success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
        elif check == "rows-complete":
            success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
        elif check == "col-exists":
            success_message = (
                f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
            )
        elif check == "col-vals-in-set":
            success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
        elif check == "col-vals-gt":
            success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
        elif check == "col-vals-ge":
            success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
        elif check == "col-vals-lt":
            success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
        elif check == "col-vals-le":
            success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
        else:
            success_message = (
                f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
            )

        console.print(Panel(success_message, border_style="green"))
    else:
        if step_info:
            if check == "rows-distinct":
                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
            elif check == "col-vals-not-null":
                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
            elif check == "rows-complete":
                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
            elif check == "col-exists":
                failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
            elif check == "col-vals-in-set":
                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
            elif check == "col-vals-gt":
                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
            elif check == "col-vals-ge":
                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
            elif check == "col-vals-lt":
                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
            elif check == "col-vals-le":
                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
            else:
                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"

            # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
            if not show_extract and check != "col-exists":
                failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"

            console.print(Panel(failure_message, border_style="red"))
        else:
            if check == "rows-distinct":
                failure_message = (
                    f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
                )
            elif check == "rows-complete":
                failure_message = (
                    f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
                )
            else:
                failure_message = (
                    f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
                )

            # Add hint about --show-extract if not already used
            if not show_extract:
                failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"

            console.print(Panel(failure_message, border_style="red"))

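Note: `_show_extract_and_summary` repeats the Polars/Pandas CSV fallback from `_show_extract_for_multi_check` nearly verbatim. The duck-typed save could be factored into one helper; a sketch under that assumption (the `_write_extract_csv` name is hypothetical, not in the diff):

from pathlib import Path
from typing import Any


def _write_extract_csv(rows: Any, filepath: Path) -> None:
    """Save a failing-rows extract, duck-typing across Polars and Pandas."""
    if hasattr(rows, "write_csv"):  # Polars
        rows.write_csv(str(filepath))
    elif hasattr(rows, "to_csv"):  # Pandas
        rows.to_csv(str(filepath), index=False)
    else:  # Last resort: coerce to pandas
        import pandas as pd

        pd.DataFrame(rows).to_csv(str(filepath), index=False)
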
@cli.command()
@click.argument("output_file", type=click.Path())
def make_template(output_file: str):
    """
    Create a validation script template.

    Creates a sample Python script with examples showing how to use Pointblank
    for data validation. Edit the template to add your own data loading and
    validation rules, then run it with 'pb run'.

    OUTPUT_FILE is the path where the template script will be created.

    Examples:

    \b
    pb make-template my_validation.py
    pb make-template validation_template.py
    """
    example_script = '''"""
Example Pointblank validation script.

This script demonstrates how to create validation rules for your data.
Modify the data loading and validation rules below to match your requirements.
"""

import pointblank as pb

# Load your data (replace this with your actual data source)
# You can load from various sources:
# data = pb.load_dataset("small_table")  # Built-in dataset
# data = pd.read_csv("your_data.csv")  # CSV file
# data = pl.read_parquet("data.parquet")  # Parquet file
# data = pb.load_data("database://connection")  # Database

data = pb.load_dataset("small_table")  # Example with built-in dataset

# Create a validation object
validation = (
    pb.Validate(
        data=data,
        tbl_name="Example Data",
        label="Validation Example",
        thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
    )
    # Add your validation rules here
    # Example rules (modify these based on your data structure):

    # Check that specific columns exist
    # .col_exists(["column1", "column2"])

    # Check for null values
    # .col_vals_not_null(columns="important_column")

    # Check value ranges
    # .col_vals_gt(columns="amount", value=0)
    # .col_vals_between(columns="score", left=0, right=100)

    # Check string patterns
    # .col_vals_regex(columns="email", pattern=r"^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$")

    # Check unique values
    # .col_vals_unique(columns="id")

    # Finalize the validation
    .interrogate()
)

# The validation object will be automatically used by the CLI
# You can also access results programmatically:
# print(f"All passed: {validation.all_passed()}")
# print(f"Failed steps: {validation.n_failed()}")
'''

    Path(output_file).write_text(example_script)
    console.print(f"[green]✓[/green] Validation script template created: {output_file}")
    console.print("\nEdit the template to add your data loading and validation rules, then run:")
    console.print(f"[cyan]pb run {output_file}[/cyan]")
    console.print(
        f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Override data source[/dim]"
    )

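Note: `pb run` (below) executes the script and then scans its namespace for validation objects, preferring a module-level variable named `validation`. The smallest script the template needs to produce is therefore just an interrogated `validation` at module level; a minimal sketch using the built-in `small_table` dataset (illustrative, with arbitrary example rules):

import pointblank as pb

# A module-level, already-interrogated Validate object is all `pb run` requires.
validation = (
    pb.Validate(data=pb.load_dataset("small_table"), tbl_name="small_table")
    .col_exists(["a", "b"])
    .col_vals_not_null(columns="a")
    .interrogate()
)
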
@cli.command()
@click.argument("validation_script", type=click.Path(exists=True))
@click.option("--data", type=str, help="Optional data source to override script's data loading")
@click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
@click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
@click.option(
    "--show-extract", is_flag=True, help="Show extract of failing rows if validation fails"
)
@click.option(
    "--write-extract",
    type=str,
    help="Save failing rows to folders (one CSV per step). Provide base name for folder.",
)
@click.option(
    "--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
)
@click.option(
    "--fail-on",
    type=click.Choice(["critical", "error", "warning", "any"], case_sensitive=False),
    help="Exit with non-zero code when validation reaches this threshold level",
)
def run(
    validation_script: str,
    data: str | None,
    output_html: str | None,
    output_json: str | None,
    show_extract: bool,
    write_extract: str | None,
    limit: int,
    fail_on: str | None,
):
    """
    Run a Pointblank validation script.

    VALIDATION_SCRIPT should be a Python file that defines validation logic.
    The script should load its own data and create validation objects.

    If --data is provided, it will be available as a 'cli_data' variable in the script,
    allowing you to optionally override your script's data loading.

    DATA can be:

    \b
    - CSV file path (e.g., data.csv)
    - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
    - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
    - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
    - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)

    Examples:

    \b
    pb run validation_script.py
    pb run validation_script.py --data data.csv
    pb run validation_script.py --data small_table --output-html report.html
    pb run validation_script.py --show-extract --fail-on error
    pb run validation_script.py --write-extract extracts_folder --fail-on critical
    """
    try:
        # Load optional data override if provided
        cli_data = None
        if data:
            with console.status(f"[bold green]Loading data from {data}..."):
                cli_data = _load_data_source(data)
            console.print(f"[green]✓[/green] Loaded data override: {data}")

        # Execute the validation script
        with console.status("[bold green]Running validation script..."):
            # Read and execute the validation script
            script_content = Path(validation_script).read_text()

            # Create a namespace with pointblank and optional CLI data
            namespace = {
                "pb": pb,
                "pointblank": pb,
                "cli_data": cli_data,  # Available if --data was provided
                "__name__": "__main__",
                "__file__": str(Path(validation_script).resolve()),
            }

            # Execute the script
            try:
                exec(script_content, namespace)
            except Exception as e:
                console.print(f"[red]Error executing validation script:[/red] {e}")
                sys.exit(1)

        # Look for validation objects in the namespace
        validations = []

        # Look for the 'validation' variable specifically first
        if "validation" in namespace:
            validations.append(namespace["validation"])

        # Also look for any other validation objects
        for key, value in namespace.items():
            if (
                key != "validation"
                and hasattr(value, "interrogate")
                and hasattr(value, "validation_info")
            ):
                validations.append(value)
            # Also check if it's a Validate object that has been interrogated
            elif key != "validation" and str(type(value)).find("Validate") != -1:
                validations.append(value)

        if not validations:
            raise ValueError(
                "No validation objects found in script. "
                "Script should create Validate objects and call .interrogate() on them."
            )

        console.print(f"[green]✓[/green] Found {len(validations)} validation object(s)")

        # Process each validation
        overall_failed = False
        overall_critical = False
        overall_error = False
        overall_warning = False

        for i, validation in enumerate(validations, 1):
            if len(validations) > 1:
                console.print(f"\n[bold cyan]Validation {i}:[/bold cyan]")

            # Display summary
            _display_validation_summary(validation)

            # Check failure status
            validation_failed = False
            has_critical = False
            has_error = False
            has_warning = False

            if hasattr(validation, "validation_info") and validation.validation_info:
                for step_info in validation.validation_info:
                    if step_info.critical:
                        has_critical = True
                        overall_critical = True
                    if step_info.error:
                        has_error = True
                        overall_error = True
                    if step_info.warning:
                        has_warning = True
                        overall_warning = True
                    if step_info.n_failed > 0:
                        validation_failed = True
                        overall_failed = True

            # Handle extract functionality for failed validations
            failed_steps = []
            if (
                validation_failed
                and hasattr(validation, "validation_info")
                and validation.validation_info
            ):
                for j, step_info in enumerate(validation.validation_info, 1):
                    if step_info.n_failed > 0:
                        failed_steps.append((j, step_info))

            if validation_failed and failed_steps and (show_extract or write_extract):
                console.print()

                if show_extract:
                    extract_title = "Extract of failing rows from validation steps"
                    if len(validations) > 1:
                        extract_title += f" (Validation {i})"
                    console.print(f"[yellow]{extract_title}:[/yellow]")

                    for step_num, step_info in failed_steps:
                        try:
                            failing_rows = validation.get_data_extracts(i=step_num, frame=True)

                            if failing_rows is not None and len(failing_rows) > 0:
                                console.print(
                                    f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
                                )

                                # Limit the number of rows shown
                                if len(failing_rows) > limit:
                                    display_rows = failing_rows.head(limit)
                                    console.print(
                                        f"[dim]Showing first {limit} of {len(failing_rows)} failing rows[/dim]"
                                    )
                                else:
                                    display_rows = failing_rows
                                    console.print(
                                        f"[dim]Showing all {len(failing_rows)} failing rows[/dim]"
                                    )

                                # Create a preview table using pointblank's preview function
                                preview_table = pb.preview(
                                    data=display_rows,
                                    n_head=min(limit, len(display_rows)),
                                    n_tail=0,
                                    limit=limit,
                                    show_row_numbers=True,
                                )

                                # Display using our Rich table function
                                _rich_print_gt_table(preview_table, show_summary=False)
                            else:
                                console.print(
                                    f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
                                )
                                console.print("[yellow]No failing rows could be extracted[/yellow]")
                        except Exception as e:
                            console.print(
                                f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
                            )
                            console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")

                if write_extract:
                    try:
                        folder_name = write_extract

                        # Add validation number if multiple validations
                        if len(validations) > 1:
                            folder_name = f"{folder_name}_validation_{i}"

                        # Create the output folder
                        output_folder = Path(folder_name)
                        output_folder.mkdir(parents=True, exist_ok=True)

                        saved_files = []

                        # Save each failing step to its own CSV file
                        for step_num, step_info in failed_steps:
                            try:
                                failing_rows = validation.get_data_extracts(i=step_num, frame=True)
                                if failing_rows is not None and len(failing_rows) > 0:
                                    # Create safe filename from assertion type
                                    safe_assertion_type = (
                                        step_info.assertion_type.replace(" ", "_")
                                        .replace("/", "_")
                                        .replace("\\", "_")
                                        .replace(":", "_")
                                        .replace("<", "_")
                                        .replace(">", "_")
                                        .replace("|", "_")
                                        .replace("?", "_")
                                        .replace("*", "_")
                                        .replace('"', "_")
                                    )

                                    filename = f"step_{step_num:02d}_{safe_assertion_type}.csv"
                                    filepath = output_folder / filename

                                    # Limit the output if needed
                                    save_rows = failing_rows
                                    if hasattr(failing_rows, "head") and len(failing_rows) > limit:
                                        save_rows = failing_rows.head(limit)

                                    # Save to CSV
                                    if hasattr(save_rows, "write_csv"):
                                        # Polars
                                        save_rows.write_csv(str(filepath))
                                    elif hasattr(save_rows, "to_csv"):
                                        # Pandas
                                        save_rows.to_csv(str(filepath), index=False)
                                    else:
                                        # Try converting to pandas as fallback
                                        import pandas as pd

                                        pd_data = pd.DataFrame(save_rows)
                                        pd_data.to_csv(str(filepath), index=False)

                                    saved_files.append((filename, len(failing_rows)))

                            except Exception as e:
                                console.print(
                                    f"[yellow]Warning: Could not save failing rows from step {step_num}: {e}[/yellow]"
                                )

                        if saved_files:
                            console.print(
                                f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
                            )
                            for filename, row_count in saved_files:
                                console.print(f"[dim] - {filename}: {row_count} rows[/dim]")
                        else:
                            console.print(
                                "[yellow]No failing rows could be extracted to save[/yellow]"
                            )

                    except Exception as e:
                        console.print(
                            f"[yellow]Warning: Could not save failing rows to CSV: {e}[/yellow]"
                        )

        # Save HTML and JSON outputs (combine multiple validations if needed)
        if output_html:
            try:
                if len(validations) == 1:
                    # Single validation - save directly
                    html_content = validations[0]._repr_html_()
                    Path(output_html).write_text(html_content, encoding="utf-8")
                else:
                    # Multiple validations - combine them
                    html_parts = []
                    html_parts.append("<html><body>")
                    html_parts.append("<h1>Pointblank Validation Report</h1>")

                    for i, validation in enumerate(validations, 1):
                        html_parts.append(f"<h2>Validation {i}</h2>")
                        html_parts.append(validation._repr_html_())

                    html_parts.append("</body></html>")
                    html_content = "\n".join(html_parts)
                    Path(output_html).write_text(html_content, encoding="utf-8")

                console.print(f"[green]✓[/green] HTML report saved to: {output_html}")
            except Exception as e:
                console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")

        if output_json:
            try:
                if len(validations) == 1:
                    # Single validation - save directly
                    json_report = validations[0].get_json_report()
                    Path(output_json).write_text(json_report, encoding="utf-8")
                else:
                    # Multiple validations - combine them
                    import json

                    combined_report = {"validations": []}

                    for i, validation in enumerate(validations, 1):
                        validation_json = json.loads(validation.get_json_report())
                        validation_json["validation_id"] = i
                        combined_report["validations"].append(validation_json)

                    Path(output_json).write_text(
                        json.dumps(combined_report, indent=2), encoding="utf-8"
                    )

                console.print(f"[green]✓[/green] JSON summary saved to: {output_json}")
            except Exception as e:
                console.print(f"[yellow]Warning: Could not save JSON report: {e}[/yellow]")

        # Check if we should fail based on threshold
        if fail_on:
            should_exit = False
            exit_reason = ""

            if fail_on.lower() == "critical" and overall_critical:
                should_exit = True
                exit_reason = "critical validation failures"
            elif fail_on.lower() == "error" and (overall_critical or overall_error):
                should_exit = True
                exit_reason = "error or critical validation failures"
            elif fail_on.lower() == "warning" and (
                overall_critical or overall_error or overall_warning
            ):
                should_exit = True
                exit_reason = "warning, error, or critical validation failures"
            elif fail_on.lower() == "any" and overall_failed:
                should_exit = True
                exit_reason = "validation failures"

            if should_exit:
                console.print(f"[red]Exiting with error due to {exit_reason}[/red]")
                sys.exit(1)

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)

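Note: the `--fail-on` cascade above encodes a strict severity ordering, critical > error > warning > any, where each level also fires on anything more severe. A compact restatement of the same decision (illustrative only; `_should_fail` is a hypothetical name, and the flags correspond to the overall_* booleans computed in `run`):

def _should_fail(fail_on: str, critical: bool, error: bool, warning: bool, failed: bool) -> bool:
    # Each level fires on its own condition plus every more-severe one.
    level = fail_on.lower()
    if level == "critical":
        return critical
    if level == "error":
        return critical or error
    if level == "warning":
        return critical or error or warning
    if level == "any":
        return failed
    return False
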
def _format_missing_percentage(value: float) -> str:
    """Format missing value percentages for display.

    Args:
        value: The percentage value (0-100)

    Returns:
        Formatted string with proper percentage display
    """
    if value == 0.0:
        return "[green]●[/green]"  # Large green circle for no missing values
    elif value == 100.0:
        return "[red]●[/red]"  # Large red circle for completely missing values
    elif value < 1.0 and value > 0:
        return "<1%"  # Less than 1%
    elif value > 99.0 and value < 100.0:
        return ">99%"  # More than 99%
    else:
        return f"{int(round(value))}%"  # Round to nearest integer with % sign