pointblank 0.11.2__py3-none-any.whl → 0.11.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/cli.py +1456 -172
- {pointblank-0.11.2.dist-info → pointblank-0.11.3.dist-info}/METADATA +48 -1
- {pointblank-0.11.2.dist-info → pointblank-0.11.3.dist-info}/RECORD +7 -7
- {pointblank-0.11.2.dist-info → pointblank-0.11.3.dist-info}/WHEEL +0 -0
- {pointblank-0.11.2.dist-info → pointblank-0.11.3.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.2.dist-info → pointblank-0.11.3.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.2.dist-info → pointblank-0.11.3.dist-info}/top_level.txt +0 -0
pointblank/cli.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import copy
|
|
4
|
+
import os
|
|
5
|
+
import shutil
|
|
4
6
|
import sys
|
|
5
7
|
from pathlib import Path
|
|
6
8
|
from typing import Any
|
|
@@ -32,6 +34,8 @@ class OrderedGroup(click.Group):
|
|
|
32
34
|
"validate",
|
|
33
35
|
"run",
|
|
34
36
|
"make-template",
|
|
37
|
+
# Data Manipulation
|
|
38
|
+
"pl",
|
|
35
39
|
# Utilities
|
|
36
40
|
"datasets",
|
|
37
41
|
"requirements",
|
|
@@ -91,6 +95,15 @@ def _load_data_source(data_source: str) -> Any:
|
|
|
91
95
|
return _process_data(data_source)
|
|
92
96
|
|
|
93
97
|
|
|
98
|
+
def _is_piped_data_source(data_source: str) -> bool:
|
|
99
|
+
"""Check if the data source is from a piped pb command."""
|
|
100
|
+
return (
|
|
101
|
+
data_source
|
|
102
|
+
and ("pb_pipe_" in data_source)
|
|
103
|
+
and (data_source.startswith("/var/folders/") or data_source.startswith("/tmp/"))
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
|
|
94
107
|
def _format_cell_value(
|
|
95
108
|
value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
|
|
96
109
|
) -> str:
|
|
@@ -558,9 +571,12 @@ def _rich_print_gt_table(
|
|
|
558
571
|
gt_table: The GT table object to display
|
|
559
572
|
preview_info: Optional dict with preview context info:
|
|
560
573
|
- total_rows: Total rows in the dataset
|
|
574
|
+
- total_columns: Total columns in the dataset
|
|
561
575
|
- head_rows: Number of head rows shown
|
|
562
576
|
- tail_rows: Number of tail rows shown
|
|
563
577
|
- is_complete: Whether the entire dataset is shown
|
|
578
|
+
- source_type: Type of data source (e.g., "External source: worldcities_new.csv")
|
|
579
|
+
- table_type: Type of table (e.g., "polars")
|
|
564
580
|
show_summary: Whether to show the row count summary at the bottom
|
|
565
581
|
"""
|
|
566
582
|
try:
|
|
@@ -593,6 +609,12 @@ def _rich_print_gt_table(
|
|
|
593
609
|
table_type = preview_info["table_type"]
|
|
594
610
|
table_title = f"Data Preview / {source_type} / {table_type}"
|
|
595
611
|
|
|
612
|
+
# Add dimensions subtitle in gray if available
|
|
613
|
+
total_rows = preview_info.get("total_rows")
|
|
614
|
+
total_columns = preview_info.get("total_columns")
|
|
615
|
+
if total_rows is not None and total_columns is not None:
|
|
616
|
+
table_title += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
617
|
+
|
|
596
618
|
rich_table = Table(
|
|
597
619
|
title=table_title,
|
|
598
620
|
show_header=True,
|
|
@@ -1209,20 +1231,31 @@ def _display_validation_summary(validation: Any) -> None:
|
|
|
1209
1231
|
|
|
1210
1232
|
|
|
1211
1233
|
@click.group(cls=OrderedGroup)
|
|
1212
|
-
@click.version_option(
|
|
1234
|
+
@click.version_option(pb.__version__, "-v", "--version", prog_name="pb")
|
|
1235
|
+
@click.help_option("-h", "--help")
|
|
1213
1236
|
def cli():
|
|
1214
1237
|
"""
|
|
1215
1238
|
Pointblank CLI: Data validation and quality tools for data engineers.
|
|
1216
1239
|
|
|
1217
|
-
Use this CLI to
|
|
1218
|
-
|
|
1240
|
+
Use this CLI to validate data quality, explore datasets, and generate comprehensive
|
|
1241
|
+
reports for CSV, Parquet, and database sources. Suitable for data pipelines, ETL
|
|
1242
|
+
validation, and exploratory data analysis from the command line.
|
|
1243
|
+
|
|
1244
|
+
Quick Examples:
|
|
1245
|
+
|
|
1246
|
+
\b
|
|
1247
|
+
pb preview data.csv Preview your data
|
|
1248
|
+
pb scan data.csv Generate data profile
|
|
1249
|
+
pb validate data.csv Run basic validation
|
|
1250
|
+
|
|
1251
|
+
Use pb COMMAND --help for detailed help on any command.
|
|
1219
1252
|
"""
|
|
1220
1253
|
pass
|
|
1221
1254
|
|
|
1222
1255
|
|
|
1223
1256
|
@cli.command()
|
|
1224
|
-
@click.argument("data_source", type=str)
|
|
1225
|
-
def info(data_source: str):
|
|
1257
|
+
@click.argument("data_source", type=str, required=False)
|
|
1258
|
+
def info(data_source: str | None):
|
|
1226
1259
|
"""
|
|
1227
1260
|
Display information about a data source.
|
|
1228
1261
|
|
|
@@ -1238,6 +1271,11 @@ def info(data_source: str):
|
|
|
1238
1271
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1239
1272
|
"""
|
|
1240
1273
|
try:
|
|
1274
|
+
# Handle missing data_source with concise help
|
|
1275
|
+
if data_source is None:
|
|
1276
|
+
_show_concise_help("info", None)
|
|
1277
|
+
return
|
|
1278
|
+
|
|
1241
1279
|
with console.status("[bold green]Loading data..."):
|
|
1242
1280
|
# Load the data source using the centralized function
|
|
1243
1281
|
data = _load_data_source(data_source)
|
|
@@ -1276,21 +1314,21 @@ def info(data_source: str):
|
|
|
1276
1314
|
|
|
1277
1315
|
|
|
1278
1316
|
@cli.command()
|
|
1279
|
-
@click.argument("data_source", type=str)
|
|
1280
|
-
@click.option("--columns",
|
|
1317
|
+
@click.argument("data_source", type=str, required=False)
|
|
1318
|
+
@click.option("--columns", help="Comma-separated list of columns to display")
|
|
1281
1319
|
@click.option("--col-range", help="Column range like '1:10' or '5:' or ':15' (1-based indexing)")
|
|
1282
1320
|
@click.option("--col-first", type=int, help="Show first N columns")
|
|
1283
1321
|
@click.option("--col-last", type=int, help="Show last N columns")
|
|
1284
|
-
@click.option("--head",
|
|
1285
|
-
@click.option("--tail",
|
|
1286
|
-
@click.option("--limit",
|
|
1322
|
+
@click.option("--head", default=5, help="Number of rows from the top (default: 5)")
|
|
1323
|
+
@click.option("--tail", default=5, help="Number of rows from the bottom (default: 5)")
|
|
1324
|
+
@click.option("--limit", default=50, help="Maximum total rows to display (default: 50)")
|
|
1287
1325
|
@click.option("--no-row-numbers", is_flag=True, help="Hide row numbers")
|
|
1288
1326
|
@click.option("--max-col-width", default=250, help="Maximum column width in pixels (default: 250)")
|
|
1289
1327
|
@click.option("--min-table-width", default=500, help="Minimum table width in pixels (default: 500)")
|
|
1290
1328
|
@click.option("--no-header", is_flag=True, help="Hide table header")
|
|
1291
1329
|
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
|
|
1292
1330
|
def preview(
|
|
1293
|
-
data_source: str,
|
|
1331
|
+
data_source: str | None,
|
|
1294
1332
|
columns: str | None,
|
|
1295
1333
|
col_range: str | None,
|
|
1296
1334
|
col_first: int | None,
|
|
@@ -1315,6 +1353,7 @@ def preview(
|
|
|
1315
1353
|
- GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
|
|
1316
1354
|
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1317
1355
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1356
|
+
- Piped data from pb pl command
|
|
1318
1357
|
|
|
1319
1358
|
COLUMN SELECTION OPTIONS:
|
|
1320
1359
|
|
|
@@ -1329,11 +1368,52 @@ def preview(
|
|
|
1329
1368
|
Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
|
|
1330
1369
|
"""
|
|
1331
1370
|
try:
|
|
1371
|
+
import sys
|
|
1372
|
+
|
|
1373
|
+
# Handle piped input
|
|
1374
|
+
if data_source is None:
|
|
1375
|
+
if not sys.stdin.isatty():
|
|
1376
|
+
# Data is being piped in - read the file path from stdin
|
|
1377
|
+
piped_input = sys.stdin.read().strip()
|
|
1378
|
+
if piped_input:
|
|
1379
|
+
data_source = piped_input
|
|
1380
|
+
|
|
1381
|
+
# Determine the format from the file extension
|
|
1382
|
+
if piped_input.endswith(".parquet"):
|
|
1383
|
+
format_type = "Parquet"
|
|
1384
|
+
elif piped_input.endswith(".csv"):
|
|
1385
|
+
format_type = "CSV"
|
|
1386
|
+
else:
|
|
1387
|
+
format_type = "unknown"
|
|
1388
|
+
|
|
1389
|
+
console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
|
|
1390
|
+
else:
|
|
1391
|
+
console.print("[red]Error:[/red] No data provided via pipe")
|
|
1392
|
+
sys.exit(1)
|
|
1393
|
+
else:
|
|
1394
|
+
# Show concise help and exit
|
|
1395
|
+
_show_concise_help("preview", None)
|
|
1396
|
+
return
|
|
1397
|
+
|
|
1332
1398
|
with console.status("[bold green]Loading data..."):
|
|
1333
1399
|
# Load the data source using the centralized function
|
|
1334
1400
|
data = _load_data_source(data_source)
|
|
1335
1401
|
|
|
1336
|
-
|
|
1402
|
+
# Check if this is a piped data source and create friendly display name
|
|
1403
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
1404
|
+
|
|
1405
|
+
if is_piped_data:
|
|
1406
|
+
if data_source.endswith(".parquet"):
|
|
1407
|
+
display_source = "Parquet file via `pb pl`"
|
|
1408
|
+
elif data_source.endswith(".csv"):
|
|
1409
|
+
display_source = "CSV file via `pb pl`"
|
|
1410
|
+
else:
|
|
1411
|
+
display_source = "File via `pb pl`"
|
|
1412
|
+
console.print(
|
|
1413
|
+
f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
|
|
1414
|
+
)
|
|
1415
|
+
else:
|
|
1416
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1337
1417
|
|
|
1338
1418
|
# Parse columns if provided
|
|
1339
1419
|
columns_list = None
|
|
@@ -1355,7 +1435,7 @@ def preview(
|
|
|
1355
1435
|
# If _row_num_ exists in data but not in user selection, add it at beginning
|
|
1356
1436
|
if all_columns and "_row_num_" in all_columns and "_row_num_" not in columns_list:
|
|
1357
1437
|
columns_list = ["_row_num_"] + columns_list
|
|
1358
|
-
except Exception:
|
|
1438
|
+
except Exception:
|
|
1359
1439
|
# If we can't process the data, just use the user's column list as-is
|
|
1360
1440
|
pass
|
|
1361
1441
|
elif col_range or col_first or col_last:
|
|
@@ -1430,7 +1510,14 @@ def preview(
|
|
|
1430
1510
|
total_dataset_columns = pb.get_column_count(processed_data)
|
|
1431
1511
|
|
|
1432
1512
|
# Determine source type and table type for enhanced preview title
|
|
1433
|
-
if
|
|
1513
|
+
if is_piped_data:
|
|
1514
|
+
if data_source.endswith(".parquet"):
|
|
1515
|
+
source_type = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
1516
|
+
elif data_source.endswith(".csv"):
|
|
1517
|
+
source_type = "Polars expression (serialized to CSV) from `pb pl`"
|
|
1518
|
+
else:
|
|
1519
|
+
source_type = "Polars expression from `pb pl`"
|
|
1520
|
+
elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
1434
1521
|
source_type = f"Pointblank dataset: {data_source}"
|
|
1435
1522
|
else:
|
|
1436
1523
|
source_type = f"External source: {data_source}"
|
|
@@ -1480,17 +1567,17 @@ def preview(
|
|
|
1480
1567
|
|
|
1481
1568
|
_rich_print_gt_table(gt_table, preview_info)
|
|
1482
1569
|
|
|
1483
|
-
except Exception as e:
|
|
1570
|
+
except Exception as e:
|
|
1484
1571
|
console.print(f"[red]Error:[/red] {e}")
|
|
1485
|
-
sys.exit(1)
|
|
1572
|
+
sys.exit(1)
|
|
1486
1573
|
|
|
1487
1574
|
|
|
1488
1575
|
@cli.command()
|
|
1489
|
-
@click.argument("data_source", type=str)
|
|
1576
|
+
@click.argument("data_source", type=str, required=False)
|
|
1490
1577
|
@click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
|
|
1491
1578
|
@click.option("--columns", "-c", help="Comma-separated list of columns to scan")
|
|
1492
1579
|
def scan(
|
|
1493
|
-
data_source: str,
|
|
1580
|
+
data_source: str | None,
|
|
1494
1581
|
output_html: str | None,
|
|
1495
1582
|
columns: str | None,
|
|
1496
1583
|
):
|
|
@@ -1513,17 +1600,58 @@ def scan(
|
|
|
1513
1600
|
- GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
|
|
1514
1601
|
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1515
1602
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1603
|
+
- Piped data from pb pl command
|
|
1516
1604
|
"""
|
|
1517
1605
|
try:
|
|
1606
|
+
import sys
|
|
1518
1607
|
import time
|
|
1519
1608
|
|
|
1520
1609
|
start_time = time.time()
|
|
1521
1610
|
|
|
1611
|
+
# Handle piped input
|
|
1612
|
+
if data_source is None:
|
|
1613
|
+
if not sys.stdin.isatty():
|
|
1614
|
+
# Data is being piped in - read the file path from stdin
|
|
1615
|
+
piped_input = sys.stdin.read().strip()
|
|
1616
|
+
if piped_input:
|
|
1617
|
+
data_source = piped_input
|
|
1618
|
+
|
|
1619
|
+
# Determine the format from the file extension
|
|
1620
|
+
if piped_input.endswith(".parquet"):
|
|
1621
|
+
format_type = "Parquet"
|
|
1622
|
+
elif piped_input.endswith(".csv"):
|
|
1623
|
+
format_type = "CSV"
|
|
1624
|
+
else:
|
|
1625
|
+
format_type = "unknown"
|
|
1626
|
+
|
|
1627
|
+
console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
|
|
1628
|
+
else:
|
|
1629
|
+
console.print("[red]Error:[/red] No data provided via pipe")
|
|
1630
|
+
sys.exit(1)
|
|
1631
|
+
else:
|
|
1632
|
+
# Show concise help and exit
|
|
1633
|
+
_show_concise_help("scan", None)
|
|
1634
|
+
return
|
|
1635
|
+
|
|
1522
1636
|
with console.status("[bold green]Loading data..."):
|
|
1523
1637
|
# Load the data source using the centralized function
|
|
1524
1638
|
data = _load_data_source(data_source)
|
|
1525
1639
|
|
|
1526
|
-
|
|
1640
|
+
# Check if this is a piped data source and create friendly display name
|
|
1641
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
1642
|
+
|
|
1643
|
+
if is_piped_data:
|
|
1644
|
+
if data_source.endswith(".parquet"):
|
|
1645
|
+
display_source = "Parquet file via `pb pl`"
|
|
1646
|
+
elif data_source.endswith(".csv"):
|
|
1647
|
+
display_source = "CSV file via `pb pl`"
|
|
1648
|
+
else:
|
|
1649
|
+
display_source = "File via `pb pl`"
|
|
1650
|
+
console.print(
|
|
1651
|
+
f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
|
|
1652
|
+
)
|
|
1653
|
+
else:
|
|
1654
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1527
1655
|
|
|
1528
1656
|
# Parse columns if provided
|
|
1529
1657
|
columns_list = None
|
|
@@ -1536,7 +1664,15 @@ def scan(
|
|
|
1536
1664
|
# Data is already processed by _load_data_source
|
|
1537
1665
|
scan_result = pb.col_summary_tbl(data=data)
|
|
1538
1666
|
|
|
1539
|
-
|
|
1667
|
+
# Create friendly source type for display
|
|
1668
|
+
if is_piped_data:
|
|
1669
|
+
if data_source.endswith(".parquet"):
|
|
1670
|
+
source_type = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
1671
|
+
elif data_source.endswith(".csv"):
|
|
1672
|
+
source_type = "Polars expression (serialized to CSV) from `pb pl`"
|
|
1673
|
+
else:
|
|
1674
|
+
source_type = "Polars expression from `pb pl`"
|
|
1675
|
+
elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
1540
1676
|
source_type = f"Pointblank dataset: {data_source}"
|
|
1541
1677
|
else:
|
|
1542
1678
|
source_type = f"External source: {data_source}"
|
|
@@ -1568,7 +1704,12 @@ def scan(
|
|
|
1568
1704
|
# Display detailed column summary using rich formatting
|
|
1569
1705
|
try:
|
|
1570
1706
|
_rich_print_scan_table(
|
|
1571
|
-
scan_result,
|
|
1707
|
+
scan_result,
|
|
1708
|
+
display_source if is_piped_data else data_source,
|
|
1709
|
+
source_type,
|
|
1710
|
+
table_type,
|
|
1711
|
+
total_rows,
|
|
1712
|
+
total_columns,
|
|
1572
1713
|
)
|
|
1573
1714
|
|
|
1574
1715
|
except Exception as e:
|
|
@@ -1580,9 +1721,9 @@ def scan(
|
|
|
1580
1721
|
|
|
1581
1722
|
|
|
1582
1723
|
@cli.command()
|
|
1583
|
-
@click.argument("data_source", type=str)
|
|
1724
|
+
@click.argument("data_source", type=str, required=False)
|
|
1584
1725
|
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
|
|
1585
|
-
def missing(data_source: str, output_html: str | None):
|
|
1726
|
+
def missing(data_source: str | None, output_html: str | None):
|
|
1586
1727
|
"""
|
|
1587
1728
|
Generate a missing values report for a data table.
|
|
1588
1729
|
|
|
@@ -1594,13 +1735,55 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1594
1735
|
- GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
|
|
1595
1736
|
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1596
1737
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1738
|
+
- Piped data from pb pl command
|
|
1597
1739
|
"""
|
|
1598
1740
|
try:
|
|
1741
|
+
import sys
|
|
1742
|
+
|
|
1743
|
+
# Handle piped input
|
|
1744
|
+
if data_source is None:
|
|
1745
|
+
if not sys.stdin.isatty():
|
|
1746
|
+
# Data is being piped in - read the file path from stdin
|
|
1747
|
+
piped_input = sys.stdin.read().strip()
|
|
1748
|
+
if piped_input:
|
|
1749
|
+
data_source = piped_input
|
|
1750
|
+
|
|
1751
|
+
# Determine the format from the file extension
|
|
1752
|
+
if piped_input.endswith(".parquet"):
|
|
1753
|
+
format_type = "Parquet"
|
|
1754
|
+
elif piped_input.endswith(".csv"):
|
|
1755
|
+
format_type = "CSV"
|
|
1756
|
+
else:
|
|
1757
|
+
format_type = "unknown"
|
|
1758
|
+
|
|
1759
|
+
console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
|
|
1760
|
+
else:
|
|
1761
|
+
console.print("[red]Error:[/red] No data provided via pipe")
|
|
1762
|
+
sys.exit(1)
|
|
1763
|
+
else:
|
|
1764
|
+
# Show concise help and exit
|
|
1765
|
+
_show_concise_help("missing", None)
|
|
1766
|
+
return
|
|
1767
|
+
|
|
1599
1768
|
with console.status("[bold green]Loading data..."):
|
|
1600
1769
|
# Load the data source using the centralized function
|
|
1601
1770
|
data = _load_data_source(data_source)
|
|
1602
1771
|
|
|
1603
|
-
|
|
1772
|
+
# Check if this is a piped data source and create friendly display name
|
|
1773
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
1774
|
+
|
|
1775
|
+
if is_piped_data:
|
|
1776
|
+
if data_source.endswith(".parquet"):
|
|
1777
|
+
display_source = "Parquet file via `pb pl`"
|
|
1778
|
+
elif data_source.endswith(".csv"):
|
|
1779
|
+
display_source = "CSV file via `pb pl`"
|
|
1780
|
+
else:
|
|
1781
|
+
display_source = "File via `pb pl`"
|
|
1782
|
+
console.print(
|
|
1783
|
+
f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
|
|
1784
|
+
)
|
|
1785
|
+
else:
|
|
1786
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1604
1787
|
|
|
1605
1788
|
# Generate missing values table
|
|
1606
1789
|
with console.status("[bold green]Analyzing missing values..."):
|
|
@@ -1616,7 +1799,38 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1616
1799
|
console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
|
|
1617
1800
|
else:
|
|
1618
1801
|
# Display in terminal with special missing values formatting
|
|
1619
|
-
|
|
1802
|
+
# Create enhanced context info for missing table display
|
|
1803
|
+
missing_info = {}
|
|
1804
|
+
try:
|
|
1805
|
+
# Determine source type and table type for enhanced preview title
|
|
1806
|
+
if is_piped_data:
|
|
1807
|
+
if data_source.endswith(".parquet"):
|
|
1808
|
+
source_type = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
1809
|
+
elif data_source.endswith(".csv"):
|
|
1810
|
+
source_type = "Polars expression (serialized to CSV) from `pb pl`"
|
|
1811
|
+
else:
|
|
1812
|
+
source_type = "Polars expression from `pb pl`"
|
|
1813
|
+
elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
1814
|
+
source_type = f"Pointblank dataset: {data_source}"
|
|
1815
|
+
else:
|
|
1816
|
+
source_type = f"External source: {data_source}"
|
|
1817
|
+
|
|
1818
|
+
missing_info = {
|
|
1819
|
+
"source_type": source_type,
|
|
1820
|
+
"table_type": _get_tbl_type(original_data),
|
|
1821
|
+
"total_rows": pb.get_row_count(original_data),
|
|
1822
|
+
"total_columns": pb.get_column_count(original_data),
|
|
1823
|
+
}
|
|
1824
|
+
except Exception:
|
|
1825
|
+
# Use defaults if metadata extraction fails
|
|
1826
|
+
missing_info = {
|
|
1827
|
+
"source_type": f"Data source: {data_source}",
|
|
1828
|
+
"table_type": "unknown",
|
|
1829
|
+
"total_rows": None,
|
|
1830
|
+
"total_columns": None,
|
|
1831
|
+
}
|
|
1832
|
+
|
|
1833
|
+
_rich_print_missing_table_enhanced(gt_table, original_data, missing_info)
|
|
1620
1834
|
|
|
1621
1835
|
except Exception as e:
|
|
1622
1836
|
console.print(f"[red]Error:[/red] {e}")
|
|
@@ -1741,6 +1955,8 @@ def validate(
|
|
|
1741
1955
|
pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
|
|
1742
1956
|
"""
|
|
1743
1957
|
try:
|
|
1958
|
+
import sys
|
|
1959
|
+
|
|
1744
1960
|
# Handle --list-checks option early (doesn't need data source)
|
|
1745
1961
|
if list_checks:
|
|
1746
1962
|
console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
|
|
@@ -1797,13 +2013,31 @@ def validate(
|
|
|
1797
2013
|
sys.exit(0)
|
|
1798
2014
|
|
|
1799
2015
|
# Check if data_source is provided (required for all operations except --list-checks)
|
|
2016
|
+
# or if we have piped input
|
|
1800
2017
|
if data_source is None:
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
2018
|
+
# Check if we have piped input
|
|
2019
|
+
if not sys.stdin.isatty():
|
|
2020
|
+
# Data is being piped in: read the file path from stdin
|
|
2021
|
+
piped_input = sys.stdin.read().strip()
|
|
2022
|
+
if piped_input:
|
|
2023
|
+
data_source = piped_input
|
|
2024
|
+
|
|
2025
|
+
# Determine the format from the file extension
|
|
2026
|
+
if piped_input.endswith(".parquet"):
|
|
2027
|
+
format_type = "Parquet"
|
|
2028
|
+
elif piped_input.endswith(".csv"):
|
|
2029
|
+
format_type = "CSV"
|
|
2030
|
+
else:
|
|
2031
|
+
format_type = "unknown"
|
|
1805
2032
|
|
|
1806
|
-
|
|
2033
|
+
console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
|
|
2034
|
+
else:
|
|
2035
|
+
console.print("[red]Error:[/red] No data provided via pipe")
|
|
2036
|
+
sys.exit(1)
|
|
2037
|
+
else:
|
|
2038
|
+
# Show concise help and exit
|
|
2039
|
+
_show_concise_help("validate", None)
|
|
2040
|
+
return
|
|
1807
2041
|
|
|
1808
2042
|
# Handle backward compatibility and parameter conversion
|
|
1809
2043
|
import sys
|
|
@@ -1911,7 +2145,25 @@ def validate(
|
|
|
1911
2145
|
checks_list, columns_list, sets_list, values_list
|
|
1912
2146
|
)
|
|
1913
2147
|
|
|
1914
|
-
|
|
2148
|
+
# Check if this is a piped data source and create friendly display name
|
|
2149
|
+
is_piped_data = (
|
|
2150
|
+
data_source
|
|
2151
|
+
and data_source.startswith("/var/folders/")
|
|
2152
|
+
and ("pb_pipe_" in data_source or "/T/" in data_source)
|
|
2153
|
+
)
|
|
2154
|
+
|
|
2155
|
+
if is_piped_data:
|
|
2156
|
+
if data_source.endswith(".parquet"):
|
|
2157
|
+
display_source = "Parquet file via `pb pl`"
|
|
2158
|
+
elif data_source.endswith(".csv"):
|
|
2159
|
+
display_source = "CSV file via `pb pl`"
|
|
2160
|
+
else:
|
|
2161
|
+
display_source = "File via `pb pl`"
|
|
2162
|
+
console.print(
|
|
2163
|
+
f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
|
|
2164
|
+
)
|
|
2165
|
+
else:
|
|
2166
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1915
2167
|
|
|
1916
2168
|
# Build a single validation object with chained checks
|
|
1917
2169
|
with console.status(f"[bold green]Running {len(checks_list)} validation check(s)..."):
|
|
@@ -2134,136 +2386,339 @@ def requirements():
|
|
|
2134
2386
|
console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
|
|
2135
2387
|
|
|
2136
2388
|
|
|
2137
|
-
def
|
|
2138
|
-
|
|
2139
|
-
data_source: str,
|
|
2140
|
-
source_type: str,
|
|
2141
|
-
table_type: str,
|
|
2142
|
-
total_rows: int | None = None,
|
|
2143
|
-
total_columns: int | None = None,
|
|
2389
|
+
def _rich_print_missing_table_enhanced(
|
|
2390
|
+
gt_table: Any, original_data: Any = None, missing_info: dict = None
|
|
2144
2391
|
) -> None:
|
|
2145
|
-
"""
|
|
2146
|
-
Display scan results as a Rich table in the terminal with statistical measures.
|
|
2392
|
+
"""Convert a missing values GT table to Rich table with enhanced formatting and metadata.
|
|
2147
2393
|
|
|
2148
2394
|
Args:
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
table_type: Type of table (e.g., "polars.LazyFrame")
|
|
2153
|
-
total_rows: Total number of rows in the dataset
|
|
2154
|
-
total_columns: Total number of columns in the dataset
|
|
2395
|
+
gt_table: The GT table object for missing values
|
|
2396
|
+
original_data: The original data source to extract column types
|
|
2397
|
+
missing_info: Dict with metadata including source_type, table_type, total_rows, total_columns
|
|
2155
2398
|
"""
|
|
2156
2399
|
try:
|
|
2157
|
-
|
|
2400
|
+
# Extract the underlying data from the GT table
|
|
2401
|
+
df = None
|
|
2158
2402
|
|
|
2159
|
-
|
|
2160
|
-
|
|
2403
|
+
if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
|
|
2404
|
+
df = gt_table._tbl_data
|
|
2405
|
+
elif hasattr(gt_table, "_data") and gt_table._data is not None:
|
|
2406
|
+
df = gt_table._data
|
|
2407
|
+
elif hasattr(gt_table, "data") and gt_table.data is not None:
|
|
2408
|
+
df = gt_table.data
|
|
2161
2409
|
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
gt_data = scan_result._tbl_data
|
|
2410
|
+
if df is not None:
|
|
2411
|
+
from rich.box import SIMPLE_HEAD
|
|
2165
2412
|
|
|
2166
|
-
|
|
2167
|
-
|
|
2413
|
+
# Extract metadata from missing_info or use defaults
|
|
2414
|
+
source_type = "Data source"
|
|
2415
|
+
table_type = "unknown"
|
|
2416
|
+
total_rows = None
|
|
2417
|
+
total_columns = None
|
|
2168
2418
|
|
|
2169
|
-
|
|
2170
|
-
|
|
2419
|
+
if missing_info:
|
|
2420
|
+
source_type = missing_info.get("source_type", "Data source")
|
|
2421
|
+
table_type = missing_info.get("table_type", "unknown")
|
|
2422
|
+
total_rows = missing_info.get("total_rows")
|
|
2423
|
+
total_columns = missing_info.get("total_columns")
|
|
2171
2424
|
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
title_text = f"Column Summary / {source_type} / {table_type}"
|
|
2425
|
+
# Create enhanced title matching the scan table format
|
|
2426
|
+
title_text = f"Missing Values / {source_type} / {table_type}"
|
|
2175
2427
|
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2428
|
+
# Add dimensions subtitle in gray if available
|
|
2429
|
+
if total_rows is not None and total_columns is not None:
|
|
2430
|
+
title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
2179
2431
|
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
|
|
2186
|
-
|
|
2187
|
-
|
|
2188
|
-
|
|
2432
|
+
# Get column names
|
|
2433
|
+
columns = []
|
|
2434
|
+
try:
|
|
2435
|
+
if hasattr(df, "columns"):
|
|
2436
|
+
columns = list(df.columns)
|
|
2437
|
+
elif hasattr(df, "schema"):
|
|
2438
|
+
columns = list(df.schema.names)
|
|
2439
|
+
except Exception as e:
|
|
2440
|
+
console.print(f"[red]Error getting columns:[/red] {e}")
|
|
2441
|
+
columns = []
|
|
2189
2442
|
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
|
|
2193
|
-
scan_table.add_column(
|
|
2194
|
-
"NA", style="red", width=6, justify="right"
|
|
2195
|
-
) # Adjusted for better formatting
|
|
2196
|
-
scan_table.add_column(
|
|
2197
|
-
"UQ", style="green", width=8, justify="right"
|
|
2198
|
-
) # Adjusted for boolean values
|
|
2443
|
+
if not columns:
|
|
2444
|
+
columns = [f"Column {i + 1}" for i in range(10)] # Fallback
|
|
2199
2445
|
|
|
2200
|
-
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
|
|
2207
|
-
|
|
2208
|
-
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
}
|
|
2446
|
+
# Get original data to extract column types
|
|
2447
|
+
column_types = {}
|
|
2448
|
+
if original_data is not None:
|
|
2449
|
+
try:
|
|
2450
|
+
# Get column types from original data
|
|
2451
|
+
if hasattr(original_data, "columns"):
|
|
2452
|
+
original_columns = list(original_data.columns)
|
|
2453
|
+
column_types = _get_column_dtypes(original_data, original_columns)
|
|
2454
|
+
except Exception as e:
|
|
2455
|
+
console.print(f"[red]Error getting column types:[/red] {e}")
|
|
2456
|
+
pass # Use empty dict as fallback
|
|
2212
2457
|
|
|
2213
|
-
|
|
2214
|
-
|
|
2215
|
-
scan_table.add_column(display_name, style=color, width=width, justify="right")
|
|
2216
|
-
stat_columns.append(col_key)
|
|
2458
|
+
# Add columns to Rich table with special formatting for missing values table
|
|
2459
|
+
sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
|
|
2217
2460
|
|
|
2218
|
-
|
|
2219
|
-
|
|
2220
|
-
"
|
|
2221
|
-
# Extract column name from first div
|
|
2222
|
-
name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
|
|
2223
|
-
column_name = name_match.group(1) if name_match else "Unknown"
|
|
2461
|
+
# Print the title first
|
|
2462
|
+
console.print()
|
|
2463
|
+
console.print(f"[bold cyan]{title_text}[/bold cyan]")
|
|
2224
2464
|
|
|
2225
|
-
#
|
|
2226
|
-
|
|
2227
|
-
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
|
|
2231
|
-
data_type = compact_type
|
|
2232
|
-
else:
|
|
2233
|
-
data_type = "unknown"
|
|
2465
|
+
# Show the custom spanner header if we have sector columns
|
|
2466
|
+
if sector_columns:
|
|
2467
|
+
# Create a custom header line that shows the spanner
|
|
2468
|
+
header_parts = []
|
|
2469
|
+
header_parts.append(" " * 20) # Space for Column header
|
|
2470
|
+
header_parts.append(" " * 10) # Space for Type header
|
|
2234
2471
|
|
|
2235
|
-
|
|
2472
|
+
# Left-align "Row Sectors" with the first numbered column
|
|
2473
|
+
row_sectors_text = "Row Sectors"
|
|
2474
|
+
header_parts.append(row_sectors_text)
|
|
2236
2475
|
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
|
|
2240
|
-
) -> str:
|
|
2241
|
-
"""Format values for display with smart number formatting and HTML cleanup."""
|
|
2242
|
-
if value is None or (isinstance(value, str) and value.strip() == ""):
|
|
2243
|
-
return "[dim]—[/dim]"
|
|
2476
|
+
# Print the custom spanner header
|
|
2477
|
+
console.print("[dim]" + " ".join(header_parts) + "[/dim]")
|
|
2244
2478
|
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2479
|
+
# Add a horizontal rule below the spanner
|
|
2480
|
+
rule_parts = []
|
|
2481
|
+
rule_parts.append(" " * 20) # Space for Column header
|
|
2482
|
+
rule_parts.append(" " * 10) # Space for Type header
|
|
2248
2483
|
|
|
2249
|
-
|
|
2250
|
-
|
|
2484
|
+
# Use a fixed width horizontal rule for "Row Sectors"
|
|
2485
|
+
horizontal_rule = "─" * 20
|
|
2486
|
+
rule_parts.append(horizontal_rule)
|
|
2251
2487
|
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
str_val = str_val.split("<br>")[0].strip()
|
|
2255
|
-
# For unique values, we want just the integer part
|
|
2256
|
-
if is_unique:
|
|
2257
|
-
try:
|
|
2258
|
-
# Try to extract just the integer part for unique counts
|
|
2259
|
-
num_val = float(str_val)
|
|
2260
|
-
return str(int(num_val))
|
|
2261
|
-
except (ValueError, TypeError):
|
|
2262
|
-
pass
|
|
2488
|
+
# Print the horizontal rule
|
|
2489
|
+
console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
|
|
2263
2490
|
|
|
2264
|
-
#
|
|
2265
|
-
|
|
2266
|
-
|
|
2491
|
+
# Create the missing values table WITHOUT the title (since we printed it above)
|
|
2492
|
+
rich_table = Table(
|
|
2493
|
+
show_header=True,
|
|
2494
|
+
header_style="bold magenta",
|
|
2495
|
+
box=SIMPLE_HEAD,
|
|
2496
|
+
)
|
|
2497
|
+
|
|
2498
|
+
# Two separate columns: Column name (20 chars) and Data type (10 chars)
|
|
2499
|
+
rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
|
|
2500
|
+
rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
|
|
2501
|
+
|
|
2502
|
+
# Sector columns: All same width, optimized for "100%" (4 chars + padding)
|
|
2503
|
+
for sector in sector_columns:
|
|
2504
|
+
rich_table.add_column(
|
|
2505
|
+
sector,
|
|
2506
|
+
style="cyan",
|
|
2507
|
+
justify="center",
|
|
2508
|
+
no_wrap=True,
|
|
2509
|
+
width=5, # Fixed width optimized for percentage values
|
|
2510
|
+
)
|
|
2511
|
+
|
|
2512
|
+
# Convert data to rows with special formatting
|
|
2513
|
+
rows = []
|
|
2514
|
+
try:
|
|
2515
|
+
if hasattr(df, "to_dicts"):
|
|
2516
|
+
data_dict = df.to_dicts()
|
|
2517
|
+
elif hasattr(df, "to_dict"):
|
|
2518
|
+
data_dict = df.to_dict("records")
|
|
2519
|
+
else:
|
|
2520
|
+
data_dict = []
|
|
2521
|
+
|
|
2522
|
+
for i, row in enumerate(data_dict):
|
|
2523
|
+
try:
|
|
2524
|
+
# Each row should have: [column_name, data_type, sector1, sector2, ...]
|
|
2525
|
+
column_name = str(row.get("columns", ""))
|
|
2526
|
+
|
|
2527
|
+
# Truncate column name to 20 characters with ellipsis if needed
|
|
2528
|
+
if len(column_name) > 20:
|
|
2529
|
+
truncated_name = column_name[:17] + "…"
|
|
2530
|
+
else:
|
|
2531
|
+
truncated_name = column_name
|
|
2532
|
+
|
|
2533
|
+
# Get data type for this column
|
|
2534
|
+
if column_name in column_types:
|
|
2535
|
+
dtype = column_types[column_name]
|
|
2536
|
+
if len(dtype) > 10:
|
|
2537
|
+
truncated_dtype = dtype[:9] + "…"
|
|
2538
|
+
else:
|
|
2539
|
+
truncated_dtype = dtype
|
|
2540
|
+
else:
|
|
2541
|
+
truncated_dtype = "?"
|
|
2542
|
+
|
|
2543
|
+
# Start building the row with column name and type
|
|
2544
|
+
formatted_row = [truncated_name, truncated_dtype]
|
|
2545
|
+
|
|
2546
|
+
# Add sector values (formatted percentages)
|
|
2547
|
+
for sector in sector_columns:
|
|
2548
|
+
value = row.get(sector, 0.0)
|
|
2549
|
+
if isinstance(value, (int, float)):
|
|
2550
|
+
formatted_row.append(_format_missing_percentage(float(value)))
|
|
2551
|
+
else:
|
|
2552
|
+
formatted_row.append(str(value))
|
|
2553
|
+
|
|
2554
|
+
rows.append(formatted_row)
|
|
2555
|
+
|
|
2556
|
+
except Exception as e:
|
|
2557
|
+
console.print(f"[red]Error processing row {i}:[/red] {e}")
|
|
2558
|
+
continue
|
|
2559
|
+
|
|
2560
|
+
except Exception as e:
|
|
2561
|
+
console.print(f"[red]Error extracting data:[/red] {e}")
|
|
2562
|
+
rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
|
|
2563
|
+
|
|
2564
|
+
# Add rows to Rich table
|
|
2565
|
+
for row in rows:
|
|
2566
|
+
try:
|
|
2567
|
+
rich_table.add_row(*row)
|
|
2568
|
+
except Exception as e:
|
|
2569
|
+
console.print(f"[red]Error adding row:[/red] {e}")
|
|
2570
|
+
break
|
|
2571
|
+
|
|
2572
|
+
# Print the Rich table (without title since we already printed it)
|
|
2573
|
+
console.print(rich_table)
|
|
2574
|
+
|
|
2575
|
+
footer_text = (
|
|
2576
|
+
"[dim]Symbols: [green]●[/green] = no missing vals in sector, "
|
|
2577
|
+
"[red]●[/red] = all vals completely missing, "
|
|
2578
|
+
"[cyan]x%[/cyan] = percentage missing[/dim]"
|
|
2579
|
+
)
|
|
2580
|
+
console.print(footer_text)
|
|
2581
|
+
|
|
2582
|
+
else:
|
|
2583
|
+
# Fallback to regular table display
|
|
2584
|
+
_rich_print_gt_table(gt_table)
|
|
2585
|
+
|
|
2586
|
+
except Exception as e:
|
|
2587
|
+
console.print(f"[red]Error rendering missing values table:[/red] {e}")
|
|
2588
|
+
# Fallback to regular table display
|
|
2589
|
+
_rich_print_gt_table(gt_table)
|
|
2590
|
+
|
|
2591
|
+
|
|
2592
|
+
def _rich_print_scan_table(
|
|
2593
|
+
scan_result: Any,
|
|
2594
|
+
data_source: str,
|
|
2595
|
+
source_type: str,
|
|
2596
|
+
table_type: str,
|
|
2597
|
+
total_rows: int | None = None,
|
|
2598
|
+
total_columns: int | None = None,
|
|
2599
|
+
) -> None:
|
|
2600
|
+
"""
|
|
2601
|
+
Display scan results as a Rich table in the terminal with statistical measures.
|
|
2602
|
+
|
|
2603
|
+
Args:
|
|
2604
|
+
scan_result: The GT object from col_summary_tbl()
|
|
2605
|
+
data_source: Name of the data source being scanned
|
|
2606
|
+
source_type: Type of data source (e.g., "Pointblank dataset: small_table")
|
|
2607
|
+
table_type: Type of table (e.g., "polars.LazyFrame")
|
|
2608
|
+
total_rows: Total number of rows in the dataset
|
|
2609
|
+
total_columns: Total number of columns in the dataset
|
|
2610
|
+
"""
|
|
2611
|
+
try:
|
|
2612
|
+
import re
|
|
2613
|
+
|
|
2614
|
+
import narwhals as nw
|
|
2615
|
+
from rich.box import SIMPLE_HEAD
|
|
2616
|
+
|
|
2617
|
+
# Extract the underlying DataFrame from the GT object
|
|
2618
|
+
# The GT object has a _tbl_data attribute that contains the DataFrame
|
|
2619
|
+
gt_data = scan_result._tbl_data
|
|
2620
|
+
|
|
2621
|
+
# Convert to Narwhals DataFrame for consistent handling
|
|
2622
|
+
nw_data = nw.from_native(gt_data)
|
|
2623
|
+
|
|
2624
|
+
# Convert to dictionary for easier access
|
|
2625
|
+
data_dict = nw_data.to_dict(as_series=False)
|
|
2626
|
+
|
|
2627
|
+
# Create main scan table with missing data table styling
|
|
2628
|
+
# Create a comprehensive title with data source, source type, and table type
|
|
2629
|
+
title_text = f"Column Summary / {source_type} / {table_type}"
|
|
2630
|
+
|
|
2631
|
+
# Add dimensions subtitle in gray if available
|
|
2632
|
+
if total_rows is not None and total_columns is not None:
|
|
2633
|
+
title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
2634
|
+
|
|
2635
|
+
# Create the scan table
|
|
2636
|
+
scan_table = Table(
|
|
2637
|
+
title=title_text,
|
|
2638
|
+
show_header=True,
|
|
2639
|
+
header_style="bold magenta",
|
|
2640
|
+
box=SIMPLE_HEAD,
|
|
2641
|
+
title_style="bold cyan",
|
|
2642
|
+
title_justify="left",
|
|
2643
|
+
)
|
|
2644
|
+
|
|
2645
|
+
# Add columns with specific styling and appropriate widths
|
|
2646
|
+
scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
|
|
2647
|
+
scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
|
|
2648
|
+
scan_table.add_column(
|
|
2649
|
+
"NA", style="red", width=6, justify="right"
|
|
2650
|
+
) # Adjusted for better formatting
|
|
2651
|
+
scan_table.add_column(
|
|
2652
|
+
"UQ", style="green", width=8, justify="right"
|
|
2653
|
+
) # Adjusted for boolean values
|
|
2654
|
+
|
|
2655
|
+
# Add statistical columns if they exist with appropriate widths
|
|
2656
|
+
stat_columns = []
|
|
2657
|
+
column_mapping = {
|
|
2658
|
+
"mean": ("Mean", "blue", 9),
|
|
2659
|
+
"std": ("SD", "blue", 9),
|
|
2660
|
+
"min": ("Min", "yellow", 9),
|
|
2661
|
+
"median": ("Med", "yellow", 9),
|
|
2662
|
+
"max": ("Max", "yellow", 9),
|
|
2663
|
+
"q_1": ("Q₁", "magenta", 8),
|
|
2664
|
+
"q_3": ("Q₃", "magenta", 9),
|
|
2665
|
+
"iqr": ("IQR", "magenta", 8),
|
|
2666
|
+
}
|
|
2667
|
+
|
|
2668
|
+
for col_key, (display_name, color, width) in column_mapping.items():
|
|
2669
|
+
if col_key in data_dict:
|
|
2670
|
+
scan_table.add_column(display_name, style=color, width=width, justify="right")
|
|
2671
|
+
stat_columns.append(col_key)
|
|
2672
|
+
|
|
2673
|
+
# Helper function to extract column name and type from HTML
|
|
2674
|
+
def extract_column_info(html_content: str) -> tuple[str, str]:
|
|
2675
|
+
"""Extract column name and type from HTML formatted content."""
|
|
2676
|
+
# Extract column name from first div
|
|
2677
|
+
name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
|
|
2678
|
+
column_name = name_match.group(1) if name_match else "Unknown"
|
|
2679
|
+
|
|
2680
|
+
# Extract data type from second div (with gray color)
|
|
2681
|
+
type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
|
|
2682
|
+
if type_match:
|
|
2683
|
+
data_type = type_match.group(1)
|
|
2684
|
+
# Convert to compact format using the existing function
|
|
2685
|
+
compact_type = _format_dtype_compact(data_type)
|
|
2686
|
+
data_type = compact_type
|
|
2687
|
+
else:
|
|
2688
|
+
data_type = "unknown"
|
|
2689
|
+
|
|
2690
|
+
return column_name, data_type
|
|
2691
|
+
|
|
2692
|
+
# Helper function to format values with improved number formatting
|
|
2693
|
+
def format_value(
|
|
2694
|
+
value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
|
|
2695
|
+
) -> str:
|
|
2696
|
+
"""Format values for display with smart number formatting and HTML cleanup."""
|
|
2697
|
+
if value is None or (isinstance(value, str) and value.strip() == ""):
|
|
2698
|
+
return "[dim]—[/dim]"
|
|
2699
|
+
|
|
2700
|
+
# Handle missing values indicator
|
|
2701
|
+
if is_missing and str(value) == "0":
|
|
2702
|
+
return "[green]●[/green]" # No missing values
|
|
2703
|
+
|
|
2704
|
+
# Clean up HTML formatting from the raw data
|
|
2705
|
+
str_val = str(value)
|
|
2706
|
+
|
|
2707
|
+
# Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
|
|
2708
|
+
if "<br>" in str_val:
|
|
2709
|
+
str_val = str_val.split("<br>")[0].strip()
|
|
2710
|
+
# For unique values, we want just the integer part
|
|
2711
|
+
if is_unique:
|
|
2712
|
+
try:
|
|
2713
|
+
# Try to extract just the integer part for unique counts
|
|
2714
|
+
num_val = float(str_val)
|
|
2715
|
+
return str(int(num_val))
|
|
2716
|
+
except (ValueError, TypeError):
|
|
2717
|
+
pass
|
|
2718
|
+
|
|
2719
|
+
# Now handle HTML content (especially from boolean unique values)
|
|
2720
|
+
if "<" in str_val and ">" in str_val:
|
|
2721
|
+
# Remove HTML tags completely for cleaner display
|
|
2267
2722
|
str_val = re.sub(r"<[^>]+>", "", str_val).strip()
|
|
2268
2723
|
# Clean up extra whitespace
|
|
2269
2724
|
str_val = re.sub(r"\s+", " ", str_val).strip()
|
|
@@ -2423,8 +2878,36 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
|
|
|
2423
2878
|
if df is not None:
|
|
2424
2879
|
from rich.box import SIMPLE_HEAD
|
|
2425
2880
|
|
|
2426
|
-
#
|
|
2427
|
-
|
|
2881
|
+
# Get metadata for enhanced missing table title
|
|
2882
|
+
total_rows = None
|
|
2883
|
+
total_columns = None
|
|
2884
|
+
source_type = "Data source"
|
|
2885
|
+
table_type = "unknown"
|
|
2886
|
+
|
|
2887
|
+
if original_data is not None:
|
|
2888
|
+
try:
|
|
2889
|
+
total_rows = pb.get_row_count(original_data)
|
|
2890
|
+
total_columns = pb.get_column_count(original_data)
|
|
2891
|
+
table_type = _get_tbl_type(original_data)
|
|
2892
|
+
except Exception:
|
|
2893
|
+
pass
|
|
2894
|
+
|
|
2895
|
+
# Create enhanced title matching the scan table format
|
|
2896
|
+
title_text = f"Missing Values / {source_type} / {table_type}"
|
|
2897
|
+
|
|
2898
|
+
# Add dimensions subtitle in gray if available
|
|
2899
|
+
if total_rows is not None and total_columns is not None:
|
|
2900
|
+
title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
2901
|
+
|
|
2902
|
+
# Create the missing values table with enhanced title
|
|
2903
|
+
rich_table = Table(
|
|
2904
|
+
title=title_text,
|
|
2905
|
+
show_header=True,
|
|
2906
|
+
header_style="bold magenta",
|
|
2907
|
+
box=SIMPLE_HEAD,
|
|
2908
|
+
title_style="bold cyan",
|
|
2909
|
+
title_justify="left",
|
|
2910
|
+
)
|
|
2428
2911
|
|
|
2429
2912
|
# Get column names
|
|
2430
2913
|
columns = []
|
|
@@ -2556,12 +3039,12 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
|
|
|
2556
3039
|
console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
|
|
2557
3040
|
|
|
2558
3041
|
# Print the Rich table (will handle terminal width automatically)
|
|
3042
|
+
console.print()
|
|
2559
3043
|
console.print(rich_table)
|
|
2560
3044
|
footer_text = (
|
|
2561
|
-
"[dim]Symbols: [green]●[/green] = no missing
|
|
2562
|
-
"[red]●[/red] = completely missing, "
|
|
2563
|
-
"
|
|
2564
|
-
">99% = more than 99% missing[/dim]"
|
|
3045
|
+
"[dim]Symbols: [green]●[/green] = no missing vals in sector, "
|
|
3046
|
+
"[red]●[/red] = all vals completely missing, "
|
|
3047
|
+
"[cyan]x%[/cyan] = percentage missing[/dim]"
|
|
2565
3048
|
)
|
|
2566
3049
|
console.print(footer_text)
|
|
2567
3050
|
|
|
@@ -2700,6 +3183,20 @@ def _display_validation_result(
|
|
|
2700
3183
|
set_val = sets_list[step_index] if step_index < len(sets_list) else None
|
|
2701
3184
|
value = values_list[step_index] if step_index < len(values_list) else None
|
|
2702
3185
|
|
|
3186
|
+
# Check if this is piped data
|
|
3187
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
3188
|
+
|
|
3189
|
+
# Create friendly display name for data source
|
|
3190
|
+
if is_piped_data:
|
|
3191
|
+
if data_source.endswith(".parquet"):
|
|
3192
|
+
display_source = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
3193
|
+
elif data_source.endswith(".csv"):
|
|
3194
|
+
display_source = "Polars expression (serialized to CSV) from `pb pl`"
|
|
3195
|
+
else:
|
|
3196
|
+
display_source = "Polars expression from `pb pl`"
|
|
3197
|
+
else:
|
|
3198
|
+
display_source = data_source
|
|
3199
|
+
|
|
2703
3200
|
# Get validation step info
|
|
2704
3201
|
step_info = None
|
|
2705
3202
|
if hasattr(validation, "validation_info") and len(validation.validation_info) > step_index:
|
|
@@ -2766,7 +3263,7 @@ def _display_validation_result(
|
|
|
2766
3263
|
result_table.add_column("Value", style="white")
|
|
2767
3264
|
|
|
2768
3265
|
# Add basic info
|
|
2769
|
-
result_table.add_row("Data Source",
|
|
3266
|
+
result_table.add_row("Data Source", display_source)
|
|
2770
3267
|
result_table.add_row("Check Type", check)
|
|
2771
3268
|
|
|
2772
3269
|
# Add column info for column-specific checks
|
|
@@ -3128,6 +3625,18 @@ def _show_extract_and_summary(
|
|
|
3128
3625
|
"""Show extract and summary for a validation step (used for single checks)."""
|
|
3129
3626
|
step_passed = step_info.n_failed == 0 if step_info else True
|
|
3130
3627
|
|
|
3628
|
+
# Get the friendly display name
|
|
3629
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
3630
|
+
if is_piped_data:
|
|
3631
|
+
if data_source.endswith(".parquet"):
|
|
3632
|
+
display_source = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
3633
|
+
elif data_source.endswith(".csv"):
|
|
3634
|
+
display_source = "Polars expression (serialized to CSV) from `pb pl`"
|
|
3635
|
+
else:
|
|
3636
|
+
display_source = "Polars expression from `pb pl`"
|
|
3637
|
+
else:
|
|
3638
|
+
display_source = data_source
|
|
3639
|
+
|
|
3131
3640
|
# Show extract if requested and validation failed
|
|
3132
3641
|
if (show_extract or write_extract) and not step_passed:
|
|
3133
3642
|
console.print()
|
|
@@ -3281,54 +3790,54 @@ def _show_extract_and_summary(
|
|
|
3281
3790
|
if step_passed:
|
|
3282
3791
|
if check == "rows-distinct":
|
|
3283
3792
|
success_message = (
|
|
3284
|
-
f"[green]✓ Validation PASSED: No duplicate rows found in {
|
|
3793
|
+
f"[green]✓ Validation PASSED: No duplicate rows found in {display_source}[/green]"
|
|
3285
3794
|
)
|
|
3286
3795
|
elif check == "col-vals-not-null":
|
|
3287
|
-
success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {
|
|
3796
|
+
success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {display_source}[/green]"
|
|
3288
3797
|
elif check == "rows-complete":
|
|
3289
|
-
success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {
|
|
3798
|
+
success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {display_source}[/green]"
|
|
3290
3799
|
elif check == "col-exists":
|
|
3291
3800
|
success_message = (
|
|
3292
|
-
f"[green]✓ Validation PASSED: Column '{column}' exists in {
|
|
3801
|
+
f"[green]✓ Validation PASSED: Column '{column}' exists in {display_source}[/green]"
|
|
3293
3802
|
)
|
|
3294
3803
|
elif check == "col-vals-in-set":
|
|
3295
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {
|
|
3804
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {display_source}[/green]"
|
|
3296
3805
|
elif check == "col-vals-gt":
|
|
3297
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {
|
|
3806
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {display_source}[/green]"
|
|
3298
3807
|
elif check == "col-vals-ge":
|
|
3299
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {
|
|
3808
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {display_source}[/green]"
|
|
3300
3809
|
elif check == "col-vals-lt":
|
|
3301
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {
|
|
3810
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {display_source}[/green]"
|
|
3302
3811
|
elif check == "col-vals-le":
|
|
3303
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {
|
|
3812
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {display_source}[/green]"
|
|
3304
3813
|
else:
|
|
3305
3814
|
success_message = (
|
|
3306
|
-
f"[green]✓ Validation PASSED: {check} check passed for {
|
|
3815
|
+
f"[green]✓ Validation PASSED: {check} check passed for {display_source}[/green]"
|
|
3307
3816
|
)
|
|
3308
3817
|
|
|
3309
3818
|
console.print(Panel(success_message, border_style="green", expand=False))
|
|
3310
3819
|
else:
|
|
3311
3820
|
if step_info:
|
|
3312
3821
|
if check == "rows-distinct":
|
|
3313
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {
|
|
3822
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {display_source}[/red]"
|
|
3314
3823
|
elif check == "col-vals-not-null":
|
|
3315
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {
|
|
3824
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {display_source}[/red]"
|
|
3316
3825
|
elif check == "rows-complete":
|
|
3317
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {
|
|
3826
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {display_source}[/red]"
|
|
3318
3827
|
elif check == "col-exists":
|
|
3319
|
-
failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {
|
|
3828
|
+
failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {display_source}[/red]"
|
|
3320
3829
|
elif check == "col-vals-in-set":
|
|
3321
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {
|
|
3830
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {display_source}[/red]"
|
|
3322
3831
|
elif check == "col-vals-gt":
|
|
3323
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {
|
|
3832
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {display_source}[/red]"
|
|
3324
3833
|
elif check == "col-vals-ge":
|
|
3325
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {
|
|
3834
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {display_source}[/red]"
|
|
3326
3835
|
elif check == "col-vals-lt":
|
|
3327
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {
|
|
3836
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {display_source}[/red]"
|
|
3328
3837
|
elif check == "col-vals-le":
|
|
3329
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {
|
|
3838
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {display_source}[/red]"
|
|
3330
3839
|
else:
|
|
3331
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {
|
|
3840
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {display_source}[/red]"
|
|
3332
3841
|
|
|
3333
3842
|
# Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
|
|
3334
3843
|
if not show_extract and check != "col-exists":
|
|
@@ -3338,15 +3847,15 @@ def _show_extract_and_summary(
|
|
|
3338
3847
|
else:
|
|
3339
3848
|
if check == "rows-distinct":
|
|
3340
3849
|
failure_message = (
|
|
3341
|
-
f"[red]✗ Validation FAILED: Duplicate rows found in {
|
|
3850
|
+
f"[red]✗ Validation FAILED: Duplicate rows found in {display_source}[/red]"
|
|
3342
3851
|
)
|
|
3343
3852
|
elif check == "rows-complete":
|
|
3344
3853
|
failure_message = (
|
|
3345
|
-
f"[red]✗ Validation FAILED: Incomplete rows found in {
|
|
3854
|
+
f"[red]✗ Validation FAILED: Incomplete rows found in {display_source}[/red]"
|
|
3346
3855
|
)
|
|
3347
3856
|
else:
|
|
3348
3857
|
failure_message = (
|
|
3349
|
-
f"[red]✗ Validation FAILED: {check} check failed for {
|
|
3858
|
+
f"[red]✗ Validation FAILED: {check} check failed for {display_source}[/red]"
|
|
3350
3859
|
)
|
|
3351
3860
|
|
|
3352
3861
|
# Add hint about --show-extract if not already used
|
|
@@ -3357,8 +3866,8 @@ def _show_extract_and_summary(
|
|
|
3357
3866
|
|
|
3358
3867
|
|
|
3359
3868
|
@cli.command()
|
|
3360
|
-
@click.argument("output_file", type=click.Path())
|
|
3361
|
-
def make_template(output_file: str):
|
|
3869
|
+
@click.argument("output_file", type=click.Path(), required=False)
|
|
3870
|
+
def make_template(output_file: str | None):
|
|
3362
3871
|
"""
|
|
3363
3872
|
Create a validation script template.
|
|
3364
3873
|
|
|
@@ -3374,6 +3883,11 @@ def make_template(output_file: str):
|
|
|
3374
3883
|
pb make-template my_validation.py
|
|
3375
3884
|
pb make-template validation_template.py
|
|
3376
3885
|
"""
|
|
3886
|
+
# Handle missing output_file with concise help
|
|
3887
|
+
if output_file is None:
|
|
3888
|
+
_show_concise_help("make-template", None)
|
|
3889
|
+
return
|
|
3890
|
+
|
|
3377
3891
|
example_script = '''"""
|
|
3378
3892
|
Example Pointblank validation script.
|
|
3379
3893
|
|
|
@@ -3437,7 +3951,7 @@ validation = (
|
|
|
3437
3951
|
|
|
3438
3952
|
|
|
3439
3953
|
@cli.command()
|
|
3440
|
-
@click.argument("validation_script", type=click.Path(exists=True))
|
|
3954
|
+
@click.argument("validation_script", type=click.Path(exists=True), required=False)
|
|
3441
3955
|
@click.option(
|
|
3442
3956
|
"--data",
|
|
3443
3957
|
type=str,
|
|
@@ -3462,7 +3976,7 @@ validation = (
|
|
|
3462
3976
|
help="Exit with non-zero code when validation reaches this threshold level",
|
|
3463
3977
|
)
|
|
3464
3978
|
def run(
|
|
3465
|
-
validation_script: str,
|
|
3979
|
+
validation_script: str | None,
|
|
3466
3980
|
data: str | None,
|
|
3467
3981
|
output_html: str | None,
|
|
3468
3982
|
output_json: str | None,
|
|
@@ -3503,6 +4017,11 @@ def run(
|
|
|
3503
4017
|
pb run validation_script.py --write-extract extracts_folder --fail-on critical
|
|
3504
4018
|
"""
|
|
3505
4019
|
try:
|
|
4020
|
+
# Handle missing validation_script with concise help
|
|
4021
|
+
if validation_script is None:
|
|
4022
|
+
_show_concise_help("run", None)
|
|
4023
|
+
return
|
|
4024
|
+
|
|
3506
4025
|
# Load optional data override if provided
|
|
3507
4026
|
cli_data = None
|
|
3508
4027
|
if data:
|
|
@@ -3902,3 +4421,768 @@ def _format_missing_percentage(value: float) -> str:
|
|
|
3902
4421
|
return ">99%" # More than 99%
|
|
3903
4422
|
else:
|
|
3904
4423
|
return f"{int(round(value))}%" # Round to nearest integer with % sign
|
|
4424
|
+
|
|
4425
|
+
|
|
4426
|
+
@cli.command()
|
|
4427
|
+
@click.argument("polars_expression", type=str, required=False)
|
|
4428
|
+
@click.option("--edit", "-e", is_flag=True, help="Open editor for multi-line input")
|
|
4429
|
+
@click.option("--file", "-f", type=click.Path(exists=True), help="Read query from file")
|
|
4430
|
+
@click.option(
|
|
4431
|
+
"--editor", help="Editor to use for --edit mode (overrides $EDITOR and auto-detection)"
|
|
4432
|
+
)
|
|
4433
|
+
@click.option(
|
|
4434
|
+
"--output-format",
|
|
4435
|
+
"-o",
|
|
4436
|
+
type=click.Choice(["preview", "scan", "missing", "info"]),
|
|
4437
|
+
default="preview",
|
|
4438
|
+
help="Output format for the result",
|
|
4439
|
+
)
|
|
4440
|
+
@click.option("--preview-head", default=5, help="Number of head rows for preview")
|
|
4441
|
+
@click.option("--preview-tail", default=5, help="Number of tail rows for preview")
|
|
4442
|
+
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
|
|
4443
|
+
@click.option(
|
|
4444
|
+
"--pipe", is_flag=True, help="Output data in a format suitable for piping to other pb commands"
|
|
4445
|
+
)
|
|
4446
|
+
@click.option(
|
|
4447
|
+
"--pipe-format",
|
|
4448
|
+
type=click.Choice(["parquet", "csv"]),
|
|
4449
|
+
default="parquet",
|
|
4450
|
+
help="Format for piped output (default: parquet)",
|
|
4451
|
+
)
|
|
4452
|
+
def pl(
|
|
4453
|
+
polars_expression: str | None,
|
|
4454
|
+
edit: bool,
|
|
4455
|
+
file: str | None,
|
|
4456
|
+
editor: str | None,
|
|
4457
|
+
output_format: str,
|
|
4458
|
+
preview_head: int,
|
|
4459
|
+
preview_tail: int,
|
|
4460
|
+
output_html: str | None,
|
|
4461
|
+
pipe: bool,
|
|
4462
|
+
pipe_format: str,
|
|
4463
|
+
):
|
|
4464
|
+
"""
|
|
4465
|
+
Execute Polars expressions and display results.
|
|
4466
|
+
|
|
4467
|
+
Execute Polars DataFrame operations from the command line and display
|
|
4468
|
+
the results using Pointblank's visualization tools.
|
|
4469
|
+
|
|
4470
|
+
POLARS_EXPRESSION should be a valid Polars expression that returns a DataFrame.
|
|
4471
|
+
The 'pl' module is automatically imported and available.
|
|
4472
|
+
|
|
4473
|
+
Examples:
|
|
4474
|
+
|
|
4475
|
+
\b
|
|
4476
|
+
# Direct expression
|
|
4477
|
+
pb pl "pl.read_csv('data.csv')"
|
|
4478
|
+
pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
|
|
4479
|
+
pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"
|
|
4480
|
+
|
|
4481
|
+
# Multi-line with editor (supports multiple statements)
|
|
4482
|
+
pb pl --edit
|
|
4483
|
+
|
|
4484
|
+
# Multi-statement code example in editor:
|
|
4485
|
+
# csv = pl.read_csv('data.csv')
|
|
4486
|
+
# result = csv.select(['name', 'age']).filter(pl.col('age') > 25)
|
|
4487
|
+
|
|
4488
|
+
# Multi-line with a specific editor
|
|
4489
|
+
pb pl --edit --editor nano
|
|
4490
|
+
pb pl --edit --editor code
|
|
4491
|
+
pb pl --edit --editor micro
|
|
4492
|
+
|
|
4493
|
+
# From file
|
|
4494
|
+
pb pl --file query.py
|
|
4495
|
+
|
|
4496
|
+
# Piping to other pb commands
|
|
4497
|
+
pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)" --pipe | pb validate --check rows-distinct
|
|
4498
|
+
pb pl --edit --pipe | pb preview --head 10
|
|
4499
|
+
pb pl --edit --pipe | pb scan --output-html report.html
|
|
4500
|
+
pb pl --edit --pipe | pb missing --output-html missing_report.html
|
|
4501
|
+
|
|
4502
|
+
Use --output-format to change how results are displayed:
|
|
4503
|
+
|
|
4504
|
+
\b
|
|
4505
|
+
pb pl "pl.read_csv('data.csv')" --output-format scan
|
|
4506
|
+
pb pl "pl.read_csv('data.csv')" --output-format missing
|
|
4507
|
+
pb pl "pl.read_csv('data.csv')" --output-format info
|
|
4508
|
+
|
|
4509
|
+
Note: For multi-statement code, assign your final result to a variable like
|
|
4510
|
+
'result', 'df', 'data', or ensure it's the last expression.
|
|
4511
|
+
"""
|
|
4512
|
+
try:
|
|
4513
|
+
# Check if Polars is available
|
|
4514
|
+
if not _is_lib_present("polars"):
|
|
4515
|
+
console.print("[red]Error:[/red] Polars is not installed")
|
|
4516
|
+
console.print("\nThe 'pb pl' command requires Polars to be installed.")
|
|
4517
|
+
console.print("Install it with: [cyan]pip install polars[/cyan]")
|
|
4518
|
+
console.print("\nTo check all dependency status, run: [cyan]pb requirements[/cyan]")
|
|
4519
|
+
sys.exit(1)
|
|
4520
|
+
|
|
4521
|
+
import polars as pl
|
|
4522
|
+
|
|
4523
|
+
# Determine the source of the query
|
|
4524
|
+
query_code = None
|
|
4525
|
+
|
|
4526
|
+
if file:
|
|
4527
|
+
# Read from file
|
|
4528
|
+
query_code = Path(file).read_text()
|
|
4529
|
+
elif edit:
|
|
4530
|
+
# Determine which editor to use
|
|
4531
|
+
chosen_editor = editor or _get_best_editor()
|
|
4532
|
+
|
|
4533
|
+
# When piping, send editor message to stderr
|
|
4534
|
+
if pipe:
|
|
4535
|
+
print(f"Using editor: {chosen_editor}", file=sys.stderr)
|
|
4536
|
+
else:
|
|
4537
|
+
console.print(f"[dim]Using editor: {chosen_editor}[/dim]")
|
|
4538
|
+
|
|
4539
|
+
# Interactive editor with custom editor
|
|
4540
|
+
if chosen_editor == "code":
|
|
4541
|
+
# Special handling for VS Code
|
|
4542
|
+
query_code = _edit_with_vscode()
|
|
4543
|
+
else:
|
|
4544
|
+
# Use click.edit() for terminal editors
|
|
4545
|
+
query_code = click.edit(
|
|
4546
|
+
"# Enter your Polars query here\n"
|
|
4547
|
+
"# Example:\n"
|
|
4548
|
+
"# pl.read_csv('data.csv').select(['name', 'age'])\n"
|
|
4549
|
+
"# pl.read_csv('data.csv').filter(pl.col('age') > 25)\n"
|
|
4550
|
+
"# \n"
|
|
4551
|
+
"# The result should be a Polars DataFrame or LazyFrame\n"
|
|
4552
|
+
"\n",
|
|
4553
|
+
editor=chosen_editor,
|
|
4554
|
+
)
|
|
4555
|
+
|
|
4556
|
+
if query_code is None:
|
|
4557
|
+
if pipe:
|
|
4558
|
+
print("No query entered", file=sys.stderr)
|
|
4559
|
+
else:
|
|
4560
|
+
console.print("[yellow]No query entered[/yellow]")
|
|
4561
|
+
sys.exit(1)
|
|
4562
|
+
elif polars_expression:
|
|
4563
|
+
# Direct argument
|
|
4564
|
+
query_code = polars_expression
|
|
4565
|
+
else:
|
|
4566
|
+
# Try to read from stdin (for piping)
|
|
4567
|
+
if not sys.stdin.isatty():
|
|
4568
|
+
# Data is being piped in
|
|
4569
|
+
query_code = sys.stdin.read().strip()
|
|
4570
|
+
else:
|
|
4571
|
+
# No input provided and stdin is a terminal - show concise help
|
|
4572
|
+
_show_concise_help("pl", None)
|
|
4573
|
+
return
|
|
4574
|
+
|
|
4575
|
+
if not query_code or not query_code.strip():
|
|
4576
|
+
console.print("[red]Error:[/red] Empty query")
|
|
4577
|
+
sys.exit(1)
|
|
4578
|
+
|
|
4579
|
+
# Execute the query
|
|
4580
|
+
with console.status("[bold green]Executing Polars expression..."):
|
|
4581
|
+
namespace = {
|
|
4582
|
+
"pl": pl,
|
|
4583
|
+
"polars": pl,
|
|
4584
|
+
"__builtins__": __builtins__,
|
|
4585
|
+
}
|
|
4586
|
+
|
|
4587
|
+
try:
|
|
4588
|
+
# Check if this is a single expression or multiple statements
|
|
4589
|
+
if "\n" in query_code.strip() or any(
|
|
4590
|
+
keyword in query_code
|
|
4591
|
+
for keyword in [
|
|
4592
|
+
" = ",
|
|
4593
|
+
"import",
|
|
4594
|
+
"for ",
|
|
4595
|
+
"if ",
|
|
4596
|
+
"def ",
|
|
4597
|
+
"class ",
|
|
4598
|
+
"with ",
|
|
4599
|
+
"try:",
|
|
4600
|
+
]
|
|
4601
|
+
):
|
|
4602
|
+
# Multiple statements - use exec()
|
|
4603
|
+
exec(query_code, namespace)
|
|
4604
|
+
|
|
4605
|
+
# Look for the result in the namespace
|
|
4606
|
+
# Try common variable names first
|
|
4607
|
+
result = None
|
|
4608
|
+
for var_name in ["result", "df", "data", "table", "output"]:
|
|
4609
|
+
if var_name in namespace:
|
|
4610
|
+
result = namespace[var_name]
|
|
4611
|
+
break
|
|
4612
|
+
|
|
4613
|
+
# If no common names found, look for any DataFrame/LazyFrame
|
|
4614
|
+
if result is None:
|
|
4615
|
+
for key, value in namespace.items():
|
|
4616
|
+
if (
|
|
4617
|
+
hasattr(value, "collect") or hasattr(value, "columns")
|
|
4618
|
+
) and not key.startswith("_"):
|
|
4619
|
+
result = value
|
|
4620
|
+
break
|
|
4621
|
+
|
|
4622
|
+
# If still no result, get the last assigned variable (excluding builtins)
|
|
4623
|
+
if result is None:
|
|
4624
|
+
# Get variables that were added to namespace (excluding our imports)
|
|
4625
|
+
user_vars = {
|
|
4626
|
+
k: v
|
|
4627
|
+
for k, v in namespace.items()
|
|
4628
|
+
if k not in ["pl", "polars", "__builtins__"] and not k.startswith("_")
|
|
4629
|
+
}
|
|
4630
|
+
if user_vars:
|
|
4631
|
+
# Get the last variable (this is a heuristic)
|
|
4632
|
+
last_var = list(user_vars.keys())[-1]
|
|
4633
|
+
result = user_vars[last_var]
|
|
4634
|
+
|
|
4635
|
+
if result is None:
|
|
4636
|
+
if pipe:
|
|
4637
|
+
print(
|
|
4638
|
+
"[red]Error:[/red] Could not find result variable", file=sys.stderr
|
|
4639
|
+
)
|
|
4640
|
+
print(
|
|
4641
|
+
"[dim]Assign your final result to a variable like 'result', 'df', or 'data'[/dim]",
|
|
4642
|
+
file=sys.stderr,
|
|
4643
|
+
)
|
|
4644
|
+
print(
|
|
4645
|
+
"[dim]Or ensure your last line returns a DataFrame[/dim]",
|
|
4646
|
+
file=sys.stderr,
|
|
4647
|
+
)
|
|
4648
|
+
else:
|
|
4649
|
+
console.print("[red]Error:[/red] Could not find result variable")
|
|
4650
|
+
console.print(
|
|
4651
|
+
"[dim]Assign your final result to a variable like 'result', 'df', or 'data'[/dim]"
|
|
4652
|
+
)
|
|
4653
|
+
console.print("[dim]Or ensure your last line returns a DataFrame[/dim]")
|
|
4654
|
+
sys.exit(1)
|
|
4655
|
+
|
|
4656
|
+
else:
|
|
4657
|
+
# Single expression - use eval()
|
|
4658
|
+
result = eval(query_code, namespace)
|
|
4659
|
+
|
|
4660
|
+
# Validate result
|
|
4661
|
+
if not hasattr(result, "collect") and not hasattr(result, "columns"):
|
|
4662
|
+
if pipe:
|
|
4663
|
+
print(
|
|
4664
|
+
"[red]Error:[/red] Expression must return a Polars DataFrame or LazyFrame",
|
|
4665
|
+
file=sys.stderr,
|
|
4666
|
+
)
|
|
4667
|
+
print(f"[dim]Got: {type(result)}[/dim]", file=sys.stderr)
|
|
4668
|
+
else:
|
|
4669
|
+
console.print(
|
|
4670
|
+
"[red]Error:[/red] Expression must return a Polars DataFrame or LazyFrame"
|
|
4671
|
+
)
|
|
4672
|
+
console.print(f"[dim]Got: {type(result)}[/dim]")
|
|
4673
|
+
sys.exit(1)
|
|
4674
|
+
|
|
4675
|
+
except Exception as e:
|
|
4676
|
+
# When piping, send errors to stderr so they don't interfere with the pipe
|
|
4677
|
+
if pipe:
|
|
4678
|
+
print(f"Error executing Polars expression: {e}", file=sys.stderr)
|
|
4679
|
+
print(file=sys.stderr)
|
|
4680
|
+
|
|
4681
|
+
# Create a panel with the expression(s) for better readability
|
|
4682
|
+
if "\n" in query_code.strip():
|
|
4683
|
+
# Multi-line expression
|
|
4684
|
+
print(f"Expression(s) provided:\n{query_code}", file=sys.stderr)
|
|
4685
|
+
else:
|
|
4686
|
+
# Single line expression
|
|
4687
|
+
print(f"Expression provided: {query_code}", file=sys.stderr)
|
|
4688
|
+
else:
|
|
4689
|
+
# Normal error handling when not piping
|
|
4690
|
+
console.print(f"[red]Error executing Polars expression:[/red] {e}")
|
|
4691
|
+
console.print()
|
|
4692
|
+
|
|
4693
|
+
# Create a panel with the expression(s) for better readability
|
|
4694
|
+
if "\n" in query_code.strip():
|
|
4695
|
+
# Multi-line expression
|
|
4696
|
+
console.print(
|
|
4697
|
+
Panel(
|
|
4698
|
+
query_code,
|
|
4699
|
+
title="Expression(s) provided",
|
|
4700
|
+
border_style="red",
|
|
4701
|
+
expand=False,
|
|
4702
|
+
title_align="left",
|
|
4703
|
+
)
|
|
4704
|
+
)
|
|
4705
|
+
else:
|
|
4706
|
+
# Single line expression
|
|
4707
|
+
console.print(
|
|
4708
|
+
Panel(
|
|
4709
|
+
query_code,
|
|
4710
|
+
title="Expression provided",
|
|
4711
|
+
border_style="red",
|
|
4712
|
+
expand=False,
|
|
4713
|
+
title_align="left",
|
|
4714
|
+
)
|
|
4715
|
+
)
|
|
4716
|
+
|
|
4717
|
+
sys.exit(1)
|
|
4718
|
+
|
|
4719
|
+
# Only print success message when not piping (so it doesn't interfere with pipe output)
|
|
4720
|
+
if not pipe:
|
|
4721
|
+
console.print("[green]✓[/green] Polars expression executed successfully")
|
|
4722
|
+
|
|
4723
|
+
# Process output
|
|
4724
|
+
if pipe:
|
|
4725
|
+
# Output data for piping to other commands
|
|
4726
|
+
_handle_pl_pipe(result, pipe_format)
|
|
4727
|
+
elif output_format == "preview":
|
|
4728
|
+
_handle_pl_preview(result, preview_head, preview_tail, output_html)
|
|
4729
|
+
elif output_format == "scan":
|
|
4730
|
+
_handle_pl_scan(result, query_code, output_html)
|
|
4731
|
+
elif output_format == "missing":
|
|
4732
|
+
_handle_pl_missing(result, query_code, output_html)
|
|
4733
|
+
elif output_format == "info":
|
|
4734
|
+
_handle_pl_info(result, query_code, output_html)
|
|
4735
|
+
elif output_format == "validate":
|
|
4736
|
+
console.print("[yellow]Validation output format not yet implemented[/yellow]")
|
|
4737
|
+
console.print("Use 'pb validate' with a data file for now")
|
|
4738
|
+
|
|
4739
|
+
except Exception as e:
|
|
4740
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
4741
|
+
sys.exit(1)
|
|
4742
|
+
|
|
4743
|
+
|
|
4744
|
+
def _handle_pl_preview(result: Any, head: int, tail: int, output_html: str | None) -> None:
    """Handle preview output for Polars results."""
    try:
        # Render head/tail rows through the standard preview helper.
        gt_table = pb.preview(
            data=result,
            n_head=head,
            n_tail=tail,
            show_row_numbers=True,
        )

        if not output_html:
            # Terminal display: try to enrich the render with table metadata,
            # falling back to a bare render if metadata collection fails.
            try:
                n_rows = pb.get_row_count(result)
                n_cols = pb.get_column_count(result)
                tbl_kind = _get_tbl_type(result)

                meta = {
                    "total_rows": n_rows,
                    "total_columns": n_cols,
                    "head_rows": head,
                    "tail_rows": tail,
                    "is_complete": n_rows <= (head + tail),
                    "source_type": "Polars expression",
                    "table_type": tbl_kind,
                }
                _rich_print_gt_table(gt_table, meta)
            except Exception:
                # Metadata is best-effort only; show the plain table instead.
                _rich_print_gt_table(gt_table)
        else:
            # File output: write the raw HTML and confirm on the console.
            Path(output_html).write_text(gt_table.as_raw_html(), encoding="utf-8")
            console.print(f"[green]✓[/green] HTML saved to: {output_html}")

    except Exception as e:
        console.print(f"[red]Error creating preview:[/red] {e}")
        sys.exit(1)
|
|
4784
|
+
|
|
4785
|
+
|
|
4786
|
+
def _handle_pl_scan(result: Any, expression: str, output_html: str | None) -> None:
    """Handle scan output for Polars results."""
    try:
        summary = pb.col_summary_tbl(data=result)

        if output_html:
            # File output: persist the HTML report and confirm on the console.
            Path(output_html).write_text(summary.as_raw_html(), encoding="utf-8")
            console.print(f"[green]✓[/green] Data scan report saved to: {output_html}")
            return

        # Terminal display: metadata gathering and rendering are best-effort.
        try:
            tbl_kind = _get_tbl_type(result)
            n_rows = pb.get_row_count(result)
            n_cols = pb.get_column_count(result)

            _rich_print_scan_table(
                summary,
                expression,
                "Polars expression",
                tbl_kind,
                n_rows,
                n_cols,
            )
        except Exception as e:
            console.print(f"[yellow]Could not display scan summary: {e}[/yellow]")

    except Exception as e:
        console.print(f"[red]Error creating scan:[/red] {e}")
        sys.exit(1)
|
|
4816
|
+
|
|
4817
|
+
|
|
4818
|
+
def _handle_pl_missing(result: Any, expression: str, output_html: str | None) -> None:
    """Handle missing values output for Polars results."""
    try:
        report = pb.missing_vals_tbl(data=result)

        if not output_html:
            # Terminal display of the missing-values summary.
            _rich_print_missing_table(report, result)
        else:
            # File output: write the HTML report and confirm on the console.
            Path(output_html).write_text(report.as_raw_html(), encoding="utf-8")
            console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")

    except Exception as e:
        console.print(f"[red]Error creating missing values report:[/red] {e}")
        sys.exit(1)
|
|
4833
|
+
|
|
4834
|
+
|
|
4835
|
+
def _handle_pl_info(result: Any, expression: str, output_html: str | None) -> None:
    """Handle info output for Polars results.

    Displays the expression, table type, row/column counts, and per-column
    dtypes — either as a rich terminal table or, when `output_html` is given,
    as a small standalone HTML page.

    Parameters
    ----------
    result
        The table-like object produced by the expression.
    expression
        The Polars expression text the user supplied (shown verbatim).
    output_html
        Optional path; when set, an HTML info page is written there instead
        of printing to the terminal.
    """
    try:
        # Get basic info
        tbl_type = _get_tbl_type(result)
        row_count = pb.get_row_count(result)
        col_count = pb.get_column_count(result)

        # Get column names: DataFrames expose `.columns`; some table objects
        # only expose a `.schema` with `.names`.
        if hasattr(result, "columns"):
            columns = list(result.columns)
        elif hasattr(result, "schema"):
            columns = list(result.schema.names)
        else:
            columns = []

        dtypes_dict = _get_column_dtypes(result, columns)

        if output_html:
            import html as _html

            # Escape user-controlled text (the expression plus column/dtype
            # names) so characters like `<`, `>`, or `&` — common in Polars
            # expressions such as pl.col('age') > 25 — can't break the markup.
            esc = _html.escape
            column_items = "".join(
                f"<li>{esc(str(col))}: {esc(str(dtypes_dict.get(col, '?')))}</li>"
                for col in columns
            )

            # Create a simple HTML info page
            # TODO: Implement an improved version of this in the Python API and then
            # use that here
            html_content = f"""
            <html><body>
            <h2>Polars Expression Info</h2>
            <p><strong>Expression:</strong> {esc(expression)}</p>
            <p><strong>Table Type:</strong> {esc(tbl_type)}</p>
            <p><strong>Rows:</strong> {row_count:,}</p>
            <p><strong>Columns:</strong> {col_count:,}</p>
            <h3>Column Details</h3>
            <ul>
            {column_items}
            </ul>
            </body></html>
            """
            Path(output_html).write_text(html_content, encoding="utf-8")
            console.print(f"[green]✓[/green] HTML info saved to: {output_html}")
        else:
            # Display info table
            from rich.box import SIMPLE_HEAD

            info_table = Table(
                title="Polars Expression Info",
                show_header=True,
                header_style="bold magenta",
                box=SIMPLE_HEAD,
                title_style="bold cyan",
                title_justify="left",
            )
            info_table.add_column("Property", style="cyan", no_wrap=True)
            info_table.add_column("Value", style="green")

            info_table.add_row("Expression", expression)
            # Capitalize "polars" to "Polars" for consistency with pb info command
            display_tbl_type = (
                tbl_type.replace("polars", "Polars") if "polars" in tbl_type.lower() else tbl_type
            )
            info_table.add_row("Table Type", display_tbl_type)
            info_table.add_row("Rows", f"{row_count:,}")
            info_table.add_row("Columns", f"{col_count:,}")

            console.print()
            console.print(info_table)

            # Show column details (capped so wide tables stay readable)
            if columns:
                console.print("\n[bold cyan]Column Details:[/bold cyan]")
                for col in columns[:10]:  # Show first 10 columns
                    dtype = dtypes_dict.get(col, "?")
                    console.print(f"  • {col}: [yellow]{dtype}[/yellow]")

                if len(columns) > 10:
                    console.print(f"  ... and {len(columns) - 10} more columns")

    except Exception as e:
        console.print(f"[red]Error creating info:[/red] {e}")
        sys.exit(1)
|
|
4912
|
+
|
|
4913
|
+
|
|
4914
|
+
def _handle_pl_pipe(result: Any, pipe_format: str) -> None:
|
|
4915
|
+
"""Handle piped output from Polars results."""
|
|
4916
|
+
try:
|
|
4917
|
+
import sys
|
|
4918
|
+
import tempfile
|
|
4919
|
+
|
|
4920
|
+
# Create a temporary file to store the data
|
|
4921
|
+
with tempfile.NamedTemporaryFile(
|
|
4922
|
+
mode="w", suffix=f".{pipe_format}", prefix="pb_pipe_", delete=False
|
|
4923
|
+
) as temp_file:
|
|
4924
|
+
temp_path = temp_file.name
|
|
4925
|
+
|
|
4926
|
+
# Write the data to the temporary file
|
|
4927
|
+
if pipe_format == "parquet":
|
|
4928
|
+
if hasattr(result, "write_parquet"):
|
|
4929
|
+
# Polars
|
|
4930
|
+
result.write_parquet(temp_path)
|
|
4931
|
+
elif hasattr(result, "to_parquet"):
|
|
4932
|
+
# Pandas
|
|
4933
|
+
result.to_parquet(temp_path)
|
|
4934
|
+
else:
|
|
4935
|
+
# Convert to pandas and write
|
|
4936
|
+
import pandas as pd
|
|
4937
|
+
|
|
4938
|
+
pd_result = pd.DataFrame(result)
|
|
4939
|
+
pd_result.to_parquet(temp_path)
|
|
4940
|
+
else: # CSV
|
|
4941
|
+
if hasattr(result, "write_csv"):
|
|
4942
|
+
# Polars
|
|
4943
|
+
result.write_csv(temp_path)
|
|
4944
|
+
elif hasattr(result, "to_csv"):
|
|
4945
|
+
# Pandas
|
|
4946
|
+
result.to_csv(temp_path, index=False)
|
|
4947
|
+
else:
|
|
4948
|
+
# Convert to pandas and write
|
|
4949
|
+
import pandas as pd
|
|
4950
|
+
|
|
4951
|
+
pd_result = pd.DataFrame(result)
|
|
4952
|
+
pd_result.to_csv(temp_path, index=False)
|
|
4953
|
+
|
|
4954
|
+
# Output the temporary file path to stdout for the next command
|
|
4955
|
+
print(temp_path)
|
|
4956
|
+
|
|
4957
|
+
except Exception as e:
|
|
4958
|
+
print(f"[red]Error creating pipe output:[/red] {e}", file=sys.stderr)
|
|
4959
|
+
sys.exit(1)
|
|
4960
|
+
|
|
4961
|
+
|
|
4962
|
+
def _get_best_editor() -> str:
|
|
4963
|
+
"""Detect the best available editor on the system."""
|
|
4964
|
+
|
|
4965
|
+
# Check environment variable first
|
|
4966
|
+
if "EDITOR" in os.environ:
|
|
4967
|
+
return os.environ["EDITOR"]
|
|
4968
|
+
|
|
4969
|
+
# Check for common editors in order of preference
|
|
4970
|
+
editors = [
|
|
4971
|
+
"code", # VS Code
|
|
4972
|
+
"micro", # Modern terminal editor
|
|
4973
|
+
"nano", # User-friendly terminal editor
|
|
4974
|
+
"vim", # Vim
|
|
4975
|
+
"vi", # Vi (fallback)
|
|
4976
|
+
]
|
|
4977
|
+
|
|
4978
|
+
for editor in editors:
|
|
4979
|
+
if shutil.which(editor):
|
|
4980
|
+
return editor
|
|
4981
|
+
|
|
4982
|
+
# Ultimate fallback
|
|
4983
|
+
return "nano"
|
|
4984
|
+
|
|
4985
|
+
|
|
4986
|
+
def _edit_with_vscode() -> str | None:
    """Edit Polars query using VS Code.

    Writes a temporary ``.py`` scratch file pre-filled with usage hints,
    opens it in VS Code (blocking via ``--wait``), then returns the edited
    content with comments, blank lines, and ``import polars`` lines removed.
    Returns ``None`` when nothing usable was entered or VS Code could not
    be launched.
    """
    import subprocess
    import tempfile

    # Create a temporary Python file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", prefix="pb_pl_", delete=False) as f:
        f.write("import polars as pl\n")
        f.write("\n")
        f.write("# Enter your Polars query here\n")
        f.write("# Examples:\n")
        f.write("# \n")
        f.write("# Single expression:\n")
        f.write("# pl.read_csv('data.csv').select(['name', 'age'])\n")
        f.write("# \n")
        f.write("# Multiple statements:\n")
        f.write("# csv = pl.read_csv('data.csv')\n")
        f.write("# result = csv.select(['name', 'age']).filter(pl.col('age') > 25)\n")
        f.write("# \n")
        f.write("# For multi-statement code, assign your final result to a variable\n")
        f.write("# like 'result', 'df', 'data', or just ensure it's the last line\n")
        f.write("# \n")
        f.write("# Save and then close this file in VS Code to execute the query\n")
        f.write("\n")
        temp_file = f.name

    try:
        # Open in VS Code and wait for it to close
        result = subprocess.run(
            ["code", "--wait", temp_file], capture_output=True, text=True, timeout=300
        )

        if result.returncode != 0:
            console.print(f"[yellow]VS Code exited with code {result.returncode}[/yellow]")

        # Read the edited content
        with open(temp_file, "r") as f:
            content = f.read()

        # Remove comments, empty lines, and import statements for cleaner execution.
        # Note: startswith("import polars") already matches the aliased form
        # "import polars as pl", so no second prefix check is needed.
        lines = []
        for line in content.split("\n"):
            stripped = line.strip()
            if (
                stripped
                and not stripped.startswith("#")
                and not stripped.startswith("import polars")
            ):
                lines.append(line)

        return "\n".join(lines) if lines else None

    except subprocess.TimeoutExpired:
        console.print("[red]Timeout:[/red] VS Code took too long to respond")
        return None
    except subprocess.CalledProcessError as e:
        console.print(f"[red]Error:[/red] Could not open VS Code: {e}")
        return None
    except FileNotFoundError:
        console.print("[red]Error:[/red] VS Code not found in PATH")
        return None
    finally:
        # Clean up the scratch file regardless of how editing ended.
        Path(temp_file).unlink(missing_ok=True)
|
|
5051
|
+
|
|
5052
|
+
|
|
5053
|
+
def _show_concise_help(command_name: str, ctx: click.Context) -> None:
    """Show concise help for a command when required arguments are missing.

    Prints a short, per-command usage summary (one branch per known CLI
    command) and then exits with status 1.
    """

    # Each branch prints the same shape of summary: a one-line description,
    # example invocations, the most useful options, and a pointer to --help.
    if command_name == "info":
        console.print("[bold cyan]pb info[/bold cyan] - Display information about a data source")
        console.print()
        console.print("[bold yellow]Usage:[/bold yellow]")
        console.print("  pb info data.csv")
        console.print("  pb info small_table")
        console.print()
        console.print("[dim]Shows table type, dimensions, column names, and data types[/dim]")
        console.print()
        console.print(
            "[dim]Use [bold]pb info --help[/bold] for complete options and examples[/dim]"
        )

    elif command_name == "preview":
        console.print(
            "[bold cyan]pb preview[/bold cyan] - Preview a data table showing head and tail rows"
        )
        console.print()
        console.print("[bold yellow]Usage:[/bold yellow]")
        console.print("  pb preview data.csv")
        console.print("  pb preview data.parquet --head 10 --tail 5")
        console.print()
        console.print("[bold yellow]Key Options:[/bold yellow]")
        console.print("  --head N         Number of rows from the top (default: 5)")
        console.print("  --tail N         Number of rows from the bottom (default: 5)")
        console.print("  --columns LIST   Comma-separated list of columns to display")
        console.print("  --output-html    Save HTML output to file")
        console.print()
        console.print(
            "[dim]Use [bold]pb preview --help[/bold] for complete options and examples[/dim]"
        )

    elif command_name == "scan":
        console.print(
            "[bold cyan]pb scan[/bold cyan] - Generate a comprehensive data profile report"
        )
        console.print()
        console.print("[bold yellow]Usage:[/bold yellow]")
        console.print("  pb scan data.csv")
        console.print("  pb scan data.parquet --output-html report.html")
        console.print()
        console.print("[bold yellow]Key Options:[/bold yellow]")
        console.print("  --output-html    Save HTML scan report to file")
        console.print("  --columns LIST   Comma-separated list of columns to scan")
        console.print()
        console.print(
            "[dim]Use [bold]pb scan --help[/bold] for complete options and examples[/dim]"
        )

    elif command_name == "missing":
        console.print("[bold cyan]pb missing[/bold cyan] - Generate a missing values report")
        console.print()
        console.print("[bold yellow]Usage:[/bold yellow]")
        console.print("  pb missing data.csv")
        console.print("  pb missing data.parquet --output-html missing_report.html")
        console.print()
        console.print("[bold yellow]Key Options:[/bold yellow]")
        console.print("  --output-html    Save HTML output to file")
        console.print()
        console.print(
            "[dim]Use [bold]pb missing --help[/bold] for complete options and examples[/dim]"
        )

    elif command_name == "validate":
        console.print("[bold cyan]pb validate[/bold cyan] - Perform data validation checks")
        console.print()
        console.print("[bold yellow]Usage:[/bold yellow]")
        console.print("  pb validate data.csv")
        console.print("  pb validate data.csv --check col-vals-not-null --column email")
        console.print()
        console.print("[bold yellow]Key Options:[/bold yellow]")
        console.print("  --check TYPE     Validation check type (default: rows-distinct)")
        console.print("  --column COL     Column name for column-specific checks")
        console.print("  --show-extract   Show failing rows if validation fails")
        console.print("  --list-checks    List all available validation checks")
        console.print()
        console.print(
            "[dim]Use [bold]pb validate --help[/bold] for complete options and examples[/dim]"
        )

    elif command_name == "run":
        console.print("[bold cyan]pb run[/bold cyan] - Run a Pointblank validation script")
        console.print()
        console.print("[bold yellow]Usage:[/bold yellow]")
        console.print("  pb run validation_script.py")
        console.print("  pb run validation_script.py --data data.csv")
        console.print()
        console.print("[bold yellow]Key Options:[/bold yellow]")
        console.print("  --data SOURCE    Replace data source in validation objects")
        console.print("  --output-html    Save HTML validation report to file")
        console.print("  --show-extract   Show failing rows if validation fails")
        console.print("  --fail-on LEVEL  Exit with error on critical/error/warning/any")
        console.print()
        console.print("[dim]Use [bold]pb run --help[/bold] for complete options and examples[/dim]")

    elif command_name == "make-template":
        console.print(
            "[bold cyan]pb make-template[/bold cyan] - Create a validation script template"
        )
        console.print()
        console.print("[bold yellow]Usage:[/bold yellow]")
        console.print("  pb make-template my_validation.py")
        console.print("  pb make-template validation_template.py")
        console.print()
        console.print("[dim]Creates a sample Python script with validation examples[/dim]")
        console.print("[dim]Edit the template and run with [bold]pb run[/bold][/dim]")
        console.print()
        console.print(
            "[dim]Use [bold]pb make-template --help[/bold] for complete options and examples[/dim]"
        )

    elif command_name == "pl":
        console.print(
            "[bold cyan]pb pl[/bold cyan] - Execute Polars expressions and display results"
        )
        console.print()
        console.print("[bold yellow]Usage:[/bold yellow]")
        console.print("  pb pl \"pl.read_csv('data.csv')\"")
        console.print("  pb pl --edit")
        console.print()
        console.print("[bold yellow]Key Options:[/bold yellow]")
        console.print("  --edit           Open editor for multi-line input")
        console.print("  --file FILE      Read query from file")
        console.print("  --output-format  Output format: preview, scan, missing, info")
        console.print("  --pipe           Output for piping to other pb commands")
        console.print()
        console.print("[dim]Use [bold]pb pl --help[/bold] for complete options and examples[/dim]")

    # Fix the exit call at the end
    # Prefer click's ctx.exit so exit handling stays inside the CLI framework;
    # fall back to sys.exit when no context is available.
    if ctx is not None:
        ctx.exit(1)
    else:
        sys.exit(1)