pointblank 0.11.2__py3-none-any.whl → 0.11.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
pointblank/cli.py CHANGED
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import copy
4
+ import os
5
+ import shutil
4
6
  import sys
5
7
  from pathlib import Path
6
8
  from typing import Any
@@ -32,6 +34,8 @@ class OrderedGroup(click.Group):
32
34
  "validate",
33
35
  "run",
34
36
  "make-template",
37
+ # Data Manipulation
38
+ "pl",
35
39
  # Utilities
36
40
  "datasets",
37
41
  "requirements",
@@ -91,6 +95,15 @@ def _load_data_source(data_source: str) -> Any:
91
95
  return _process_data(data_source)
92
96
 
93
97
 
98
+ def _is_piped_data_source(data_source: str) -> bool:
99
+ """Check if the data source is from a piped pb command."""
100
+ return (
101
+ data_source
102
+ and ("pb_pipe_" in data_source)
103
+ and (data_source.startswith("/var/folders/") or data_source.startswith("/tmp/"))
104
+ )
105
+
106
+
94
107
  def _format_cell_value(
95
108
  value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
96
109
  ) -> str:
@@ -558,9 +571,12 @@ def _rich_print_gt_table(
558
571
  gt_table: The GT table object to display
559
572
  preview_info: Optional dict with preview context info:
560
573
  - total_rows: Total rows in the dataset
574
+ - total_columns: Total columns in the dataset
561
575
  - head_rows: Number of head rows shown
562
576
  - tail_rows: Number of tail rows shown
563
577
  - is_complete: Whether the entire dataset is shown
578
+ - source_type: Type of data source (e.g., "External source: worldcities_new.csv")
579
+ - table_type: Type of table (e.g., "polars")
564
580
  show_summary: Whether to show the row count summary at the bottom
565
581
  """
566
582
  try:
@@ -593,6 +609,12 @@ def _rich_print_gt_table(
593
609
  table_type = preview_info["table_type"]
594
610
  table_title = f"Data Preview / {source_type} / {table_type}"
595
611
 
612
+ # Add dimensions subtitle in gray if available
613
+ total_rows = preview_info.get("total_rows")
614
+ total_columns = preview_info.get("total_columns")
615
+ if total_rows is not None and total_columns is not None:
616
+ table_title += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
617
+
596
618
  rich_table = Table(
597
619
  title=table_title,
598
620
  show_header=True,
@@ -1209,20 +1231,31 @@ def _display_validation_summary(validation: Any) -> None:
1209
1231
 
1210
1232
 
1211
1233
  @click.group(cls=OrderedGroup)
1212
- @click.version_option(version=pb.__version__, prog_name="pb")
1234
+ @click.version_option(pb.__version__, "-v", "--version", prog_name="pb")
1235
+ @click.help_option("-h", "--help")
1213
1236
  def cli():
1214
1237
  """
1215
1238
  Pointblank CLI: Data validation and quality tools for data engineers.
1216
1239
 
1217
- Use this CLI to run validation scripts, preview tables, and generate reports
1218
- directly from the command line.
1240
+ Use this CLI to validate data quality, explore datasets, and generate comprehensive
1241
+ reports for CSV, Parquet, and database sources. Suitable for data pipelines, ETL
1242
+ validation, and exploratory data analysis from the command line.
1243
+
1244
+ Quick Examples:
1245
+
1246
+ \b
1247
+ pb preview data.csv Preview your data
1248
+ pb scan data.csv Generate data profile
1249
+ pb validate data.csv Run basic validation
1250
+
1251
+ Use pb COMMAND --help for detailed help on any command.
1219
1252
  """
1220
1253
  pass
1221
1254
 
1222
1255
 
1223
1256
  @cli.command()
1224
- @click.argument("data_source", type=str)
1225
- def info(data_source: str):
1257
+ @click.argument("data_source", type=str, required=False)
1258
+ def info(data_source: str | None):
1226
1259
  """
1227
1260
  Display information about a data source.
1228
1261
 
@@ -1238,6 +1271,11 @@ def info(data_source: str):
1238
1271
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1239
1272
  """
1240
1273
  try:
1274
+ # Handle missing data_source with concise help
1275
+ if data_source is None:
1276
+ _show_concise_help("info", None)
1277
+ return
1278
+
1241
1279
  with console.status("[bold green]Loading data..."):
1242
1280
  # Load the data source using the centralized function
1243
1281
  data = _load_data_source(data_source)
@@ -1276,21 +1314,21 @@ def info(data_source: str):
1276
1314
 
1277
1315
 
1278
1316
  @cli.command()
1279
- @click.argument("data_source", type=str)
1280
- @click.option("--columns", "-c", help="Comma-separated list of columns to display")
1317
+ @click.argument("data_source", type=str, required=False)
1318
+ @click.option("--columns", help="Comma-separated list of columns to display")
1281
1319
  @click.option("--col-range", help="Column range like '1:10' or '5:' or ':15' (1-based indexing)")
1282
1320
  @click.option("--col-first", type=int, help="Show first N columns")
1283
1321
  @click.option("--col-last", type=int, help="Show last N columns")
1284
- @click.option("--head", "-h", default=5, help="Number of rows from the top (default: 5)")
1285
- @click.option("--tail", "-t", default=5, help="Number of rows from the bottom (default: 5)")
1286
- @click.option("--limit", "-l", default=50, help="Maximum total rows to display (default: 50)")
1322
+ @click.option("--head", default=5, help="Number of rows from the top (default: 5)")
1323
+ @click.option("--tail", default=5, help="Number of rows from the bottom (default: 5)")
1324
+ @click.option("--limit", default=50, help="Maximum total rows to display (default: 50)")
1287
1325
  @click.option("--no-row-numbers", is_flag=True, help="Hide row numbers")
1288
1326
  @click.option("--max-col-width", default=250, help="Maximum column width in pixels (default: 250)")
1289
1327
  @click.option("--min-table-width", default=500, help="Minimum table width in pixels (default: 500)")
1290
1328
  @click.option("--no-header", is_flag=True, help="Hide table header")
1291
1329
  @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
1292
1330
  def preview(
1293
- data_source: str,
1331
+ data_source: str | None,
1294
1332
  columns: str | None,
1295
1333
  col_range: str | None,
1296
1334
  col_first: int | None,
@@ -1315,6 +1353,7 @@ def preview(
1315
1353
  - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1316
1354
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1317
1355
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1356
+ - Piped data from pb pl command
1318
1357
 
1319
1358
  COLUMN SELECTION OPTIONS:
1320
1359
 
@@ -1329,11 +1368,52 @@ def preview(
1329
1368
  Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
1330
1369
  """
1331
1370
  try:
1371
+ import sys
1372
+
1373
+ # Handle piped input
1374
+ if data_source is None:
1375
+ if not sys.stdin.isatty():
1376
+ # Data is being piped in - read the file path from stdin
1377
+ piped_input = sys.stdin.read().strip()
1378
+ if piped_input:
1379
+ data_source = piped_input
1380
+
1381
+ # Determine the format from the file extension
1382
+ if piped_input.endswith(".parquet"):
1383
+ format_type = "Parquet"
1384
+ elif piped_input.endswith(".csv"):
1385
+ format_type = "CSV"
1386
+ else:
1387
+ format_type = "unknown"
1388
+
1389
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1390
+ else:
1391
+ console.print("[red]Error:[/red] No data provided via pipe")
1392
+ sys.exit(1)
1393
+ else:
1394
+ # Show concise help and exit
1395
+ _show_concise_help("preview", None)
1396
+ return
1397
+
1332
1398
  with console.status("[bold green]Loading data..."):
1333
1399
  # Load the data source using the centralized function
1334
1400
  data = _load_data_source(data_source)
1335
1401
 
1336
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1402
+ # Check if this is a piped data source and create friendly display name
1403
+ is_piped_data = _is_piped_data_source(data_source)
1404
+
1405
+ if is_piped_data:
1406
+ if data_source.endswith(".parquet"):
1407
+ display_source = "Parquet file via `pb pl`"
1408
+ elif data_source.endswith(".csv"):
1409
+ display_source = "CSV file via `pb pl`"
1410
+ else:
1411
+ display_source = "File via `pb pl`"
1412
+ console.print(
1413
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
1414
+ )
1415
+ else:
1416
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1337
1417
 
1338
1418
  # Parse columns if provided
1339
1419
  columns_list = None
@@ -1355,7 +1435,7 @@ def preview(
1355
1435
  # If _row_num_ exists in data but not in user selection, add it at beginning
1356
1436
  if all_columns and "_row_num_" in all_columns and "_row_num_" not in columns_list:
1357
1437
  columns_list = ["_row_num_"] + columns_list
1358
- except Exception: # pragma: no cover
1438
+ except Exception:
1359
1439
  # If we can't process the data, just use the user's column list as-is
1360
1440
  pass
1361
1441
  elif col_range or col_first or col_last:
@@ -1430,7 +1510,14 @@ def preview(
1430
1510
  total_dataset_columns = pb.get_column_count(processed_data)
1431
1511
 
1432
1512
  # Determine source type and table type for enhanced preview title
1433
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1513
+ if is_piped_data:
1514
+ if data_source.endswith(".parquet"):
1515
+ source_type = "Polars expression (serialized to Parquet) from `pb pl`"
1516
+ elif data_source.endswith(".csv"):
1517
+ source_type = "Polars expression (serialized to CSV) from `pb pl`"
1518
+ else:
1519
+ source_type = "Polars expression from `pb pl`"
1520
+ elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1434
1521
  source_type = f"Pointblank dataset: {data_source}"
1435
1522
  else:
1436
1523
  source_type = f"External source: {data_source}"
@@ -1480,17 +1567,17 @@ def preview(
1480
1567
 
1481
1568
  _rich_print_gt_table(gt_table, preview_info)
1482
1569
 
1483
- except Exception as e: # pragma: no cover
1570
+ except Exception as e:
1484
1571
  console.print(f"[red]Error:[/red] {e}")
1485
- sys.exit(1) # pragma: no cover
1572
+ sys.exit(1)
1486
1573
 
1487
1574
 
1488
1575
  @cli.command()
1489
- @click.argument("data_source", type=str)
1576
+ @click.argument("data_source", type=str, required=False)
1490
1577
  @click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
1491
1578
  @click.option("--columns", "-c", help="Comma-separated list of columns to scan")
1492
1579
  def scan(
1493
- data_source: str,
1580
+ data_source: str | None,
1494
1581
  output_html: str | None,
1495
1582
  columns: str | None,
1496
1583
  ):
@@ -1513,17 +1600,58 @@ def scan(
1513
1600
  - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1514
1601
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1515
1602
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1603
+ - Piped data from pb pl command
1516
1604
  """
1517
1605
  try:
1606
+ import sys
1518
1607
  import time
1519
1608
 
1520
1609
  start_time = time.time()
1521
1610
 
1611
+ # Handle piped input
1612
+ if data_source is None:
1613
+ if not sys.stdin.isatty():
1614
+ # Data is being piped in - read the file path from stdin
1615
+ piped_input = sys.stdin.read().strip()
1616
+ if piped_input:
1617
+ data_source = piped_input
1618
+
1619
+ # Determine the format from the file extension
1620
+ if piped_input.endswith(".parquet"):
1621
+ format_type = "Parquet"
1622
+ elif piped_input.endswith(".csv"):
1623
+ format_type = "CSV"
1624
+ else:
1625
+ format_type = "unknown"
1626
+
1627
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1628
+ else:
1629
+ console.print("[red]Error:[/red] No data provided via pipe")
1630
+ sys.exit(1)
1631
+ else:
1632
+ # Show concise help and exit
1633
+ _show_concise_help("scan", None)
1634
+ return
1635
+
1522
1636
  with console.status("[bold green]Loading data..."):
1523
1637
  # Load the data source using the centralized function
1524
1638
  data = _load_data_source(data_source)
1525
1639
 
1526
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1640
+ # Check if this is a piped data source and create friendly display name
1641
+ is_piped_data = _is_piped_data_source(data_source)
1642
+
1643
+ if is_piped_data:
1644
+ if data_source.endswith(".parquet"):
1645
+ display_source = "Parquet file via `pb pl`"
1646
+ elif data_source.endswith(".csv"):
1647
+ display_source = "CSV file via `pb pl`"
1648
+ else:
1649
+ display_source = "File via `pb pl`"
1650
+ console.print(
1651
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
1652
+ )
1653
+ else:
1654
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1527
1655
 
1528
1656
  # Parse columns if provided
1529
1657
  columns_list = None
@@ -1536,7 +1664,15 @@ def scan(
1536
1664
  # Data is already processed by _load_data_source
1537
1665
  scan_result = pb.col_summary_tbl(data=data)
1538
1666
 
1539
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1667
+ # Create friendly source type for display
1668
+ if is_piped_data:
1669
+ if data_source.endswith(".parquet"):
1670
+ source_type = "Polars expression (serialized to Parquet) from `pb pl`"
1671
+ elif data_source.endswith(".csv"):
1672
+ source_type = "Polars expression (serialized to CSV) from `pb pl`"
1673
+ else:
1674
+ source_type = "Polars expression from `pb pl`"
1675
+ elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1540
1676
  source_type = f"Pointblank dataset: {data_source}"
1541
1677
  else:
1542
1678
  source_type = f"External source: {data_source}"
@@ -1568,7 +1704,12 @@ def scan(
1568
1704
  # Display detailed column summary using rich formatting
1569
1705
  try:
1570
1706
  _rich_print_scan_table(
1571
- scan_result, data_source, source_type, table_type, total_rows, total_columns
1707
+ scan_result,
1708
+ display_source if is_piped_data else data_source,
1709
+ source_type,
1710
+ table_type,
1711
+ total_rows,
1712
+ total_columns,
1572
1713
  )
1573
1714
 
1574
1715
  except Exception as e:
@@ -1580,9 +1721,9 @@ def scan(
1580
1721
 
1581
1722
 
1582
1723
  @cli.command()
1583
- @click.argument("data_source", type=str)
1724
+ @click.argument("data_source", type=str, required=False)
1584
1725
  @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
1585
- def missing(data_source: str, output_html: str | None):
1726
+ def missing(data_source: str | None, output_html: str | None):
1586
1727
  """
1587
1728
  Generate a missing values report for a data table.
1588
1729
 
@@ -1594,13 +1735,55 @@ def missing(data_source: str, output_html: str | None):
1594
1735
  - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1595
1736
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1596
1737
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1738
+ - Piped data from pb pl command
1597
1739
  """
1598
1740
  try:
1741
+ import sys
1742
+
1743
+ # Handle piped input
1744
+ if data_source is None:
1745
+ if not sys.stdin.isatty():
1746
+ # Data is being piped in - read the file path from stdin
1747
+ piped_input = sys.stdin.read().strip()
1748
+ if piped_input:
1749
+ data_source = piped_input
1750
+
1751
+ # Determine the format from the file extension
1752
+ if piped_input.endswith(".parquet"):
1753
+ format_type = "Parquet"
1754
+ elif piped_input.endswith(".csv"):
1755
+ format_type = "CSV"
1756
+ else:
1757
+ format_type = "unknown"
1758
+
1759
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1760
+ else:
1761
+ console.print("[red]Error:[/red] No data provided via pipe")
1762
+ sys.exit(1)
1763
+ else:
1764
+ # Show concise help and exit
1765
+ _show_concise_help("missing", None)
1766
+ return
1767
+
1599
1768
  with console.status("[bold green]Loading data..."):
1600
1769
  # Load the data source using the centralized function
1601
1770
  data = _load_data_source(data_source)
1602
1771
 
1603
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1772
+ # Check if this is a piped data source and create friendly display name
1773
+ is_piped_data = _is_piped_data_source(data_source)
1774
+
1775
+ if is_piped_data:
1776
+ if data_source.endswith(".parquet"):
1777
+ display_source = "Parquet file via `pb pl`"
1778
+ elif data_source.endswith(".csv"):
1779
+ display_source = "CSV file via `pb pl`"
1780
+ else:
1781
+ display_source = "File via `pb pl`"
1782
+ console.print(
1783
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
1784
+ )
1785
+ else:
1786
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1604
1787
 
1605
1788
  # Generate missing values table
1606
1789
  with console.status("[bold green]Analyzing missing values..."):
@@ -1616,7 +1799,38 @@ def missing(data_source: str, output_html: str | None):
1616
1799
  console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
1617
1800
  else:
1618
1801
  # Display in terminal with special missing values formatting
1619
- _rich_print_missing_table(gt_table, original_data)
1802
+ # Create enhanced context info for missing table display
1803
+ missing_info = {}
1804
+ try:
1805
+ # Determine source type and table type for enhanced preview title
1806
+ if is_piped_data:
1807
+ if data_source.endswith(".parquet"):
1808
+ source_type = "Polars expression (serialized to Parquet) from `pb pl`"
1809
+ elif data_source.endswith(".csv"):
1810
+ source_type = "Polars expression (serialized to CSV) from `pb pl`"
1811
+ else:
1812
+ source_type = "Polars expression from `pb pl`"
1813
+ elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1814
+ source_type = f"Pointblank dataset: {data_source}"
1815
+ else:
1816
+ source_type = f"External source: {data_source}"
1817
+
1818
+ missing_info = {
1819
+ "source_type": source_type,
1820
+ "table_type": _get_tbl_type(original_data),
1821
+ "total_rows": pb.get_row_count(original_data),
1822
+ "total_columns": pb.get_column_count(original_data),
1823
+ }
1824
+ except Exception:
1825
+ # Use defaults if metadata extraction fails
1826
+ missing_info = {
1827
+ "source_type": f"Data source: {data_source}",
1828
+ "table_type": "unknown",
1829
+ "total_rows": None,
1830
+ "total_columns": None,
1831
+ }
1832
+
1833
+ _rich_print_missing_table_enhanced(gt_table, original_data, missing_info)
1620
1834
 
1621
1835
  except Exception as e:
1622
1836
  console.print(f"[red]Error:[/red] {e}")
@@ -1741,6 +1955,8 @@ def validate(
1741
1955
  pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
1742
1956
  """
1743
1957
  try:
1958
+ import sys
1959
+
1744
1960
  # Handle --list-checks option early (doesn't need data source)
1745
1961
  if list_checks:
1746
1962
  console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
@@ -1797,13 +2013,31 @@ def validate(
1797
2013
  sys.exit(0)
1798
2014
 
1799
2015
  # Check if data_source is provided (required for all operations except --list-checks)
2016
+ # or if we have piped input
1800
2017
  if data_source is None:
1801
- console.print("[red]Error:[/red] DATA_SOURCE is required")
1802
- console.print("Use 'pb validate --help' for usage information")
1803
- console.print("Or use 'pb validate --list-checks' to see available validation types")
1804
- import sys
2018
+ # Check if we have piped input
2019
+ if not sys.stdin.isatty():
2020
+ # Data is being piped in: read the file path from stdin
2021
+ piped_input = sys.stdin.read().strip()
2022
+ if piped_input:
2023
+ data_source = piped_input
2024
+
2025
+ # Determine the format from the file extension
2026
+ if piped_input.endswith(".parquet"):
2027
+ format_type = "Parquet"
2028
+ elif piped_input.endswith(".csv"):
2029
+ format_type = "CSV"
2030
+ else:
2031
+ format_type = "unknown"
1805
2032
 
1806
- sys.exit(1)
2033
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
2034
+ else:
2035
+ console.print("[red]Error:[/red] No data provided via pipe")
2036
+ sys.exit(1)
2037
+ else:
2038
+ # Show concise help and exit
2039
+ _show_concise_help("validate", None)
2040
+ return
1807
2041
 
1808
2042
  # Handle backward compatibility and parameter conversion
1809
2043
  import sys
@@ -1911,7 +2145,25 @@ def validate(
1911
2145
  checks_list, columns_list, sets_list, values_list
1912
2146
  )
1913
2147
 
1914
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
2148
+ # Check if this is a piped data source and create friendly display name
2149
+ is_piped_data = (
2150
+ data_source
2151
+ and data_source.startswith("/var/folders/")
2152
+ and ("pb_pipe_" in data_source or "/T/" in data_source)
2153
+ )
2154
+
2155
+ if is_piped_data:
2156
+ if data_source.endswith(".parquet"):
2157
+ display_source = "Parquet file via `pb pl`"
2158
+ elif data_source.endswith(".csv"):
2159
+ display_source = "CSV file via `pb pl`"
2160
+ else:
2161
+ display_source = "File via `pb pl`"
2162
+ console.print(
2163
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
2164
+ )
2165
+ else:
2166
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1915
2167
 
1916
2168
  # Build a single validation object with chained checks
1917
2169
  with console.status(f"[bold green]Running {len(checks_list)} validation check(s)..."):
@@ -2134,136 +2386,339 @@ def requirements():
2134
2386
  console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
2135
2387
 
2136
2388
 
2137
- def _rich_print_scan_table(
2138
- scan_result: Any,
2139
- data_source: str,
2140
- source_type: str,
2141
- table_type: str,
2142
- total_rows: int | None = None,
2143
- total_columns: int | None = None,
2389
+ def _rich_print_missing_table_enhanced(
2390
+ gt_table: Any, original_data: Any = None, missing_info: dict = None
2144
2391
  ) -> None:
2145
- """
2146
- Display scan results as a Rich table in the terminal with statistical measures.
2392
+ """Convert a missing values GT table to Rich table with enhanced formatting and metadata.
2147
2393
 
2148
2394
  Args:
2149
- scan_result: The GT object from col_summary_tbl()
2150
- data_source: Name of the data source being scanned
2151
- source_type: Type of data source (e.g., "Pointblank dataset: small_table")
2152
- table_type: Type of table (e.g., "polars.LazyFrame")
2153
- total_rows: Total number of rows in the dataset
2154
- total_columns: Total number of columns in the dataset
2395
+ gt_table: The GT table object for missing values
2396
+ original_data: The original data source to extract column types
2397
+ missing_info: Dict with metadata including source_type, table_type, total_rows, total_columns
2155
2398
  """
2156
2399
  try:
2157
- import re
2400
+ # Extract the underlying data from the GT table
2401
+ df = None
2158
2402
 
2159
- import narwhals as nw
2160
- from rich.box import SIMPLE_HEAD
2403
+ if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
2404
+ df = gt_table._tbl_data
2405
+ elif hasattr(gt_table, "_data") and gt_table._data is not None:
2406
+ df = gt_table._data
2407
+ elif hasattr(gt_table, "data") and gt_table.data is not None:
2408
+ df = gt_table.data
2161
2409
 
2162
- # Extract the underlying DataFrame from the GT object
2163
- # The GT object has a _tbl_data attribute that contains the DataFrame
2164
- gt_data = scan_result._tbl_data
2410
+ if df is not None:
2411
+ from rich.box import SIMPLE_HEAD
2165
2412
 
2166
- # Convert to Narwhals DataFrame for consistent handling
2167
- nw_data = nw.from_native(gt_data)
2413
+ # Extract metadata from missing_info or use defaults
2414
+ source_type = "Data source"
2415
+ table_type = "unknown"
2416
+ total_rows = None
2417
+ total_columns = None
2168
2418
 
2169
- # Convert to dictionary for easier access
2170
- data_dict = nw_data.to_dict(as_series=False)
2419
+ if missing_info:
2420
+ source_type = missing_info.get("source_type", "Data source")
2421
+ table_type = missing_info.get("table_type", "unknown")
2422
+ total_rows = missing_info.get("total_rows")
2423
+ total_columns = missing_info.get("total_columns")
2171
2424
 
2172
- # Create main scan table with missing data table styling
2173
- # Create a comprehensive title with data source, source type, and table type
2174
- title_text = f"Column Summary / {source_type} / {table_type}"
2425
+ # Create enhanced title matching the scan table format
2426
+ title_text = f"Missing Values / {source_type} / {table_type}"
2175
2427
 
2176
- # Add dimensions subtitle in gray if available
2177
- if total_rows is not None and total_columns is not None:
2178
- title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2428
+ # Add dimensions subtitle in gray if available
2429
+ if total_rows is not None and total_columns is not None:
2430
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2179
2431
 
2180
- # Create the scan table
2181
- scan_table = Table(
2182
- title=title_text,
2183
- show_header=True,
2184
- header_style="bold magenta",
2185
- box=SIMPLE_HEAD,
2186
- title_style="bold cyan",
2187
- title_justify="left",
2188
- )
2432
+ # Get column names
2433
+ columns = []
2434
+ try:
2435
+ if hasattr(df, "columns"):
2436
+ columns = list(df.columns)
2437
+ elif hasattr(df, "schema"):
2438
+ columns = list(df.schema.names)
2439
+ except Exception as e:
2440
+ console.print(f"[red]Error getting columns:[/red] {e}")
2441
+ columns = []
2189
2442
 
2190
- # Add columns with specific styling and appropriate widths
2191
- scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2192
- scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2193
- scan_table.add_column(
2194
- "NA", style="red", width=6, justify="right"
2195
- ) # Adjusted for better formatting
2196
- scan_table.add_column(
2197
- "UQ", style="green", width=8, justify="right"
2198
- ) # Adjusted for boolean values
2443
+ if not columns:
2444
+ columns = [f"Column {i + 1}" for i in range(10)] # Fallback
2199
2445
 
2200
- # Add statistical columns if they exist with appropriate widths
2201
- stat_columns = []
2202
- column_mapping = {
2203
- "mean": ("Mean", "blue", 9),
2204
- "std": ("SD", "blue", 9),
2205
- "min": ("Min", "yellow", 9),
2206
- "median": ("Med", "yellow", 9),
2207
- "max": ("Max", "yellow", 9),
2208
- "q_1": ("Q₁", "magenta", 8),
2209
- "q_3": ("Q₃", "magenta", 9),
2210
- "iqr": ("IQR", "magenta", 8),
2211
- }
2446
+ # Get original data to extract column types
2447
+ column_types = {}
2448
+ if original_data is not None:
2449
+ try:
2450
+ # Get column types from original data
2451
+ if hasattr(original_data, "columns"):
2452
+ original_columns = list(original_data.columns)
2453
+ column_types = _get_column_dtypes(original_data, original_columns)
2454
+ except Exception as e:
2455
+ console.print(f"[red]Error getting column types:[/red] {e}")
2456
+ pass # Use empty dict as fallback
2212
2457
 
2213
- for col_key, (display_name, color, width) in column_mapping.items():
2214
- if col_key in data_dict:
2215
- scan_table.add_column(display_name, style=color, width=width, justify="right")
2216
- stat_columns.append(col_key)
2458
+ # Add columns to Rich table with special formatting for missing values table
2459
+ sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
2217
2460
 
2218
- # Helper function to extract column name and type from HTML
2219
- def extract_column_info(html_content: str) -> tuple[str, str]:
2220
- """Extract column name and type from HTML formatted content."""
2221
- # Extract column name from first div
2222
- name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
2223
- column_name = name_match.group(1) if name_match else "Unknown"
2461
+ # Print the title first
2462
+ console.print()
2463
+ console.print(f"[bold cyan]{title_text}[/bold cyan]")
2224
2464
 
2225
- # Extract data type from second div (with gray color)
2226
- type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
2227
- if type_match:
2228
- data_type = type_match.group(1)
2229
- # Convert to compact format using the existing function
2230
- compact_type = _format_dtype_compact(data_type)
2231
- data_type = compact_type
2232
- else:
2233
- data_type = "unknown"
2465
+ # Show the custom spanner header if we have sector columns
2466
+ if sector_columns:
2467
+ # Create a custom header line that shows the spanner
2468
+ header_parts = []
2469
+ header_parts.append(" " * 20) # Space for Column header
2470
+ header_parts.append(" " * 10) # Space for Type header
2234
2471
 
2235
- return column_name, data_type
2472
+ # Left-align "Row Sectors" with the first numbered column
2473
+ row_sectors_text = "Row Sectors"
2474
+ header_parts.append(row_sectors_text)
2236
2475
 
2237
- # Helper function to format values with improved number formatting
2238
- def format_value(
2239
- value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
2240
- ) -> str:
2241
- """Format values for display with smart number formatting and HTML cleanup."""
2242
- if value is None or (isinstance(value, str) and value.strip() == ""):
2243
- return "[dim]—[/dim]"
2476
+ # Print the custom spanner header
2477
+ console.print("[dim]" + " ".join(header_parts) + "[/dim]")
2244
2478
 
2245
- # Handle missing values indicator
2246
- if is_missing and str(value) == "0":
2247
- return "[green]●[/green]" # No missing values
2479
+ # Add a horizontal rule below the spanner
2480
+ rule_parts = []
2481
+ rule_parts.append(" " * 20) # Space for Column header
2482
+ rule_parts.append(" " * 10) # Space for Type header
2248
2483
 
2249
- # Clean up HTML formatting from the raw data
2250
- str_val = str(value)
2484
+ # Use a fixed width horizontal rule for "Row Sectors"
2485
+ horizontal_rule = "─" * 20
2486
+ rule_parts.append(horizontal_rule)
2251
2487
 
2252
- # Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
2253
- if "<br>" in str_val:
2254
- str_val = str_val.split("<br>")[0].strip()
2255
- # For unique values, we want just the integer part
2256
- if is_unique:
2257
- try:
2258
- # Try to extract just the integer part for unique counts
2259
- num_val = float(str_val)
2260
- return str(int(num_val))
2261
- except (ValueError, TypeError):
2262
- pass
2488
+ # Print the horizontal rule
2489
+ console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
2263
2490
 
2264
- # Now handle HTML content (especially from boolean unique values)
2265
- if "<" in str_val and ">" in str_val:
2266
- # Remove HTML tags completely for cleaner display
2491
+ # Create the missing values table WITHOUT the title (since we printed it above)
2492
+ rich_table = Table(
2493
+ show_header=True,
2494
+ header_style="bold magenta",
2495
+ box=SIMPLE_HEAD,
2496
+ )
2497
+
2498
+ # Two separate columns: Column name (20 chars) and Data type (10 chars)
2499
+ rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2500
+ rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2501
+
2502
+ # Sector columns: All same width, optimized for "100%" (4 chars + padding)
2503
+ for sector in sector_columns:
2504
+ rich_table.add_column(
2505
+ sector,
2506
+ style="cyan",
2507
+ justify="center",
2508
+ no_wrap=True,
2509
+ width=5, # Fixed width optimized for percentage values
2510
+ )
2511
+
2512
+ # Convert data to rows with special formatting
2513
+ rows = []
2514
+ try:
2515
+ if hasattr(df, "to_dicts"):
2516
+ data_dict = df.to_dicts()
2517
+ elif hasattr(df, "to_dict"):
2518
+ data_dict = df.to_dict("records")
2519
+ else:
2520
+ data_dict = []
2521
+
2522
+ for i, row in enumerate(data_dict):
2523
+ try:
2524
+ # Each row should have: [column_name, data_type, sector1, sector2, ...]
2525
+ column_name = str(row.get("columns", ""))
2526
+
2527
+ # Truncate column name to 20 characters with ellipsis if needed
2528
+ if len(column_name) > 20:
2529
+ truncated_name = column_name[:17] + "…"
2530
+ else:
2531
+ truncated_name = column_name
2532
+
2533
+ # Get data type for this column
2534
+ if column_name in column_types:
2535
+ dtype = column_types[column_name]
2536
+ if len(dtype) > 10:
2537
+ truncated_dtype = dtype[:9] + "…"
2538
+ else:
2539
+ truncated_dtype = dtype
2540
+ else:
2541
+ truncated_dtype = "?"
2542
+
2543
+ # Start building the row with column name and type
2544
+ formatted_row = [truncated_name, truncated_dtype]
2545
+
2546
+ # Add sector values (formatted percentages)
2547
+ for sector in sector_columns:
2548
+ value = row.get(sector, 0.0)
2549
+ if isinstance(value, (int, float)):
2550
+ formatted_row.append(_format_missing_percentage(float(value)))
2551
+ else:
2552
+ formatted_row.append(str(value))
2553
+
2554
+ rows.append(formatted_row)
2555
+
2556
+ except Exception as e:
2557
+ console.print(f"[red]Error processing row {i}:[/red] {e}")
2558
+ continue
2559
+
2560
+ except Exception as e:
2561
+ console.print(f"[red]Error extracting data:[/red] {e}")
2562
+ rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
2563
+
2564
+ # Add rows to Rich table
2565
+ for row in rows:
2566
+ try:
2567
+ rich_table.add_row(*row)
2568
+ except Exception as e:
2569
+ console.print(f"[red]Error adding row:[/red] {e}")
2570
+ break
2571
+
2572
+ # Print the Rich table (without title since we already printed it)
2573
+ console.print(rich_table)
2574
+
2575
+ footer_text = (
2576
+ "[dim]Symbols: [green]●[/green] = no missing vals in sector, "
2577
+ "[red]●[/red] = all vals completely missing, "
2578
+ "[cyan]x%[/cyan] = percentage missing[/dim]"
2579
+ )
2580
+ console.print(footer_text)
2581
+
2582
+ else:
2583
+ # Fallback to regular table display
2584
+ _rich_print_gt_table(gt_table)
2585
+
2586
+ except Exception as e:
2587
+ console.print(f"[red]Error rendering missing values table:[/red] {e}")
2588
+ # Fallback to regular table display
2589
+ _rich_print_gt_table(gt_table)
2590
+
2591
+
2592
+ def _rich_print_scan_table(
2593
+ scan_result: Any,
2594
+ data_source: str,
2595
+ source_type: str,
2596
+ table_type: str,
2597
+ total_rows: int | None = None,
2598
+ total_columns: int | None = None,
2599
+ ) -> None:
2600
+ """
2601
+ Display scan results as a Rich table in the terminal with statistical measures.
2602
+
2603
+ Args:
2604
+ scan_result: The GT object from col_summary_tbl()
2605
+ data_source: Name of the data source being scanned
2606
+ source_type: Type of data source (e.g., "Pointblank dataset: small_table")
2607
+ table_type: Type of table (e.g., "polars.LazyFrame")
2608
+ total_rows: Total number of rows in the dataset
2609
+ total_columns: Total number of columns in the dataset
2610
+ """
2611
+ try:
2612
+ import re
2613
+
2614
+ import narwhals as nw
2615
+ from rich.box import SIMPLE_HEAD
2616
+
2617
+ # Extract the underlying DataFrame from the GT object
2618
+ # The GT object has a _tbl_data attribute that contains the DataFrame
2619
+ gt_data = scan_result._tbl_data
2620
+
2621
+ # Convert to Narwhals DataFrame for consistent handling
2622
+ nw_data = nw.from_native(gt_data)
2623
+
2624
+ # Convert to dictionary for easier access
2625
+ data_dict = nw_data.to_dict(as_series=False)
2626
+
2627
+ # Create main scan table with missing data table styling
2628
+ # Create a comprehensive title with data source, source type, and table type
2629
+ title_text = f"Column Summary / {source_type} / {table_type}"
2630
+
2631
+ # Add dimensions subtitle in gray if available
2632
+ if total_rows is not None and total_columns is not None:
2633
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2634
+
2635
+ # Create the scan table
2636
+ scan_table = Table(
2637
+ title=title_text,
2638
+ show_header=True,
2639
+ header_style="bold magenta",
2640
+ box=SIMPLE_HEAD,
2641
+ title_style="bold cyan",
2642
+ title_justify="left",
2643
+ )
2644
+
2645
+ # Add columns with specific styling and appropriate widths
2646
+ scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2647
+ scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2648
+ scan_table.add_column(
2649
+ "NA", style="red", width=6, justify="right"
2650
+ ) # Adjusted for better formatting
2651
+ scan_table.add_column(
2652
+ "UQ", style="green", width=8, justify="right"
2653
+ ) # Adjusted for boolean values
2654
+
2655
+ # Add statistical columns if they exist with appropriate widths
2656
+ stat_columns = []
2657
+ column_mapping = {
2658
+ "mean": ("Mean", "blue", 9),
2659
+ "std": ("SD", "blue", 9),
2660
+ "min": ("Min", "yellow", 9),
2661
+ "median": ("Med", "yellow", 9),
2662
+ "max": ("Max", "yellow", 9),
2663
+ "q_1": ("Q₁", "magenta", 8),
2664
+ "q_3": ("Q₃", "magenta", 9),
2665
+ "iqr": ("IQR", "magenta", 8),
2666
+ }
2667
+
2668
+ for col_key, (display_name, color, width) in column_mapping.items():
2669
+ if col_key in data_dict:
2670
+ scan_table.add_column(display_name, style=color, width=width, justify="right")
2671
+ stat_columns.append(col_key)
2672
+
2673
+ # Helper function to extract column name and type from HTML
2674
+ def extract_column_info(html_content: str) -> tuple[str, str]:
2675
+ """Extract column name and type from HTML formatted content."""
2676
+ # Extract column name from first div
2677
+ name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
2678
+ column_name = name_match.group(1) if name_match else "Unknown"
2679
+
2680
+ # Extract data type from second div (with gray color)
2681
+ type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
2682
+ if type_match:
2683
+ data_type = type_match.group(1)
2684
+ # Convert to compact format using the existing function
2685
+ compact_type = _format_dtype_compact(data_type)
2686
+ data_type = compact_type
2687
+ else:
2688
+ data_type = "unknown"
2689
+
2690
+ return column_name, data_type
2691
+
2692
+ # Helper function to format values with improved number formatting
2693
+ def format_value(
2694
+ value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
2695
+ ) -> str:
2696
+ """Format values for display with smart number formatting and HTML cleanup."""
2697
+ if value is None or (isinstance(value, str) and value.strip() == ""):
2698
+ return "[dim]—[/dim]"
2699
+
2700
+ # Handle missing values indicator
2701
+ if is_missing and str(value) == "0":
2702
+ return "[green]●[/green]" # No missing values
2703
+
2704
+ # Clean up HTML formatting from the raw data
2705
+ str_val = str(value)
2706
+
2707
+ # Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
2708
+ if "<br>" in str_val:
2709
+ str_val = str_val.split("<br>")[0].strip()
2710
+ # For unique values, we want just the integer part
2711
+ if is_unique:
2712
+ try:
2713
+ # Try to extract just the integer part for unique counts
2714
+ num_val = float(str_val)
2715
+ return str(int(num_val))
2716
+ except (ValueError, TypeError):
2717
+ pass
2718
+
2719
+ # Now handle HTML content (especially from boolean unique values)
2720
+ if "<" in str_val and ">" in str_val:
2721
+ # Remove HTML tags completely for cleaner display
2267
2722
  str_val = re.sub(r"<[^>]+>", "", str_val).strip()
2268
2723
  # Clean up extra whitespace
2269
2724
  str_val = re.sub(r"\s+", " ", str_val).strip()
@@ -2423,8 +2878,36 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
2423
2878
  if df is not None:
2424
2879
  from rich.box import SIMPLE_HEAD
2425
2880
 
2426
- # Create the missing values table
2427
- rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
2881
+ # Get metadata for enhanced missing table title
2882
+ total_rows = None
2883
+ total_columns = None
2884
+ source_type = "Data source"
2885
+ table_type = "unknown"
2886
+
2887
+ if original_data is not None:
2888
+ try:
2889
+ total_rows = pb.get_row_count(original_data)
2890
+ total_columns = pb.get_column_count(original_data)
2891
+ table_type = _get_tbl_type(original_data)
2892
+ except Exception:
2893
+ pass
2894
+
2895
+ # Create enhanced title matching the scan table format
2896
+ title_text = f"Missing Values / {source_type} / {table_type}"
2897
+
2898
+ # Add dimensions subtitle in gray if available
2899
+ if total_rows is not None and total_columns is not None:
2900
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2901
+
2902
+ # Create the missing values table with enhanced title
2903
+ rich_table = Table(
2904
+ title=title_text,
2905
+ show_header=True,
2906
+ header_style="bold magenta",
2907
+ box=SIMPLE_HEAD,
2908
+ title_style="bold cyan",
2909
+ title_justify="left",
2910
+ )
2428
2911
 
2429
2912
  # Get column names
2430
2913
  columns = []
@@ -2556,12 +3039,12 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
2556
3039
  console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
2557
3040
 
2558
3041
  # Print the Rich table (will handle terminal width automatically)
3042
+ console.print()
2559
3043
  console.print(rich_table)
2560
3044
  footer_text = (
2561
- "[dim]Symbols: [green]●[/green] = no missing values, "
2562
- "[red]●[/red] = completely missing, "
2563
- "<1% = less than 1% missing, "
2564
- ">99% = more than 99% missing[/dim]"
3045
+ "[dim]Symbols: [green]●[/green] = no missing vals in sector, "
3046
+ "[red]●[/red] = all vals completely missing, "
3047
+ "[cyan]x%[/cyan] = percentage missing[/dim]"
2565
3048
  )
2566
3049
  console.print(footer_text)
2567
3050
 
@@ -2700,6 +3183,20 @@ def _display_validation_result(
2700
3183
  set_val = sets_list[step_index] if step_index < len(sets_list) else None
2701
3184
  value = values_list[step_index] if step_index < len(values_list) else None
2702
3185
 
3186
+ # Check if this is piped data
3187
+ is_piped_data = _is_piped_data_source(data_source)
3188
+
3189
+ # Create friendly display name for data source
3190
+ if is_piped_data:
3191
+ if data_source.endswith(".parquet"):
3192
+ display_source = "Polars expression (serialized to Parquet) from `pb pl`"
3193
+ elif data_source.endswith(".csv"):
3194
+ display_source = "Polars expression (serialized to CSV) from `pb pl`"
3195
+ else:
3196
+ display_source = "Polars expression from `pb pl`"
3197
+ else:
3198
+ display_source = data_source
3199
+
2703
3200
  # Get validation step info
2704
3201
  step_info = None
2705
3202
  if hasattr(validation, "validation_info") and len(validation.validation_info) > step_index:
@@ -2766,7 +3263,7 @@ def _display_validation_result(
2766
3263
  result_table.add_column("Value", style="white")
2767
3264
 
2768
3265
  # Add basic info
2769
- result_table.add_row("Data Source", data_source)
3266
+ result_table.add_row("Data Source", display_source)
2770
3267
  result_table.add_row("Check Type", check)
2771
3268
 
2772
3269
  # Add column info for column-specific checks
@@ -3128,6 +3625,18 @@ def _show_extract_and_summary(
3128
3625
  """Show extract and summary for a validation step (used for single checks)."""
3129
3626
  step_passed = step_info.n_failed == 0 if step_info else True
3130
3627
 
3628
+ # Get the friendly display name
3629
+ is_piped_data = _is_piped_data_source(data_source)
3630
+ if is_piped_data:
3631
+ if data_source.endswith(".parquet"):
3632
+ display_source = "Polars expression (serialized to Parquet) from `pb pl`"
3633
+ elif data_source.endswith(".csv"):
3634
+ display_source = "Polars expression (serialized to CSV) from `pb pl`"
3635
+ else:
3636
+ display_source = "Polars expression from `pb pl`"
3637
+ else:
3638
+ display_source = data_source
3639
+
3131
3640
  # Show extract if requested and validation failed
3132
3641
  if (show_extract or write_extract) and not step_passed:
3133
3642
  console.print()
@@ -3281,54 +3790,54 @@ def _show_extract_and_summary(
3281
3790
  if step_passed:
3282
3791
  if check == "rows-distinct":
3283
3792
  success_message = (
3284
- f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
3793
+ f"[green]✓ Validation PASSED: No duplicate rows found in {display_source}[/green]"
3285
3794
  )
3286
3795
  elif check == "col-vals-not-null":
3287
- success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
3796
+ success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {display_source}[/green]"
3288
3797
  elif check == "rows-complete":
3289
- success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
3798
+ success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {display_source}[/green]"
3290
3799
  elif check == "col-exists":
3291
3800
  success_message = (
3292
- f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
3801
+ f"[green]✓ Validation PASSED: Column '{column}' exists in {display_source}[/green]"
3293
3802
  )
3294
3803
  elif check == "col-vals-in-set":
3295
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
3804
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {display_source}[/green]"
3296
3805
  elif check == "col-vals-gt":
3297
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
3806
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {display_source}[/green]"
3298
3807
  elif check == "col-vals-ge":
3299
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
3808
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {display_source}[/green]"
3300
3809
  elif check == "col-vals-lt":
3301
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
3810
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {display_source}[/green]"
3302
3811
  elif check == "col-vals-le":
3303
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
3812
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {display_source}[/green]"
3304
3813
  else:
3305
3814
  success_message = (
3306
- f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
3815
+ f"[green]✓ Validation PASSED: {check} check passed for {display_source}[/green]"
3307
3816
  )
3308
3817
 
3309
3818
  console.print(Panel(success_message, border_style="green", expand=False))
3310
3819
  else:
3311
3820
  if step_info:
3312
3821
  if check == "rows-distinct":
3313
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
3822
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {display_source}[/red]"
3314
3823
  elif check == "col-vals-not-null":
3315
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
3824
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {display_source}[/red]"
3316
3825
  elif check == "rows-complete":
3317
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
3826
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {display_source}[/red]"
3318
3827
  elif check == "col-exists":
3319
- failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
3828
+ failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {display_source}[/red]"
3320
3829
  elif check == "col-vals-in-set":
3321
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
3830
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {display_source}[/red]"
3322
3831
  elif check == "col-vals-gt":
3323
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
3832
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {display_source}[/red]"
3324
3833
  elif check == "col-vals-ge":
3325
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
3834
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {display_source}[/red]"
3326
3835
  elif check == "col-vals-lt":
3327
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
3836
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {display_source}[/red]"
3328
3837
  elif check == "col-vals-le":
3329
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
3838
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {display_source}[/red]"
3330
3839
  else:
3331
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
3840
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {display_source}[/red]"
3332
3841
 
3333
3842
  # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
3334
3843
  if not show_extract and check != "col-exists":
@@ -3338,15 +3847,15 @@ def _show_extract_and_summary(
3338
3847
  else:
3339
3848
  if check == "rows-distinct":
3340
3849
  failure_message = (
3341
- f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
3850
+ f"[red]✗ Validation FAILED: Duplicate rows found in {display_source}[/red]"
3342
3851
  )
3343
3852
  elif check == "rows-complete":
3344
3853
  failure_message = (
3345
- f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
3854
+ f"[red]✗ Validation FAILED: Incomplete rows found in {display_source}[/red]"
3346
3855
  )
3347
3856
  else:
3348
3857
  failure_message = (
3349
- f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
3858
+ f"[red]✗ Validation FAILED: {check} check failed for {display_source}[/red]"
3350
3859
  )
3351
3860
 
3352
3861
  # Add hint about --show-extract if not already used
@@ -3357,8 +3866,8 @@ def _show_extract_and_summary(
3357
3866
 
3358
3867
 
3359
3868
  @cli.command()
3360
- @click.argument("output_file", type=click.Path())
3361
- def make_template(output_file: str):
3869
+ @click.argument("output_file", type=click.Path(), required=False)
3870
+ def make_template(output_file: str | None):
3362
3871
  """
3363
3872
  Create a validation script template.
3364
3873
 
@@ -3374,6 +3883,11 @@ def make_template(output_file: str):
3374
3883
  pb make-template my_validation.py
3375
3884
  pb make-template validation_template.py
3376
3885
  """
3886
+ # Handle missing output_file with concise help
3887
+ if output_file is None:
3888
+ _show_concise_help("make-template", None)
3889
+ return
3890
+
3377
3891
  example_script = '''"""
3378
3892
  Example Pointblank validation script.
3379
3893
 
@@ -3437,7 +3951,7 @@ validation = (
3437
3951
 
3438
3952
 
3439
3953
  @cli.command()
3440
- @click.argument("validation_script", type=click.Path(exists=True))
3954
+ @click.argument("validation_script", type=click.Path(exists=True), required=False)
3441
3955
  @click.option(
3442
3956
  "--data",
3443
3957
  type=str,
@@ -3462,7 +3976,7 @@ validation = (
3462
3976
  help="Exit with non-zero code when validation reaches this threshold level",
3463
3977
  )
3464
3978
  def run(
3465
- validation_script: str,
3979
+ validation_script: str | None,
3466
3980
  data: str | None,
3467
3981
  output_html: str | None,
3468
3982
  output_json: str | None,
@@ -3503,6 +4017,11 @@ def run(
3503
4017
  pb run validation_script.py --write-extract extracts_folder --fail-on critical
3504
4018
  """
3505
4019
  try:
4020
+ # Handle missing validation_script with concise help
4021
+ if validation_script is None:
4022
+ _show_concise_help("run", None)
4023
+ return
4024
+
3506
4025
  # Load optional data override if provided
3507
4026
  cli_data = None
3508
4027
  if data:
@@ -3902,3 +4421,768 @@ def _format_missing_percentage(value: float) -> str:
3902
4421
  return ">99%" # More than 99%
3903
4422
  else:
3904
4423
  return f"{int(round(value))}%" # Round to nearest integer with % sign
4424
+
4425
+
4426
+ @cli.command()
4427
+ @click.argument("polars_expression", type=str, required=False)
4428
+ @click.option("--edit", "-e", is_flag=True, help="Open editor for multi-line input")
4429
+ @click.option("--file", "-f", type=click.Path(exists=True), help="Read query from file")
4430
+ @click.option(
4431
+ "--editor", help="Editor to use for --edit mode (overrides $EDITOR and auto-detection)"
4432
+ )
4433
+ @click.option(
4434
+ "--output-format",
4435
+ "-o",
4436
+ type=click.Choice(["preview", "scan", "missing", "info"]),
4437
+ default="preview",
4438
+ help="Output format for the result",
4439
+ )
4440
+ @click.option("--preview-head", default=5, help="Number of head rows for preview")
4441
+ @click.option("--preview-tail", default=5, help="Number of tail rows for preview")
4442
+ @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
4443
+ @click.option(
4444
+ "--pipe", is_flag=True, help="Output data in a format suitable for piping to other pb commands"
4445
+ )
4446
+ @click.option(
4447
+ "--pipe-format",
4448
+ type=click.Choice(["parquet", "csv"]),
4449
+ default="parquet",
4450
+ help="Format for piped output (default: parquet)",
4451
+ )
4452
+ def pl(
4453
+ polars_expression: str | None,
4454
+ edit: bool,
4455
+ file: str | None,
4456
+ editor: str | None,
4457
+ output_format: str,
4458
+ preview_head: int,
4459
+ preview_tail: int,
4460
+ output_html: str | None,
4461
+ pipe: bool,
4462
+ pipe_format: str,
4463
+ ):
4464
+ """
4465
+ Execute Polars expressions and display results.
4466
+
4467
+ Execute Polars DataFrame operations from the command line and display
4468
+ the results using Pointblank's visualization tools.
4469
+
4470
+ POLARS_EXPRESSION should be a valid Polars expression that returns a DataFrame.
4471
+ The 'pl' module is automatically imported and available.
4472
+
4473
+ Examples:
4474
+
4475
+ \b
4476
+ # Direct expression
4477
+ pb pl "pl.read_csv('data.csv')"
4478
+ pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
4479
+ pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"
4480
+
4481
+ # Multi-line with editor (supports multiple statements)
4482
+ pb pl --edit
4483
+
4484
+ # Multi-statement code example in editor:
4485
+ # csv = pl.read_csv('data.csv')
4486
+ # result = csv.select(['name', 'age']).filter(pl.col('age') > 25)
4487
+
4488
+ # Multi-line with a specific editor
4489
+ pb pl --edit --editor nano
4490
+ pb pl --edit --editor code
4491
+ pb pl --edit --editor micro
4492
+
4493
+ # From file
4494
+ pb pl --file query.py
4495
+
4496
+ # Piping to other pb commands
4497
+ pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)" --pipe | pb validate --check rows-distinct
4498
+ pb pl --edit --pipe | pb preview --head 10
4499
+ pb pl --edit --pipe | pb scan --output-html report.html
4500
+ pb pl --edit --pipe | pb missing --output-html missing_report.html
4501
+
4502
+ Use --output-format to change how results are displayed:
4503
+
4504
+ \b
4505
+ pb pl "pl.read_csv('data.csv')" --output-format scan
4506
+ pb pl "pl.read_csv('data.csv')" --output-format missing
4507
+ pb pl "pl.read_csv('data.csv')" --output-format info
4508
+
4509
+ Note: For multi-statement code, assign your final result to a variable like
4510
+ 'result', 'df', 'data', or make it the last assignment.
4511
+ """
4512
+ try:
4513
+ # Check if Polars is available
4514
+ if not _is_lib_present("polars"):
4515
+ console.print("[red]Error:[/red] Polars is not installed")
4516
+ console.print("\nThe 'pb pl' command requires Polars to be installed.")
4517
+ console.print("Install it with: [cyan]pip install polars[/cyan]")
4518
+ console.print("\nTo check all dependency status, run: [cyan]pb requirements[/cyan]")
4519
+ sys.exit(1)
4520
+
4521
+ import polars as pl
4522
+
4523
+ # Determine the source of the query
4524
+ query_code = None
4525
+
4526
+ if file:
4527
+ # Read from file
4528
+ query_code = Path(file).read_text()
4529
+ elif edit:
4530
+ # Determine which editor to use
4531
+ chosen_editor = editor or _get_best_editor()
4532
+
4533
+ # When piping, send editor message to stderr
4534
+ if pipe:
4535
+ print(f"Using editor: {chosen_editor}", file=sys.stderr)
4536
+ else:
4537
+ console.print(f"[dim]Using editor: {chosen_editor}[/dim]")
4538
+
4539
+ # Interactive editor with custom editor
4540
+ if chosen_editor == "code":
4541
+ # Special handling for VS Code
4542
+ query_code = _edit_with_vscode()
4543
+ else:
4544
+ # Use click.edit() for terminal editors
4545
+ query_code = click.edit(
4546
+ "# Enter your Polars query here\n"
4547
+ "# Example:\n"
4548
+ "# pl.read_csv('data.csv').select(['name', 'age'])\n"
4549
+ "# pl.read_csv('data.csv').filter(pl.col('age') > 25)\n"
4550
+ "# \n"
4551
+ "# The result should be a Polars DataFrame or LazyFrame\n"
4552
+ "\n",
4553
+ editor=chosen_editor,
4554
+ )
4555
+
4556
+ if query_code is None:
4557
+ if pipe:
4558
+ print("No query entered", file=sys.stderr)
4559
+ else:
4560
+ console.print("[yellow]No query entered[/yellow]")
4561
+ sys.exit(1)
4562
+ elif polars_expression:
4563
+ # Direct argument
4564
+ query_code = polars_expression
4565
+ else:
4566
+ # Try to read from stdin (for piping)
4567
+ if not sys.stdin.isatty():
4568
+ # Data is being piped in
4569
+ query_code = sys.stdin.read().strip()
4570
+ else:
4571
+ # No input provided and stdin is a terminal - show concise help
4572
+ _show_concise_help("pl", None)
4573
+ return
4574
+
4575
+ if not query_code or not query_code.strip():
4576
+ console.print("[red]Error:[/red] Empty query")
4577
+ sys.exit(1)
4578
+
4579
+ # Execute the query
4580
+ with console.status("[bold green]Executing Polars expression..."):
4581
+ namespace = {
4582
+ "pl": pl,
4583
+ "polars": pl,
4584
+ "__builtins__": __builtins__,
4585
+ }
4586
+
4587
+ try:
4588
+ # Check if this is a single expression or multiple statements
4589
+ if "\n" in query_code.strip() or any(
4590
+ keyword in query_code
4591
+ for keyword in [
4592
+ " = ",
4593
+ "import",
4594
+ "for ",
4595
+ "if ",
4596
+ "def ",
4597
+ "class ",
4598
+ "with ",
4599
+ "try:",
4600
+ ]
4601
+ ):
4602
+ # Multiple statements - use exec()
4603
+ exec(query_code, namespace)
4604
+
4605
+ # Look for the result in the namespace
4606
+ # Try common variable names first
4607
+ result = None
4608
+ for var_name in ["result", "df", "data", "table", "output"]:
4609
+ if var_name in namespace:
4610
+ result = namespace[var_name]
4611
+ break
4612
+
4613
+ # If no common names found, look for any DataFrame/LazyFrame
4614
+ if result is None:
4615
+ for key, value in namespace.items():
4616
+ if (
4617
+ hasattr(value, "collect") or hasattr(value, "columns")
4618
+ ) and not key.startswith("_"):
4619
+ result = value
4620
+ break
4621
+
4622
+ # If still no result, get the last assigned variable (excluding builtins)
4623
+ if result is None:
4624
+ # Get variables that were added to namespace (excluding our imports)
4625
+ user_vars = {
4626
+ k: v
4627
+ for k, v in namespace.items()
4628
+ if k not in ["pl", "polars", "__builtins__"] and not k.startswith("_")
4629
+ }
4630
+ if user_vars:
4631
+ # Get the last variable (this is a heuristic)
4632
+ last_var = list(user_vars.keys())[-1]
4633
+ result = user_vars[last_var]
4634
+
4635
+ if result is None:
4636
+ if pipe:
4637
+ print(
4638
+ "[red]Error:[/red] Could not find result variable", file=sys.stderr
4639
+ )
4640
+ print(
4641
+ "[dim]Assign your final result to a variable like 'result', 'df', or 'data'[/dim]",
4642
+ file=sys.stderr,
4643
+ )
4644
+ print(
4645
+ "[dim]Or ensure your last line returns a DataFrame[/dim]",
4646
+ file=sys.stderr,
4647
+ )
4648
+ else:
4649
+ console.print("[red]Error:[/red] Could not find result variable")
4650
+ console.print(
4651
+ "[dim]Assign your final result to a variable like 'result', 'df', or 'data'[/dim]"
4652
+ )
4653
+ console.print("[dim]Or ensure your last line returns a DataFrame[/dim]")
4654
+ sys.exit(1)
4655
+
4656
+ else:
4657
+ # Single expression - use eval()
4658
+ result = eval(query_code, namespace)
4659
+
4660
+ # Validate result
4661
+ if not hasattr(result, "collect") and not hasattr(result, "columns"):
4662
+ if pipe:
4663
+ print(
4664
+ "[red]Error:[/red] Expression must return a Polars DataFrame or LazyFrame",
4665
+ file=sys.stderr,
4666
+ )
4667
+ print(f"[dim]Got: {type(result)}[/dim]", file=sys.stderr)
4668
+ else:
4669
+ console.print(
4670
+ "[red]Error:[/red] Expression must return a Polars DataFrame or LazyFrame"
4671
+ )
4672
+ console.print(f"[dim]Got: {type(result)}[/dim]")
4673
+ sys.exit(1)
4674
+
4675
+ except Exception as e:
4676
+ # When piping, send errors to stderr so they don't interfere with the pipe
4677
+ if pipe:
4678
+ print(f"Error executing Polars expression: {e}", file=sys.stderr)
4679
+ print(file=sys.stderr)
4680
+
4681
+ # Create a panel with the expression(s) for better readability
4682
+ if "\n" in query_code.strip():
4683
+ # Multi-line expression
4684
+ print(f"Expression(s) provided:\n{query_code}", file=sys.stderr)
4685
+ else:
4686
+ # Single line expression
4687
+ print(f"Expression provided: {query_code}", file=sys.stderr)
4688
+ else:
4689
+ # Normal error handling when not piping
4690
+ console.print(f"[red]Error executing Polars expression:[/red] {e}")
4691
+ console.print()
4692
+
4693
+ # Create a panel with the expression(s) for better readability
4694
+ if "\n" in query_code.strip():
4695
+ # Multi-line expression
4696
+ console.print(
4697
+ Panel(
4698
+ query_code,
4699
+ title="Expression(s) provided",
4700
+ border_style="red",
4701
+ expand=False,
4702
+ title_align="left",
4703
+ )
4704
+ )
4705
+ else:
4706
+ # Single line expression
4707
+ console.print(
4708
+ Panel(
4709
+ query_code,
4710
+ title="Expression provided",
4711
+ border_style="red",
4712
+ expand=False,
4713
+ title_align="left",
4714
+ )
4715
+ )
4716
+
4717
+ sys.exit(1)
4718
+
4719
+ # Only print success message when not piping (so it doesn't interfere with pipe output)
4720
+ if not pipe:
4721
+ console.print("[green]✓[/green] Polars expression executed successfully")
4722
+
4723
+ # Process output
4724
+ if pipe:
4725
+ # Output data for piping to other commands
4726
+ _handle_pl_pipe(result, pipe_format)
4727
+ elif output_format == "preview":
4728
+ _handle_pl_preview(result, preview_head, preview_tail, output_html)
4729
+ elif output_format == "scan":
4730
+ _handle_pl_scan(result, query_code, output_html)
4731
+ elif output_format == "missing":
4732
+ _handle_pl_missing(result, query_code, output_html)
4733
+ elif output_format == "info":
4734
+ _handle_pl_info(result, query_code, output_html)
4735
+ # A "validate" output format is intentionally not offered in the
4736
+ # --output-format choices: it is not yet implemented, and `pb validate`
4737
+ # with a data file already covers that use case
4738
+
4739
+ except Exception as e:
4740
+ console.print(f"[red]Error:[/red] {e}")
4741
+ sys.exit(1)
4742
+
4743
+
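The dispatch above (eval for a single expression; exec plus a namespace search for multi-statement code) can be reproduced outside the CLI. A condensed sketch, assuming only that Polars is installed; the `run_query` helper and the sample frames are illustrative, not part of the package:

```python
import polars as pl

def run_query(code: str):
    """Condensed version of the pl command's dispatch: eval a single
    expression; exec multi-statement code and search the namespace."""
    ns = {"pl": pl, "polars": pl}
    if "\n" in code.strip() or " = " in code:
        exec(code, ns)
        # Same preference order the command uses for result discovery
        for name in ("result", "df", "data", "table", "output"):
            if name in ns:
                return ns[name]
        raise ValueError("assign the final DataFrame to a variable like 'result'")
    return eval(code, ns)

# A single expression goes through eval() ...
print(run_query("pl.DataFrame({'age': [22, 31]}).filter(pl.col('age') > 25)"))
# ... multi-statement code goes through exec()
print(run_query("df = pl.DataFrame({'age': [22, 31]})\nresult = df.select('age')"))
```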
4744
+ def _handle_pl_preview(result: Any, head: int, tail: int, output_html: str | None) -> None:
4745
+ """Handle preview output for Polars results."""
4746
+ try:
4747
+ # Create preview using existing preview function
4748
+ gt_table = pb.preview(
4749
+ data=result,
4750
+ n_head=head,
4751
+ n_tail=tail,
4752
+ show_row_numbers=True,
4753
+ )
4754
+
4755
+ if output_html:
4756
+ html_content = gt_table.as_raw_html()
4757
+ Path(output_html).write_text(html_content, encoding="utf-8")
4758
+ console.print(f"[green]✓[/green] HTML saved to: {output_html}")
4759
+ else:
4760
+ # Get metadata for enhanced preview
4761
+ try:
4762
+ total_rows = pb.get_row_count(result)
4763
+ total_columns = pb.get_column_count(result)
4764
+ table_type = _get_tbl_type(result)
4765
+
4766
+ preview_info = {
4767
+ "total_rows": total_rows,
4768
+ "total_columns": total_columns,
4769
+ "head_rows": head,
4770
+ "tail_rows": tail,
4771
+ "is_complete": total_rows <= (head + tail),
4772
+ "source_type": "Polars expression",
4773
+ "table_type": table_type,
4774
+ }
4775
+
4776
+ _rich_print_gt_table(gt_table, preview_info)
4777
+ except Exception:
4778
+ # Fallback to basic display
4779
+ _rich_print_gt_table(gt_table)
4780
+
4781
+ except Exception as e:
4782
+ console.print(f"[red]Error creating preview:[/red] {e}")
4783
+ sys.exit(1)
4784
+
4785
+
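Since this handler is a thin wrapper over pointblank's Python API, the same preview can be produced directly. A minimal sketch using only the calls made above; the sample frame is illustrative:

```python
import polars as pl
import pointblank as pb

df = pl.DataFrame({"name": ["a", "b", "c"], "age": [31, 22, 45]})

# The same call the handler makes; returns a GT table object
gt_table = pb.preview(data=df, n_head=5, n_tail=5, show_row_numbers=True)

# With --output-html, the handler writes exactly this string to disk
html = gt_table.as_raw_html()
```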
4786
+ def _handle_pl_scan(result: Any, expression: str, output_html: str | None) -> None:
4787
+ """Handle scan output for Polars results."""
4788
+ try:
4789
+ scan_result = pb.col_summary_tbl(data=result)
4790
+
4791
+ if output_html:
4792
+ html_content = scan_result.as_raw_html()
4793
+ Path(output_html).write_text(html_content, encoding="utf-8")
4794
+ console.print(f"[green]✓[/green] Data scan report saved to: {output_html}")
4795
+ else:
4796
+ # Get metadata for enhanced scan display
4797
+ try:
4798
+ total_rows = pb.get_row_count(result)
4799
+ total_columns = pb.get_column_count(result)
4800
+ table_type = _get_tbl_type(result)
4801
+
4802
+ _rich_print_scan_table(
4803
+ scan_result,
4804
+ expression,
4805
+ "Polars expression",
4806
+ table_type,
4807
+ total_rows,
4808
+ total_columns,
4809
+ )
4810
+ except Exception as e:
4811
+ console.print(f"[yellow]Could not display scan summary: {e}[/yellow]")
4812
+
4813
+ except Exception as e:
4814
+ console.print(f"[red]Error creating scan:[/red] {e}")
4815
+ sys.exit(1)
4816
+
4817
+
4818
+ def _handle_pl_missing(result: Any, expression: str, output_html: str | None) -> None:
4819
+ """Handle missing values output for Polars results."""
4820
+ try:
4821
+ missing_table = pb.missing_vals_tbl(data=result)
4822
+
4823
+ if output_html:
4824
+ html_content = missing_table.as_raw_html()
4825
+ Path(output_html).write_text(html_content, encoding="utf-8")
4826
+ console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
4827
+ else:
4828
+ _rich_print_missing_table(missing_table, result)
4829
+
4830
+ except Exception as e:
4831
+ console.print(f"[red]Error creating missing values report:[/red] {e}")
4832
+ sys.exit(1)
4833
+
4834
+
4835
+ def _handle_pl_info(result: Any, expression: str, output_html: str | None) -> None:
4836
+ """Handle info output for Polars results."""
4837
+ try:
4838
+ # Get basic info
4839
+ tbl_type = _get_tbl_type(result)
4840
+ row_count = pb.get_row_count(result)
4841
+ col_count = pb.get_column_count(result)
4842
+
4843
+ # Get column names and types
4844
+ if hasattr(result, "columns"):
4845
+ columns = list(result.columns)
4846
+ elif hasattr(result, "schema"):
4847
+ columns = list(result.schema.names)
4848
+ else:
4849
+ columns = []
4850
+
4851
+ dtypes_dict = _get_column_dtypes(result, columns)
4852
+
4853
+ if output_html:
4854
+ # Create a simple HTML info page
4855
+ # TODO: Implement an improved version of this in the Python API and then
4856
+ # use that here
4857
+ html_content = f"""
4858
+ <html><body>
4859
+ <h2>Polars Expression Info</h2>
4860
+ <p><strong>Expression:</strong> {expression}</p>
4861
+ <p><strong>Table Type:</strong> {tbl_type}</p>
4862
+ <p><strong>Rows:</strong> {row_count:,}</p>
4863
+ <p><strong>Columns:</strong> {col_count:,}</p>
4864
+ <h3>Column Details</h3>
4865
+ <ul>
4866
+ {"".join(f"<li>{col}: {dtypes_dict.get(col, '?')}</li>" for col in columns)}
4867
+ </ul>
4868
+ </body></html>
4869
+ """
4870
+ Path(output_html).write_text(html_content, encoding="utf-8")
4871
+ console.print(f"[green]✓[/green] HTML info saved to: {output_html}")
4872
+ else:
4873
+ # Display info table
4874
+ from rich.box import SIMPLE_HEAD
4875
+
4876
+ info_table = Table(
4877
+ title="Polars Expression Info",
4878
+ show_header=True,
4879
+ header_style="bold magenta",
4880
+ box=SIMPLE_HEAD,
4881
+ title_style="bold cyan",
4882
+ title_justify="left",
4883
+ )
4884
+ info_table.add_column("Property", style="cyan", no_wrap=True)
4885
+ info_table.add_column("Value", style="green")
4886
+
4887
+ info_table.add_row("Expression", expression)
4888
+ # Capitalize "polars" to "Polars" for consistency with pb info command
4889
+ display_tbl_type = (
4890
+ tbl_type.replace("polars", "Polars") if "polars" in tbl_type.lower() else tbl_type
4891
+ )
4892
+ info_table.add_row("Table Type", display_tbl_type)
4893
+ info_table.add_row("Rows", f"{row_count:,}")
4894
+ info_table.add_row("Columns", f"{col_count:,}")
4895
+
4896
+ console.print()
4897
+ console.print(info_table)
4898
+
4899
+ # Show column details
4900
+ if columns:
4901
+ console.print("\n[bold cyan]Column Details:[/bold cyan]")
4902
+ for col in columns[:10]: # Show first 10 columns
4903
+ dtype = dtypes_dict.get(col, "?")
4904
+ console.print(f" • {col}: [yellow]{dtype}[/yellow]")
4905
+
4906
+ if len(columns) > 10:
4907
+ console.print(f" ... and {len(columns) - 10} more columns")
4908
+
4909
+ except Exception as e:
4910
+ console.print(f"[red]Error creating info:[/red] {e}")
4911
+ sys.exit(1)
4912
+
4913
+
4914
+ def _handle_pl_pipe(result: Any, pipe_format: str) -> None:
4915
+ """Handle piped output from Polars results."""
4916
+ try:
4917
+ import tempfile
4919
+
4920
+ # Create a temporary file to store the data
4921
+ with tempfile.NamedTemporaryFile(
4922
+ mode="w", suffix=f".{pipe_format}", prefix="pb_pipe_", delete=False
4923
+ ) as temp_file:
4924
+ temp_path = temp_file.name
4925
+
4926
+ # Write the data to the temporary file
4927
+ if pipe_format == "parquet":
4928
+ if hasattr(result, "write_parquet"):
4929
+ # Polars
4930
+ result.write_parquet(temp_path)
4931
+ elif hasattr(result, "to_parquet"):
4932
+ # Pandas
4933
+ result.to_parquet(temp_path)
4934
+ else:
4935
+ # Convert to pandas and write
4936
+ import pandas as pd
4937
+
4938
+ pd_result = pd.DataFrame(result)
4939
+ pd_result.to_parquet(temp_path)
4940
+ else: # CSV
4941
+ if hasattr(result, "write_csv"):
4942
+ # Polars
4943
+ result.write_csv(temp_path)
4944
+ elif hasattr(result, "to_csv"):
4945
+ # Pandas
4946
+ result.to_csv(temp_path, index=False)
4947
+ else:
4948
+ # Convert to pandas and write
4949
+ import pandas as pd
4950
+
4951
+ pd_result = pd.DataFrame(result)
4952
+ pd_result.to_csv(temp_path, index=False)
4953
+
4954
+ # Output the temporary file path to stdout for the next command
4955
+ print(temp_path)
4956
+
4957
+ except Exception as e:
4958
+ print(f"[red]Error creating pipe output:[/red] {e}", file=sys.stderr)
4959
+ sys.exit(1)
4960
+
4961
+
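The pipe protocol is deliberately minimal: the only bytes written to stdout are the temp-file path, so a downstream consumer just reads that path and loads the file. A standalone sketch of the consuming side (not the code the other pb commands use):

```python
import sys
from pathlib import Path

import polars as pl

# `pb pl --pipe` prints a single temp-file path; read it from stdin
path = sys.stdin.read().strip()

# Load by extension, mirroring the two --pipe-format choices
df = pl.read_parquet(path) if path.endswith(".parquet") else pl.read_csv(path)
print(df.shape)

# The producer uses delete=False, so cleanup falls to the consumer
Path(path).unlink(missing_ok=True)
```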
4962
+ def _get_best_editor() -> str:
4963
+ """Detect the best available editor on the system."""
4964
+
4965
+ # Check environment variable first
4966
+ if "EDITOR" in os.environ:
4967
+ return os.environ["EDITOR"]
4968
+
4969
+ # Check for common editors in order of preference
4970
+ editors = [
4971
+ "code", # VS Code
4972
+ "micro", # Modern terminal editor
4973
+ "nano", # User-friendly terminal editor
4974
+ "vim", # Vim
4975
+ "vi", # Vi (fallback)
4976
+ ]
4977
+
4978
+ for editor in editors:
4979
+ if shutil.which(editor):
4980
+ return editor
4981
+
4982
+ # Ultimate fallback
4983
+ return "nano"
4984
+
4985
+
4986
+ def _edit_with_vscode() -> str | None:
4987
+ """Edit Polars query using VS Code."""
4988
+ import subprocess
4989
+ import tempfile
4990
+
4991
+ # Create a temporary Python file
4992
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".py", prefix="pb_pl_", delete=False) as f:
4993
+ f.write("import polars as pl\n")
4994
+ f.write("\n")
4995
+ f.write("# Enter your Polars query here\n")
4996
+ f.write("# Examples:\n")
4997
+ f.write("# \n")
4998
+ f.write("# Single expression:\n")
4999
+ f.write("# pl.read_csv('data.csv').select(['name', 'age'])\n")
5000
+ f.write("# \n")
5001
+ f.write("# Multiple statements:\n")
5002
+ f.write("# csv = pl.read_csv('data.csv')\n")
5003
+ f.write("# result = csv.select(['name', 'age']).filter(pl.col('age') > 25)\n")
5004
+ f.write("# \n")
5005
+ f.write("# For multi-statement code, assign your final result to a variable\n")
5006
+ f.write("# like 'result', 'df', 'data', or just ensure it's the last line\n")
5007
+ f.write("# \n")
5008
+ f.write("# Save and then close this file in VS Code to execute the query\n")
5009
+ f.write("\n")
5010
+ temp_file = f.name
5011
+
5012
+ try:
5013
+ # Open in VS Code and wait for it to close
5014
+ result = subprocess.run(
5015
+ ["code", "--wait", temp_file], capture_output=True, text=True, timeout=300
5016
+ )
5017
+
5018
+ if result.returncode != 0:
5019
+ console.print(f"[yellow]VS Code exited with code {result.returncode}[/yellow]")
5020
+
5021
+ # Read the edited content
5022
+ with open(temp_file, "r") as f:
5023
+ content = f.read()
5024
+
5025
+ # Remove comments, blank lines, and the polars import (the execution
+ # namespace already provides `pl`)
5026
+ lines = []
5027
+ for line in content.split("\n"):
5028
+ stripped = line.strip()
5029
+ if (
5030
+ stripped
5031
+ and not stripped.startswith("#")
5032
+ and not stripped.startswith("import polars")
5034
+ ):
5035
+ lines.append(line)
5036
+
5037
+ return "\n".join(lines) if lines else None
5038
+
5039
+ except subprocess.TimeoutExpired:
5040
+ console.print("[red]Timeout:[/red] VS Code took too long to respond")
5041
+ return None
5042
+ except subprocess.CalledProcessError as e:
5043
+ console.print(f"[red]Error:[/red] Could not open VS Code: {e}")
5044
+ return None
5045
+ except FileNotFoundError:
5046
+ console.print("[red]Error:[/red] VS Code not found in PATH")
5047
+ return None
5048
+ finally:
5049
+ # Clean up
5050
+ Path(temp_file).unlink(missing_ok=True)
5051
+
5052
+
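Stripped of the template text and comment filtering, the VS Code branch reduces to a write/wait/read-back cycle around `code --wait`. A sketch of that core pattern, with an illustrative placeholder body:

```python
import subprocess
import tempfile
from pathlib import Path

# Seed a temporary file for the user to edit
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
    f.write("# edit me, then close the tab\n")
    temp_file = f.name

try:
    # --wait blocks until the tab is closed, so the read below sees the edits
    subprocess.run(["code", "--wait", temp_file], timeout=300)
    content = Path(temp_file).read_text()
finally:
    Path(temp_file).unlink(missing_ok=True)
```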
5053
+ def _show_concise_help(command_name: str, ctx: click.Context | None) -> None:
5054
+ """Show concise help for a command when required arguments are missing."""
5055
+
5056
+ if command_name == "info":
5057
+ console.print("[bold cyan]pb info[/bold cyan] - Display information about a data source")
5058
+ console.print()
5059
+ console.print("[bold yellow]Usage:[/bold yellow]")
5060
+ console.print(" pb info data.csv")
5061
+ console.print(" pb info small_table")
5062
+ console.print()
5063
+ console.print("[dim]Shows table type, dimensions, column names, and data types[/dim]")
5064
+ console.print()
5065
+ console.print(
5066
+ "[dim]Use [bold]pb info --help[/bold] for complete options and examples[/dim]"
5067
+ )
5068
+
5069
+ elif command_name == "preview":
5070
+ console.print(
5071
+ "[bold cyan]pb preview[/bold cyan] - Preview a data table showing head and tail rows"
5072
+ )
5073
+ console.print()
5074
+ console.print("[bold yellow]Usage:[/bold yellow]")
5075
+ console.print(" pb preview data.csv")
5076
+ console.print(" pb preview data.parquet --head 10 --tail 5")
5077
+ console.print()
5078
+ console.print("[bold yellow]Key Options:[/bold yellow]")
5079
+ console.print(" --head N Number of rows from the top (default: 5)")
5080
+ console.print(" --tail N Number of rows from the bottom (default: 5)")
5081
+ console.print(" --columns LIST Comma-separated list of columns to display")
5082
+ console.print(" --output-html Save HTML output to file")
5083
+ console.print()
5084
+ console.print(
5085
+ "[dim]Use [bold]pb preview --help[/bold] for complete options and examples[/dim]"
5086
+ )
5087
+
5088
+ elif command_name == "scan":
5089
+ console.print(
5090
+ "[bold cyan]pb scan[/bold cyan] - Generate a comprehensive data profile report"
5091
+ )
5092
+ console.print()
5093
+ console.print("[bold yellow]Usage:[/bold yellow]")
5094
+ console.print(" pb scan data.csv")
5095
+ console.print(" pb scan data.parquet --output-html report.html")
5096
+ console.print()
5097
+ console.print("[bold yellow]Key Options:[/bold yellow]")
5098
+ console.print(" --output-html Save HTML scan report to file")
5099
+ console.print(" --columns LIST Comma-separated list of columns to scan")
5100
+ console.print()
5101
+ console.print(
5102
+ "[dim]Use [bold]pb scan --help[/bold] for complete options and examples[/dim]"
5103
+ )
5104
+
5105
+ elif command_name == "missing":
5106
+ console.print("[bold cyan]pb missing[/bold cyan] - Generate a missing values report")
5107
+ console.print()
5108
+ console.print("[bold yellow]Usage:[/bold yellow]")
5109
+ console.print(" pb missing data.csv")
5110
+ console.print(" pb missing data.parquet --output-html missing_report.html")
5111
+ console.print()
5112
+ console.print("[bold yellow]Key Options:[/bold yellow]")
5113
+ console.print(" --output-html Save HTML output to file")
5114
+ console.print()
5115
+ console.print(
5116
+ "[dim]Use [bold]pb missing --help[/bold] for complete options and examples[/dim]"
5117
+ )
5118
+
5119
+ elif command_name == "validate":
5120
+ console.print("[bold cyan]pb validate[/bold cyan] - Perform data validation checks")
5121
+ console.print()
5122
+ console.print("[bold yellow]Usage:[/bold yellow]")
5123
+ console.print(" pb validate data.csv")
5124
+ console.print(" pb validate data.csv --check col-vals-not-null --column email")
5125
+ console.print()
5126
+ console.print("[bold yellow]Key Options:[/bold yellow]")
5127
+ console.print(" --check TYPE Validation check type (default: rows-distinct)")
5128
+ console.print(" --column COL Column name for column-specific checks")
5129
+ console.print(" --show-extract Show failing rows if validation fails")
5130
+ console.print(" --list-checks List all available validation checks")
5131
+ console.print()
5132
+ console.print(
5133
+ "[dim]Use [bold]pb validate --help[/bold] for complete options and examples[/dim]"
5134
+ )
5135
+
5136
+ elif command_name == "run":
5137
+ console.print("[bold cyan]pb run[/bold cyan] - Run a Pointblank validation script")
5138
+ console.print()
5139
+ console.print("[bold yellow]Usage:[/bold yellow]")
5140
+ console.print(" pb run validation_script.py")
5141
+ console.print(" pb run validation_script.py --data data.csv")
5142
+ console.print()
5143
+ console.print("[bold yellow]Key Options:[/bold yellow]")
5144
+ console.print(" --data SOURCE Replace data source in validation objects")
5145
+ console.print(" --output-html Save HTML validation report to file")
5146
+ console.print(" --show-extract Show failing rows if validation fails")
5147
+ console.print(" --fail-on LEVEL Exit with error on critical/error/warning/any")
5148
+ console.print()
5149
+ console.print("[dim]Use [bold]pb run --help[/bold] for complete options and examples[/dim]")
5150
+
5151
+ elif command_name == "make-template":
5152
+ console.print(
5153
+ "[bold cyan]pb make-template[/bold cyan] - Create a validation script template"
5154
+ )
5155
+ console.print()
5156
+ console.print("[bold yellow]Usage:[/bold yellow]")
5157
+ console.print(" pb make-template my_validation.py")
5158
+ console.print(" pb make-template validation_template.py")
5159
+ console.print()
5160
+ console.print("[dim]Creates a sample Python script with validation examples[/dim]")
5161
+ console.print("[dim]Edit the template and run with [bold]pb run[/bold][/dim]")
5162
+ console.print()
5163
+ console.print(
5164
+ "[dim]Use [bold]pb make-template --help[/bold] for complete options and examples[/dim]"
5165
+ )
5166
+
5167
+ elif command_name == "pl":
5168
+ console.print(
5169
+ "[bold cyan]pb pl[/bold cyan] - Execute Polars expressions and display results"
5170
+ )
5171
+ console.print()
5172
+ console.print("[bold yellow]Usage:[/bold yellow]")
5173
+ console.print(" pb pl \"pl.read_csv('data.csv')\"")
5174
+ console.print(" pb pl --edit")
5175
+ console.print()
5176
+ console.print("[bold yellow]Key Options:[/bold yellow]")
5177
+ console.print(" --edit Open editor for multi-line input")
5178
+ console.print(" --file FILE Read query from file")
5179
+ console.print(" --output-format Output format: preview, scan, missing, info")
5180
+ console.print(" --pipe Output for piping to other pb commands")
5181
+ console.print()
5182
+ console.print("[dim]Use [bold]pb pl --help[/bold] for complete options and examples[/dim]")
5183
+
5184
+ # Exit with an error status, via the Click context when one was provided
5185
+ if ctx is not None:
5186
+ ctx.exit(1)
5187
+ else:
5188
+ sys.exit(1)