pointblank 0.11.2__py3-none-any.whl → 0.11.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/cli.py CHANGED
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import copy
4
+ import os
5
+ import shutil
4
6
  import sys
5
7
  from pathlib import Path
6
8
  from typing import Any
@@ -32,6 +34,8 @@ class OrderedGroup(click.Group):
32
34
  "validate",
33
35
  "run",
34
36
  "make-template",
37
+ # Data Manipulation
38
+ "pl",
35
39
  # Utilities
36
40
  "datasets",
37
41
  "requirements",
@@ -91,6 +95,15 @@ def _load_data_source(data_source: str) -> Any:
91
95
  return _process_data(data_source)
92
96
 
93
97
 
98
+ def _is_piped_data_source(data_source: str) -> bool:
99
+ """Check if the data source is from a piped pb command."""
100
+ return (
101
+ data_source
102
+ and ("pb_pipe_" in data_source)
103
+ and (data_source.startswith("/var/folders/") or data_source.startswith("/tmp/"))
104
+ )
105
+
106
+
94
107
  def _format_cell_value(
95
108
  value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
96
109
  ) -> str:
@@ -558,9 +571,12 @@ def _rich_print_gt_table(
558
571
  gt_table: The GT table object to display
559
572
  preview_info: Optional dict with preview context info:
560
573
  - total_rows: Total rows in the dataset
574
+ - total_columns: Total columns in the dataset
561
575
  - head_rows: Number of head rows shown
562
576
  - tail_rows: Number of tail rows shown
563
577
  - is_complete: Whether the entire dataset is shown
578
+ - source_type: Type of data source (e.g., "External source: worldcities_new.csv")
579
+ - table_type: Type of table (e.g., "polars")
564
580
  show_summary: Whether to show the row count summary at the bottom
565
581
  """
566
582
  try:
@@ -593,6 +609,12 @@ def _rich_print_gt_table(
593
609
  table_type = preview_info["table_type"]
594
610
  table_title = f"Data Preview / {source_type} / {table_type}"
595
611
 
612
+ # Add dimensions subtitle in gray if available
613
+ total_rows = preview_info.get("total_rows")
614
+ total_columns = preview_info.get("total_columns")
615
+ if total_rows is not None and total_columns is not None:
616
+ table_title += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
617
+
596
618
  rich_table = Table(
597
619
  title=table_title,
598
620
  show_header=True,
@@ -1209,20 +1231,31 @@ def _display_validation_summary(validation: Any) -> None:
1209
1231
 
1210
1232
 
1211
1233
  @click.group(cls=OrderedGroup)
1212
- @click.version_option(version=pb.__version__, prog_name="pb")
1234
+ @click.version_option(pb.__version__, "-v", "--version", prog_name="pb")
1235
+ @click.help_option("-h", "--help")
1213
1236
  def cli():
1214
1237
  """
1215
1238
  Pointblank CLI: Data validation and quality tools for data engineers.
1216
1239
 
1217
- Use this CLI to run validation scripts, preview tables, and generate reports
1218
- directly from the command line.
1240
+ Use this CLI to validate data quality, explore datasets, and generate comprehensive
1241
+ reports for CSV, Parquet, and database sources. Suitable for data pipelines, ETL
1242
+ validation, and exploratory data analysis from the command line.
1243
+
1244
+ Quick Examples:
1245
+
1246
+ \b
1247
+ pb preview data.csv Preview your data
1248
+ pb scan data.csv Generate data profile
1249
+ pb validate data.csv Run basic validation
1250
+
1251
+ Use pb COMMAND --help for detailed help on any command.
1219
1252
  """
1220
1253
  pass
1221
1254
 
1222
1255
 
1223
1256
  @cli.command()
1224
- @click.argument("data_source", type=str)
1225
- def info(data_source: str):
1257
+ @click.argument("data_source", type=str, required=False)
1258
+ def info(data_source: str | None):
1226
1259
  """
1227
1260
  Display information about a data source.
1228
1261
 
@@ -1238,6 +1271,11 @@ def info(data_source: str):
1238
1271
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1239
1272
  """
1240
1273
  try:
1274
+ # Handle missing data_source with concise help
1275
+ if data_source is None:
1276
+ _show_concise_help("info", None)
1277
+ return
1278
+
1241
1279
  with console.status("[bold green]Loading data..."):
1242
1280
  # Load the data source using the centralized function
1243
1281
  data = _load_data_source(data_source)
@@ -1276,21 +1314,21 @@ def info(data_source: str):
1276
1314
 
1277
1315
 
1278
1316
  @cli.command()
1279
- @click.argument("data_source", type=str)
1280
- @click.option("--columns", "-c", help="Comma-separated list of columns to display")
1317
+ @click.argument("data_source", type=str, required=False)
1318
+ @click.option("--columns", help="Comma-separated list of columns to display")
1281
1319
  @click.option("--col-range", help="Column range like '1:10' or '5:' or ':15' (1-based indexing)")
1282
1320
  @click.option("--col-first", type=int, help="Show first N columns")
1283
1321
  @click.option("--col-last", type=int, help="Show last N columns")
1284
- @click.option("--head", "-h", default=5, help="Number of rows from the top (default: 5)")
1285
- @click.option("--tail", "-t", default=5, help="Number of rows from the bottom (default: 5)")
1286
- @click.option("--limit", "-l", default=50, help="Maximum total rows to display (default: 50)")
1322
+ @click.option("--head", default=5, help="Number of rows from the top (default: 5)")
1323
+ @click.option("--tail", default=5, help="Number of rows from the bottom (default: 5)")
1324
+ @click.option("--limit", default=50, help="Maximum total rows to display (default: 50)")
1287
1325
  @click.option("--no-row-numbers", is_flag=True, help="Hide row numbers")
1288
1326
  @click.option("--max-col-width", default=250, help="Maximum column width in pixels (default: 250)")
1289
1327
  @click.option("--min-table-width", default=500, help="Minimum table width in pixels (default: 500)")
1290
1328
  @click.option("--no-header", is_flag=True, help="Hide table header")
1291
1329
  @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
1292
1330
  def preview(
1293
- data_source: str,
1331
+ data_source: str | None,
1294
1332
  columns: str | None,
1295
1333
  col_range: str | None,
1296
1334
  col_first: int | None,
@@ -1315,6 +1353,7 @@ def preview(
1315
1353
  - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1316
1354
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1317
1355
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1356
+ - Piped data from pb pl command
1318
1357
 
1319
1358
  COLUMN SELECTION OPTIONS:
1320
1359
 
@@ -1329,11 +1368,52 @@ def preview(
1329
1368
  Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
1330
1369
  """
1331
1370
  try:
1371
+ import sys
1372
+
1373
+ # Handle piped input
1374
+ if data_source is None:
1375
+ if not sys.stdin.isatty():
1376
+ # Data is being piped in - read the file path from stdin
1377
+ piped_input = sys.stdin.read().strip()
1378
+ if piped_input:
1379
+ data_source = piped_input
1380
+
1381
+ # Determine the format from the file extension
1382
+ if piped_input.endswith(".parquet"):
1383
+ format_type = "Parquet"
1384
+ elif piped_input.endswith(".csv"):
1385
+ format_type = "CSV"
1386
+ else:
1387
+ format_type = "unknown"
1388
+
1389
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1390
+ else:
1391
+ console.print("[red]Error:[/red] No data provided via pipe")
1392
+ sys.exit(1)
1393
+ else:
1394
+ # Show concise help and exit
1395
+ _show_concise_help("preview", None)
1396
+ return
1397
+
1332
1398
  with console.status("[bold green]Loading data..."):
1333
1399
  # Load the data source using the centralized function
1334
1400
  data = _load_data_source(data_source)
1335
1401
 
1336
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1402
+ # Check if this is a piped data source and create friendly display name
1403
+ is_piped_data = _is_piped_data_source(data_source)
1404
+
1405
+ if is_piped_data:
1406
+ if data_source.endswith(".parquet"):
1407
+ display_source = "Parquet file via `pb pl`"
1408
+ elif data_source.endswith(".csv"):
1409
+ display_source = "CSV file via `pb pl`"
1410
+ else:
1411
+ display_source = "File via `pb pl`"
1412
+ console.print(
1413
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
1414
+ )
1415
+ else:
1416
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1337
1417
 
1338
1418
  # Parse columns if provided
1339
1419
  columns_list = None
@@ -1355,7 +1435,7 @@ def preview(
1355
1435
  # If _row_num_ exists in data but not in user selection, add it at beginning
1356
1436
  if all_columns and "_row_num_" in all_columns and "_row_num_" not in columns_list:
1357
1437
  columns_list = ["_row_num_"] + columns_list
1358
- except Exception: # pragma: no cover
1438
+ except Exception:
1359
1439
  # If we can't process the data, just use the user's column list as-is
1360
1440
  pass
1361
1441
  elif col_range or col_first or col_last:
@@ -1430,7 +1510,14 @@ def preview(
1430
1510
  total_dataset_columns = pb.get_column_count(processed_data)
1431
1511
 
1432
1512
  # Determine source type and table type for enhanced preview title
1433
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1513
+ if is_piped_data:
1514
+ if data_source.endswith(".parquet"):
1515
+ source_type = "Polars expression (serialized to Parquet) from `pb pl`"
1516
+ elif data_source.endswith(".csv"):
1517
+ source_type = "Polars expression (serialized to CSV) from `pb pl`"
1518
+ else:
1519
+ source_type = "Polars expression from `pb pl`"
1520
+ elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1434
1521
  source_type = f"Pointblank dataset: {data_source}"
1435
1522
  else:
1436
1523
  source_type = f"External source: {data_source}"
@@ -1480,17 +1567,17 @@ def preview(
1480
1567
 
1481
1568
  _rich_print_gt_table(gt_table, preview_info)
1482
1569
 
1483
- except Exception as e: # pragma: no cover
1570
+ except Exception as e:
1484
1571
  console.print(f"[red]Error:[/red] {e}")
1485
- sys.exit(1) # pragma: no cover
1572
+ sys.exit(1)
1486
1573
 
1487
1574
 
1488
1575
  @cli.command()
1489
- @click.argument("data_source", type=str)
1576
+ @click.argument("data_source", type=str, required=False)
1490
1577
  @click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
1491
1578
  @click.option("--columns", "-c", help="Comma-separated list of columns to scan")
1492
1579
  def scan(
1493
- data_source: str,
1580
+ data_source: str | None,
1494
1581
  output_html: str | None,
1495
1582
  columns: str | None,
1496
1583
  ):
@@ -1513,17 +1600,58 @@ def scan(
1513
1600
  - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1514
1601
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1515
1602
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1603
+ - Piped data from pb pl command
1516
1604
  """
1517
1605
  try:
1606
+ import sys
1518
1607
  import time
1519
1608
 
1520
1609
  start_time = time.time()
1521
1610
 
1611
+ # Handle piped input
1612
+ if data_source is None:
1613
+ if not sys.stdin.isatty():
1614
+ # Data is being piped in - read the file path from stdin
1615
+ piped_input = sys.stdin.read().strip()
1616
+ if piped_input:
1617
+ data_source = piped_input
1618
+
1619
+ # Determine the format from the file extension
1620
+ if piped_input.endswith(".parquet"):
1621
+ format_type = "Parquet"
1622
+ elif piped_input.endswith(".csv"):
1623
+ format_type = "CSV"
1624
+ else:
1625
+ format_type = "unknown"
1626
+
1627
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1628
+ else:
1629
+ console.print("[red]Error:[/red] No data provided via pipe")
1630
+ sys.exit(1)
1631
+ else:
1632
+ # Show concise help and exit
1633
+ _show_concise_help("scan", None)
1634
+ return
1635
+
1522
1636
  with console.status("[bold green]Loading data..."):
1523
1637
  # Load the data source using the centralized function
1524
1638
  data = _load_data_source(data_source)
1525
1639
 
1526
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1640
+ # Check if this is a piped data source and create friendly display name
1641
+ is_piped_data = _is_piped_data_source(data_source)
1642
+
1643
+ if is_piped_data:
1644
+ if data_source.endswith(".parquet"):
1645
+ display_source = "Parquet file via `pb pl`"
1646
+ elif data_source.endswith(".csv"):
1647
+ display_source = "CSV file via `pb pl`"
1648
+ else:
1649
+ display_source = "File via `pb pl`"
1650
+ console.print(
1651
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
1652
+ )
1653
+ else:
1654
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1527
1655
 
1528
1656
  # Parse columns if provided
1529
1657
  columns_list = None
@@ -1536,7 +1664,15 @@ def scan(
1536
1664
  # Data is already processed by _load_data_source
1537
1665
  scan_result = pb.col_summary_tbl(data=data)
1538
1666
 
1539
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1667
+ # Create friendly source type for display
1668
+ if is_piped_data:
1669
+ if data_source.endswith(".parquet"):
1670
+ source_type = "Polars expression (serialized to Parquet) from `pb pl`"
1671
+ elif data_source.endswith(".csv"):
1672
+ source_type = "Polars expression (serialized to CSV) from `pb pl`"
1673
+ else:
1674
+ source_type = "Polars expression from `pb pl`"
1675
+ elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1540
1676
  source_type = f"Pointblank dataset: {data_source}"
1541
1677
  else:
1542
1678
  source_type = f"External source: {data_source}"
@@ -1568,7 +1704,12 @@ def scan(
1568
1704
  # Display detailed column summary using rich formatting
1569
1705
  try:
1570
1706
  _rich_print_scan_table(
1571
- scan_result, data_source, source_type, table_type, total_rows, total_columns
1707
+ scan_result,
1708
+ display_source if is_piped_data else data_source,
1709
+ source_type,
1710
+ table_type,
1711
+ total_rows,
1712
+ total_columns,
1572
1713
  )
1573
1714
 
1574
1715
  except Exception as e:
@@ -1580,9 +1721,9 @@ def scan(
1580
1721
 
1581
1722
 
1582
1723
  @cli.command()
1583
- @click.argument("data_source", type=str)
1724
+ @click.argument("data_source", type=str, required=False)
1584
1725
  @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
1585
- def missing(data_source: str, output_html: str | None):
1726
+ def missing(data_source: str | None, output_html: str | None):
1586
1727
  """
1587
1728
  Generate a missing values report for a data table.
1588
1729
 
@@ -1594,13 +1735,55 @@ def missing(data_source: str, output_html: str | None):
1594
1735
  - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1595
1736
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1596
1737
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1738
+ - Piped data from pb pl command
1597
1739
  """
1598
1740
  try:
1741
+ import sys
1742
+
1743
+ # Handle piped input
1744
+ if data_source is None:
1745
+ if not sys.stdin.isatty():
1746
+ # Data is being piped in - read the file path from stdin
1747
+ piped_input = sys.stdin.read().strip()
1748
+ if piped_input:
1749
+ data_source = piped_input
1750
+
1751
+ # Determine the format from the file extension
1752
+ if piped_input.endswith(".parquet"):
1753
+ format_type = "Parquet"
1754
+ elif piped_input.endswith(".csv"):
1755
+ format_type = "CSV"
1756
+ else:
1757
+ format_type = "unknown"
1758
+
1759
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1760
+ else:
1761
+ console.print("[red]Error:[/red] No data provided via pipe")
1762
+ sys.exit(1)
1763
+ else:
1764
+ # Show concise help and exit
1765
+ _show_concise_help("missing", None)
1766
+ return
1767
+
1599
1768
  with console.status("[bold green]Loading data..."):
1600
1769
  # Load the data source using the centralized function
1601
1770
  data = _load_data_source(data_source)
1602
1771
 
1603
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1772
+ # Check if this is a piped data source and create friendly display name
1773
+ is_piped_data = _is_piped_data_source(data_source)
1774
+
1775
+ if is_piped_data:
1776
+ if data_source.endswith(".parquet"):
1777
+ display_source = "Parquet file via `pb pl`"
1778
+ elif data_source.endswith(".csv"):
1779
+ display_source = "CSV file via `pb pl`"
1780
+ else:
1781
+ display_source = "File via `pb pl`"
1782
+ console.print(
1783
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
1784
+ )
1785
+ else:
1786
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1604
1787
 
1605
1788
  # Generate missing values table
1606
1789
  with console.status("[bold green]Analyzing missing values..."):
@@ -1616,7 +1799,38 @@ def missing(data_source: str, output_html: str | None):
1616
1799
  console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
1617
1800
  else:
1618
1801
  # Display in terminal with special missing values formatting
1619
- _rich_print_missing_table(gt_table, original_data)
1802
+ # Create enhanced context info for missing table display
1803
+ missing_info = {}
1804
+ try:
1805
+ # Determine source type and table type for enhanced preview title
1806
+ if is_piped_data:
1807
+ if data_source.endswith(".parquet"):
1808
+ source_type = "Polars expression (serialized to Parquet) from `pb pl`"
1809
+ elif data_source.endswith(".csv"):
1810
+ source_type = "Polars expression (serialized to CSV) from `pb pl`"
1811
+ else:
1812
+ source_type = "Polars expression from `pb pl`"
1813
+ elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1814
+ source_type = f"Pointblank dataset: {data_source}"
1815
+ else:
1816
+ source_type = f"External source: {data_source}"
1817
+
1818
+ missing_info = {
1819
+ "source_type": source_type,
1820
+ "table_type": _get_tbl_type(original_data),
1821
+ "total_rows": pb.get_row_count(original_data),
1822
+ "total_columns": pb.get_column_count(original_data),
1823
+ }
1824
+ except Exception:
1825
+ # Use defaults if metadata extraction fails
1826
+ missing_info = {
1827
+ "source_type": f"Data source: {data_source}",
1828
+ "table_type": "unknown",
1829
+ "total_rows": None,
1830
+ "total_columns": None,
1831
+ }
1832
+
1833
+ _rich_print_missing_table_enhanced(gt_table, original_data, missing_info)
1620
1834
 
1621
1835
  except Exception as e:
1622
1836
  console.print(f"[red]Error:[/red] {e}")
@@ -1741,6 +1955,8 @@ def validate(
1741
1955
  pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
1742
1956
  """
1743
1957
  try:
1958
+ import sys
1959
+
1744
1960
  # Handle --list-checks option early (doesn't need data source)
1745
1961
  if list_checks:
1746
1962
  console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
@@ -1797,13 +2013,31 @@ def validate(
1797
2013
  sys.exit(0)
1798
2014
 
1799
2015
  # Check if data_source is provided (required for all operations except --list-checks)
2016
+ # or if we have piped input
1800
2017
  if data_source is None:
1801
- console.print("[red]Error:[/red] DATA_SOURCE is required")
1802
- console.print("Use 'pb validate --help' for usage information")
1803
- console.print("Or use 'pb validate --list-checks' to see available validation types")
1804
- import sys
2018
+ # Check if we have piped input
2019
+ if not sys.stdin.isatty():
2020
+ # Data is being piped in: read the file path from stdin
2021
+ piped_input = sys.stdin.read().strip()
2022
+ if piped_input:
2023
+ data_source = piped_input
2024
+
2025
+ # Determine the format from the file extension
2026
+ if piped_input.endswith(".parquet"):
2027
+ format_type = "Parquet"
2028
+ elif piped_input.endswith(".csv"):
2029
+ format_type = "CSV"
2030
+ else:
2031
+ format_type = "unknown"
1805
2032
 
1806
- sys.exit(1)
2033
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
2034
+ else:
2035
+ console.print("[red]Error:[/red] No data provided via pipe")
2036
+ sys.exit(1)
2037
+ else:
2038
+ # Show concise help and exit
2039
+ _show_concise_help("validate", None)
2040
+ return
1807
2041
 
1808
2042
  # Handle backward compatibility and parameter conversion
1809
2043
  import sys
@@ -1911,7 +2145,25 @@ def validate(
1911
2145
  checks_list, columns_list, sets_list, values_list
1912
2146
  )
1913
2147
 
1914
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
2148
+ # Check if this is a piped data source and create friendly display name
2149
+ is_piped_data = (
2150
+ data_source
2151
+ and data_source.startswith("/var/folders/")
2152
+ and ("pb_pipe_" in data_source or "/T/" in data_source)
2153
+ )
2154
+
2155
+ if is_piped_data:
2156
+ if data_source.endswith(".parquet"):
2157
+ display_source = "Parquet file via `pb pl`"
2158
+ elif data_source.endswith(".csv"):
2159
+ display_source = "CSV file via `pb pl`"
2160
+ else:
2161
+ display_source = "File via `pb pl`"
2162
+ console.print(
2163
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
2164
+ )
2165
+ else:
2166
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1915
2167
 
1916
2168
  # Build a single validation object with chained checks
1917
2169
  with console.status(f"[bold green]Running {len(checks_list)} validation check(s)..."):
@@ -2134,136 +2386,339 @@ def requirements():
2134
2386
  console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
2135
2387
 
2136
2388
 
2137
- def _rich_print_scan_table(
2138
- scan_result: Any,
2139
- data_source: str,
2140
- source_type: str,
2141
- table_type: str,
2142
- total_rows: int | None = None,
2143
- total_columns: int | None = None,
2389
+ def _rich_print_missing_table_enhanced(
2390
+ gt_table: Any, original_data: Any = None, missing_info: dict = None
2144
2391
  ) -> None:
2145
- """
2146
- Display scan results as a Rich table in the terminal with statistical measures.
2392
+ """Convert a missing values GT table to Rich table with enhanced formatting and metadata.
2147
2393
 
2148
2394
  Args:
2149
- scan_result: The GT object from col_summary_tbl()
2150
- data_source: Name of the data source being scanned
2151
- source_type: Type of data source (e.g., "Pointblank dataset: small_table")
2152
- table_type: Type of table (e.g., "polars.LazyFrame")
2153
- total_rows: Total number of rows in the dataset
2154
- total_columns: Total number of columns in the dataset
2395
+ gt_table: The GT table object for missing values
2396
+ original_data: The original data source to extract column types
2397
+ missing_info: Dict with metadata including source_type, table_type, total_rows, total_columns
2155
2398
  """
2156
2399
  try:
2157
- import re
2400
+ # Extract the underlying data from the GT table
2401
+ df = None
2158
2402
 
2159
- import narwhals as nw
2160
- from rich.box import SIMPLE_HEAD
2403
+ if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
2404
+ df = gt_table._tbl_data
2405
+ elif hasattr(gt_table, "_data") and gt_table._data is not None:
2406
+ df = gt_table._data
2407
+ elif hasattr(gt_table, "data") and gt_table.data is not None:
2408
+ df = gt_table.data
2161
2409
 
2162
- # Extract the underlying DataFrame from the GT object
2163
- # The GT object has a _tbl_data attribute that contains the DataFrame
2164
- gt_data = scan_result._tbl_data
2410
+ if df is not None:
2411
+ from rich.box import SIMPLE_HEAD
2165
2412
 
2166
- # Convert to Narwhals DataFrame for consistent handling
2167
- nw_data = nw.from_native(gt_data)
2413
+ # Extract metadata from missing_info or use defaults
2414
+ source_type = "Data source"
2415
+ table_type = "unknown"
2416
+ total_rows = None
2417
+ total_columns = None
2168
2418
 
2169
- # Convert to dictionary for easier access
2170
- data_dict = nw_data.to_dict(as_series=False)
2419
+ if missing_info:
2420
+ source_type = missing_info.get("source_type", "Data source")
2421
+ table_type = missing_info.get("table_type", "unknown")
2422
+ total_rows = missing_info.get("total_rows")
2423
+ total_columns = missing_info.get("total_columns")
2171
2424
 
2172
- # Create main scan table with missing data table styling
2173
- # Create a comprehensive title with data source, source type, and table type
2174
- title_text = f"Column Summary / {source_type} / {table_type}"
2425
+ # Create enhanced title matching the scan table format
2426
+ title_text = f"Missing Values / {source_type} / {table_type}"
2175
2427
 
2176
- # Add dimensions subtitle in gray if available
2177
- if total_rows is not None and total_columns is not None:
2178
- title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2428
+ # Add dimensions subtitle in gray if available
2429
+ if total_rows is not None and total_columns is not None:
2430
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2179
2431
 
2180
- # Create the scan table
2181
- scan_table = Table(
2182
- title=title_text,
2183
- show_header=True,
2184
- header_style="bold magenta",
2185
- box=SIMPLE_HEAD,
2186
- title_style="bold cyan",
2187
- title_justify="left",
2188
- )
2432
+ # Get column names
2433
+ columns = []
2434
+ try:
2435
+ if hasattr(df, "columns"):
2436
+ columns = list(df.columns)
2437
+ elif hasattr(df, "schema"):
2438
+ columns = list(df.schema.names)
2439
+ except Exception as e:
2440
+ console.print(f"[red]Error getting columns:[/red] {e}")
2441
+ columns = []
2189
2442
 
2190
- # Add columns with specific styling and appropriate widths
2191
- scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2192
- scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2193
- scan_table.add_column(
2194
- "NA", style="red", width=6, justify="right"
2195
- ) # Adjusted for better formatting
2196
- scan_table.add_column(
2197
- "UQ", style="green", width=8, justify="right"
2198
- ) # Adjusted for boolean values
2443
+ if not columns:
2444
+ columns = [f"Column {i + 1}" for i in range(10)] # Fallback
2199
2445
 
2200
- # Add statistical columns if they exist with appropriate widths
2201
- stat_columns = []
2202
- column_mapping = {
2203
- "mean": ("Mean", "blue", 9),
2204
- "std": ("SD", "blue", 9),
2205
- "min": ("Min", "yellow", 9),
2206
- "median": ("Med", "yellow", 9),
2207
- "max": ("Max", "yellow", 9),
2208
- "q_1": ("Q₁", "magenta", 8),
2209
- "q_3": ("Q₃", "magenta", 9),
2210
- "iqr": ("IQR", "magenta", 8),
2211
- }
2446
+ # Get original data to extract column types
2447
+ column_types = {}
2448
+ if original_data is not None:
2449
+ try:
2450
+ # Get column types from original data
2451
+ if hasattr(original_data, "columns"):
2452
+ original_columns = list(original_data.columns)
2453
+ column_types = _get_column_dtypes(original_data, original_columns)
2454
+ except Exception as e:
2455
+ console.print(f"[red]Error getting column types:[/red] {e}")
2456
+ pass # Use empty dict as fallback
2212
2457
 
2213
- for col_key, (display_name, color, width) in column_mapping.items():
2214
- if col_key in data_dict:
2215
- scan_table.add_column(display_name, style=color, width=width, justify="right")
2216
- stat_columns.append(col_key)
2458
+ # Add columns to Rich table with special formatting for missing values table
2459
+ sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
2217
2460
 
2218
- # Helper function to extract column name and type from HTML
2219
- def extract_column_info(html_content: str) -> tuple[str, str]:
2220
- """Extract column name and type from HTML formatted content."""
2221
- # Extract column name from first div
2222
- name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
2223
- column_name = name_match.group(1) if name_match else "Unknown"
2461
+ # Print the title first
2462
+ console.print()
2463
+ console.print(f"[bold cyan]{title_text}[/bold cyan]")
2224
2464
 
2225
- # Extract data type from second div (with gray color)
2226
- type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
2227
- if type_match:
2228
- data_type = type_match.group(1)
2229
- # Convert to compact format using the existing function
2230
- compact_type = _format_dtype_compact(data_type)
2231
- data_type = compact_type
2232
- else:
2233
- data_type = "unknown"
2465
+ # Show the custom spanner header if we have sector columns
2466
+ if sector_columns:
2467
+ # Create a custom header line that shows the spanner
2468
+ header_parts = []
2469
+ header_parts.append(" " * 20) # Space for Column header
2470
+ header_parts.append(" " * 10) # Space for Type header
2234
2471
 
2235
- return column_name, data_type
2472
+ # Left-align "Row Sectors" with the first numbered column
2473
+ row_sectors_text = "Row Sectors"
2474
+ header_parts.append(row_sectors_text)
2236
2475
 
2237
- # Helper function to format values with improved number formatting
2238
- def format_value(
2239
- value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
2240
- ) -> str:
2241
- """Format values for display with smart number formatting and HTML cleanup."""
2242
- if value is None or (isinstance(value, str) and value.strip() == ""):
2243
- return "[dim]—[/dim]"
2476
+ # Print the custom spanner header
2477
+ console.print("[dim]" + " ".join(header_parts) + "[/dim]")
2244
2478
 
2245
- # Handle missing values indicator
2246
- if is_missing and str(value) == "0":
2247
- return "[green]●[/green]" # No missing values
2479
+ # Add a horizontal rule below the spanner
2480
+ rule_parts = []
2481
+ rule_parts.append(" " * 20) # Space for Column header
2482
+ rule_parts.append(" " * 10) # Space for Type header
2248
2483
 
2249
- # Clean up HTML formatting from the raw data
2250
- str_val = str(value)
2484
+ # Use a fixed width horizontal rule for "Row Sectors"
2485
+ horizontal_rule = "─" * 20
2486
+ rule_parts.append(horizontal_rule)
2251
2487
 
2252
- # Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
2253
- if "<br>" in str_val:
2254
- str_val = str_val.split("<br>")[0].strip()
2255
- # For unique values, we want just the integer part
2256
- if is_unique:
2257
- try:
2258
- # Try to extract just the integer part for unique counts
2259
- num_val = float(str_val)
2260
- return str(int(num_val))
2261
- except (ValueError, TypeError):
2262
- pass
2488
+ # Print the horizontal rule
2489
+ console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
2263
2490
 
2264
- # Now handle HTML content (especially from boolean unique values)
2265
- if "<" in str_val and ">" in str_val:
2266
- # Remove HTML tags completely for cleaner display
2491
+ # Create the missing values table WITHOUT the title (since we printed it above)
2492
+ rich_table = Table(
2493
+ show_header=True,
2494
+ header_style="bold magenta",
2495
+ box=SIMPLE_HEAD,
2496
+ )
2497
+
2498
+ # Two separate columns: Column name (20 chars) and Data type (10 chars)
2499
+ rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2500
+ rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2501
+
2502
+ # Sector columns: All same width, optimized for "100%" (4 chars + padding)
2503
+ for sector in sector_columns:
2504
+ rich_table.add_column(
2505
+ sector,
2506
+ style="cyan",
2507
+ justify="center",
2508
+ no_wrap=True,
2509
+ width=5, # Fixed width optimized for percentage values
2510
+ )
2511
+
2512
+ # Convert data to rows with special formatting
2513
+ rows = []
2514
+ try:
2515
+ if hasattr(df, "to_dicts"):
2516
+ data_dict = df.to_dicts()
2517
+ elif hasattr(df, "to_dict"):
2518
+ data_dict = df.to_dict("records")
2519
+ else:
2520
+ data_dict = []
2521
+
2522
+ for i, row in enumerate(data_dict):
2523
+ try:
2524
+ # Each row should have: [column_name, data_type, sector1, sector2, ...]
2525
+ column_name = str(row.get("columns", ""))
2526
+
2527
+ # Truncate column name to 20 characters with ellipsis if needed
2528
+ if len(column_name) > 20:
2529
+ truncated_name = column_name[:17] + "…"
2530
+ else:
2531
+ truncated_name = column_name
2532
+
2533
+ # Get data type for this column
2534
+ if column_name in column_types:
2535
+ dtype = column_types[column_name]
2536
+ if len(dtype) > 10:
2537
+ truncated_dtype = dtype[:9] + "…"
2538
+ else:
2539
+ truncated_dtype = dtype
2540
+ else:
2541
+ truncated_dtype = "?"
2542
+
2543
+ # Start building the row with column name and type
2544
+ formatted_row = [truncated_name, truncated_dtype]
2545
+
2546
+ # Add sector values (formatted percentages)
2547
+ for sector in sector_columns:
2548
+ value = row.get(sector, 0.0)
2549
+ if isinstance(value, (int, float)):
2550
+ formatted_row.append(_format_missing_percentage(float(value)))
2551
+ else:
2552
+ formatted_row.append(str(value))
2553
+
2554
+ rows.append(formatted_row)
2555
+
2556
+ except Exception as e:
2557
+ console.print(f"[red]Error processing row {i}:[/red] {e}")
2558
+ continue
2559
+
2560
+ except Exception as e:
2561
+ console.print(f"[red]Error extracting data:[/red] {e}")
2562
+ rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
2563
+
2564
+ # Add rows to Rich table
2565
+ for row in rows:
2566
+ try:
2567
+ rich_table.add_row(*row)
2568
+ except Exception as e:
2569
+ console.print(f"[red]Error adding row:[/red] {e}")
2570
+ break
2571
+
2572
+ # Print the Rich table (without title since we already printed it)
2573
+ console.print(rich_table)
2574
+
2575
+ footer_text = (
2576
+ "[dim]Symbols: [green]●[/green] = no missing vals in sector, "
2577
+ "[red]●[/red] = all vals completely missing, "
2578
+ "[cyan]x%[/cyan] = percentage missing[/dim]"
2579
+ )
2580
+ console.print(footer_text)
2581
+
2582
+ else:
2583
+ # Fallback to regular table display
2584
+ _rich_print_gt_table(gt_table)
2585
+
2586
+ except Exception as e:
2587
+ console.print(f"[red]Error rendering missing values table:[/red] {e}")
2588
+ # Fallback to regular table display
2589
+ _rich_print_gt_table(gt_table)
2590
+
2591
+
2592
+ def _rich_print_scan_table(
2593
+ scan_result: Any,
2594
+ data_source: str,
2595
+ source_type: str,
2596
+ table_type: str,
2597
+ total_rows: int | None = None,
2598
+ total_columns: int | None = None,
2599
+ ) -> None:
2600
+ """
2601
+ Display scan results as a Rich table in the terminal with statistical measures.
2602
+
2603
+ Args:
2604
+ scan_result: The GT object from col_summary_tbl()
2605
+ data_source: Name of the data source being scanned
2606
+ source_type: Type of data source (e.g., "Pointblank dataset: small_table")
2607
+ table_type: Type of table (e.g., "polars.LazyFrame")
2608
+ total_rows: Total number of rows in the dataset
2609
+ total_columns: Total number of columns in the dataset
2610
+ """
2611
+ try:
2612
+ import re
2613
+
2614
+ import narwhals as nw
2615
+ from rich.box import SIMPLE_HEAD
2616
+
2617
+ # Extract the underlying DataFrame from the GT object
2618
+ # The GT object has a _tbl_data attribute that contains the DataFrame
2619
+ gt_data = scan_result._tbl_data
2620
+
2621
+ # Convert to Narwhals DataFrame for consistent handling
2622
+ nw_data = nw.from_native(gt_data)
2623
+
2624
+ # Convert to dictionary for easier access
2625
+ data_dict = nw_data.to_dict(as_series=False)
2626
+
2627
+ # Create main scan table with missing data table styling
2628
+ # Create a comprehensive title with data source, source type, and table type
2629
+ title_text = f"Column Summary / {source_type} / {table_type}"
2630
+
2631
+ # Add dimensions subtitle in gray if available
2632
+ if total_rows is not None and total_columns is not None:
2633
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2634
+
2635
+ # Create the scan table
2636
+ scan_table = Table(
2637
+ title=title_text,
2638
+ show_header=True,
2639
+ header_style="bold magenta",
2640
+ box=SIMPLE_HEAD,
2641
+ title_style="bold cyan",
2642
+ title_justify="left",
2643
+ )
2644
+
2645
+ # Add columns with specific styling and appropriate widths
2646
+ scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2647
+ scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2648
+ scan_table.add_column(
2649
+ "NA", style="red", width=6, justify="right"
2650
+ ) # Adjusted for better formatting
2651
+ scan_table.add_column(
2652
+ "UQ", style="green", width=8, justify="right"
2653
+ ) # Adjusted for boolean values
2654
+
2655
+ # Add statistical columns if they exist with appropriate widths
2656
+ stat_columns = []
2657
+ column_mapping = {
2658
+ "mean": ("Mean", "blue", 9),
2659
+ "std": ("SD", "blue", 9),
2660
+ "min": ("Min", "yellow", 9),
2661
+ "median": ("Med", "yellow", 9),
2662
+ "max": ("Max", "yellow", 9),
2663
+ "q_1": ("Q₁", "magenta", 8),
2664
+ "q_3": ("Q₃", "magenta", 9),
2665
+ "iqr": ("IQR", "magenta", 8),
2666
+ }
2667
+
2668
+ for col_key, (display_name, color, width) in column_mapping.items():
2669
+ if col_key in data_dict:
2670
+ scan_table.add_column(display_name, style=color, width=width, justify="right")
2671
+ stat_columns.append(col_key)
2672
+
2673
+ # Helper function to extract column name and type from HTML
2674
+ def extract_column_info(html_content: str) -> tuple[str, str]:
2675
+ """Extract column name and type from HTML formatted content."""
2676
+ # Extract column name from first div
2677
+ name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
2678
+ column_name = name_match.group(1) if name_match else "Unknown"
2679
+
2680
+ # Extract data type from second div (with gray color)
2681
+ type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
2682
+ if type_match:
2683
+ data_type = type_match.group(1)
2684
+ # Convert to compact format using the existing function
2685
+ compact_type = _format_dtype_compact(data_type)
2686
+ data_type = compact_type
2687
+ else:
2688
+ data_type = "unknown"
2689
+
2690
+ return column_name, data_type
2691
+
2692
+ # Helper function to format values with improved number formatting
2693
+ def format_value(
2694
+ value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
2695
+ ) -> str:
2696
+ """Format values for display with smart number formatting and HTML cleanup."""
2697
+ if value is None or (isinstance(value, str) and value.strip() == ""):
2698
+ return "[dim]—[/dim]"
2699
+
2700
+ # Handle missing values indicator
2701
+ if is_missing and str(value) == "0":
2702
+ return "[green]●[/green]" # No missing values
2703
+
2704
+ # Clean up HTML formatting from the raw data
2705
+ str_val = str(value)
2706
+
2707
+ # Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
2708
+ if "<br>" in str_val:
2709
+ str_val = str_val.split("<br>")[0].strip()
2710
+ # For unique values, we want just the integer part
2711
+ if is_unique:
2712
+ try:
2713
+ # Try to extract just the integer part for unique counts
2714
+ num_val = float(str_val)
2715
+ return str(int(num_val))
2716
+ except (ValueError, TypeError):
2717
+ pass
2718
+
2719
+ # Now handle HTML content (especially from boolean unique values)
2720
+ if "<" in str_val and ">" in str_val:
2721
+ # Remove HTML tags completely for cleaner display
2267
2722
  str_val = re.sub(r"<[^>]+>", "", str_val).strip()
2268
2723
  # Clean up extra whitespace
2269
2724
  str_val = re.sub(r"\s+", " ", str_val).strip()
@@ -2423,8 +2878,36 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
2423
2878
  if df is not None:
2424
2879
  from rich.box import SIMPLE_HEAD
2425
2880
 
2426
- # Create the missing values table
2427
- rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
2881
+ # Get metadata for enhanced missing table title
2882
+ total_rows = None
2883
+ total_columns = None
2884
+ source_type = "Data source"
2885
+ table_type = "unknown"
2886
+
2887
+ if original_data is not None:
2888
+ try:
2889
+ total_rows = pb.get_row_count(original_data)
2890
+ total_columns = pb.get_column_count(original_data)
2891
+ table_type = _get_tbl_type(original_data)
2892
+ except Exception:
2893
+ pass
2894
+
2895
+ # Create enhanced title matching the scan table format
2896
+ title_text = f"Missing Values / {source_type} / {table_type}"
2897
+
2898
+ # Add dimensions subtitle in gray if available
2899
+ if total_rows is not None and total_columns is not None:
2900
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2901
+
2902
+ # Create the missing values table with enhanced title
2903
+ rich_table = Table(
2904
+ title=title_text,
2905
+ show_header=True,
2906
+ header_style="bold magenta",
2907
+ box=SIMPLE_HEAD,
2908
+ title_style="bold cyan",
2909
+ title_justify="left",
2910
+ )
2428
2911
 
2429
2912
  # Get column names
2430
2913
  columns = []
@@ -2556,12 +3039,12 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
2556
3039
  console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
2557
3040
 
2558
3041
  # Print the Rich table (will handle terminal width automatically)
3042
+ console.print()
2559
3043
  console.print(rich_table)
2560
3044
  footer_text = (
2561
- "[dim]Symbols: [green]●[/green] = no missing values, "
2562
- "[red]●[/red] = completely missing, "
2563
- "<1% = less than 1% missing, "
2564
- ">99% = more than 99% missing[/dim]"
3045
+ "[dim]Symbols: [green]●[/green] = no missing vals in sector, "
3046
+ "[red]●[/red] = all vals completely missing, "
3047
+ "[cyan]x%[/cyan] = percentage missing[/dim]"
2565
3048
  )
2566
3049
  console.print(footer_text)
2567
3050
 
@@ -2700,6 +3183,20 @@ def _display_validation_result(
2700
3183
  set_val = sets_list[step_index] if step_index < len(sets_list) else None
2701
3184
  value = values_list[step_index] if step_index < len(values_list) else None
2702
3185
 
3186
+ # Check if this is piped data
3187
+ is_piped_data = _is_piped_data_source(data_source)
3188
+
3189
+ # Create friendly display name for data source
3190
+ if is_piped_data:
3191
+ if data_source.endswith(".parquet"):
3192
+ display_source = "Polars expression (serialized to Parquet) from `pb pl`"
3193
+ elif data_source.endswith(".csv"):
3194
+ display_source = "Polars expression (serialized to CSV) from `pb pl`"
3195
+ else:
3196
+ display_source = "Polars expression from `pb pl`"
3197
+ else:
3198
+ display_source = data_source
3199
+
2703
3200
  # Get validation step info
2704
3201
  step_info = None
2705
3202
  if hasattr(validation, "validation_info") and len(validation.validation_info) > step_index:
@@ -2766,7 +3263,7 @@ def _display_validation_result(
2766
3263
  result_table.add_column("Value", style="white")
2767
3264
 
2768
3265
  # Add basic info
2769
- result_table.add_row("Data Source", data_source)
3266
+ result_table.add_row("Data Source", display_source)
2770
3267
  result_table.add_row("Check Type", check)
2771
3268
 
2772
3269
  # Add column info for column-specific checks
@@ -3128,6 +3625,18 @@ def _show_extract_and_summary(
3128
3625
  """Show extract and summary for a validation step (used for single checks)."""
3129
3626
  step_passed = step_info.n_failed == 0 if step_info else True
3130
3627
 
3628
+ # Get the friendly display name
3629
+ is_piped_data = _is_piped_data_source(data_source)
3630
+ if is_piped_data:
3631
+ if data_source.endswith(".parquet"):
3632
+ display_source = "Polars expression (serialized to Parquet) from `pb pl`"
3633
+ elif data_source.endswith(".csv"):
3634
+ display_source = "Polars expression (serialized to CSV) from `pb pl`"
3635
+ else:
3636
+ display_source = "Polars expression from `pb pl`"
3637
+ else:
3638
+ display_source = data_source
3639
+
3131
3640
  # Show extract if requested and validation failed
3132
3641
  if (show_extract or write_extract) and not step_passed:
3133
3642
  console.print()
@@ -3281,54 +3790,54 @@ def _show_extract_and_summary(
3281
3790
  if step_passed:
3282
3791
  if check == "rows-distinct":
3283
3792
  success_message = (
3284
- f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
3793
+ f"[green]✓ Validation PASSED: No duplicate rows found in {display_source}[/green]"
3285
3794
  )
3286
3795
  elif check == "col-vals-not-null":
3287
- success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
3796
+ success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {display_source}[/green]"
3288
3797
  elif check == "rows-complete":
3289
- success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
3798
+ success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {display_source}[/green]"
3290
3799
  elif check == "col-exists":
3291
3800
  success_message = (
3292
- f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
3801
+ f"[green]✓ Validation PASSED: Column '{column}' exists in {display_source}[/green]"
3293
3802
  )
3294
3803
  elif check == "col-vals-in-set":
3295
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
3804
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {display_source}[/green]"
3296
3805
  elif check == "col-vals-gt":
3297
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
3806
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {display_source}[/green]"
3298
3807
  elif check == "col-vals-ge":
3299
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
3808
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {display_source}[/green]"
3300
3809
  elif check == "col-vals-lt":
3301
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
3810
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {display_source}[/green]"
3302
3811
  elif check == "col-vals-le":
3303
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
3812
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {display_source}[/green]"
3304
3813
  else:
3305
3814
  success_message = (
3306
- f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
3815
+ f"[green]✓ Validation PASSED: {check} check passed for {display_source}[/green]"
3307
3816
  )
3308
3817
 
3309
3818
  console.print(Panel(success_message, border_style="green", expand=False))
3310
3819
  else:
3311
3820
  if step_info:
3312
3821
  if check == "rows-distinct":
3313
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
3822
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {display_source}[/red]"
3314
3823
  elif check == "col-vals-not-null":
3315
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
3824
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {display_source}[/red]"
3316
3825
  elif check == "rows-complete":
3317
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
3826
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {display_source}[/red]"
3318
3827
  elif check == "col-exists":
3319
- failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
3828
+ failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {display_source}[/red]"
3320
3829
  elif check == "col-vals-in-set":
3321
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
3830
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {display_source}[/red]"
3322
3831
  elif check == "col-vals-gt":
3323
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
3832
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {display_source}[/red]"
3324
3833
  elif check == "col-vals-ge":
3325
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
3834
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {display_source}[/red]"
3326
3835
  elif check == "col-vals-lt":
3327
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
3836
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {display_source}[/red]"
3328
3837
  elif check == "col-vals-le":
3329
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
3838
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {display_source}[/red]"
3330
3839
  else:
3331
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
3840
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {display_source}[/red]"
3332
3841
 
3333
3842
  # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
3334
3843
  if not show_extract and check != "col-exists":
@@ -3338,15 +3847,15 @@ def _show_extract_and_summary(
3338
3847
  else:
3339
3848
  if check == "rows-distinct":
3340
3849
  failure_message = (
3341
- f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
3850
+ f"[red]✗ Validation FAILED: Duplicate rows found in {display_source}[/red]"
3342
3851
  )
3343
3852
  elif check == "rows-complete":
3344
3853
  failure_message = (
3345
- f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
3854
+ f"[red]✗ Validation FAILED: Incomplete rows found in {display_source}[/red]"
3346
3855
  )
3347
3856
  else:
3348
3857
  failure_message = (
3349
- f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
3858
+ f"[red]✗ Validation FAILED: {check} check failed for {display_source}[/red]"
3350
3859
  )
3351
3860
 
3352
3861
  # Add hint about --show-extract if not already used
@@ -3357,31 +3866,131 @@ def _show_extract_and_summary(
3357
3866
 
3358
3867
 
3359
3868
  @cli.command()
3360
- @click.argument("output_file", type=click.Path())
3361
- def make_template(output_file: str):
3869
+ @click.argument("output_file", type=click.Path(), required=False)
3870
+ def make_template(output_file: str | None):
3362
3871
  """
3363
- Create a validation script template.
3872
+ Create a validation script or YAML configuration template.
3873
+
3874
+ Creates a sample Python script or YAML configuration with examples showing how to use Pointblank
3875
+ for data validation. The template type is determined by the file extension:
3876
+ - .py files create Python script templates
3877
+ - .yaml/.yml files create YAML configuration templates
3364
3878
 
3365
- Creates a sample Python script with examples showing how to use Pointblank
3366
- for data validation. Edit the template to add your own data loading and
3367
- validation rules, then run it with 'pb run'.
3879
+ Edit the template to add your own data loading and validation rules, then run it with 'pb run'.
3368
3880
 
3369
- OUTPUT_FILE is the path where the template script will be created.
3881
+ OUTPUT_FILE is the path where the template will be created.
3370
3882
 
3371
3883
  Examples:
3372
3884
 
3373
3885
  \b
3374
- pb make-template my_validation.py
3375
- pb make-template validation_template.py
3886
+ pb make-template my_validation.py # Creates Python script template
3887
+ pb make-template my_validation.yaml # Creates YAML config template
3888
+ pb make-template validation_template.yml # Creates YAML config template
3376
3889
  """
3377
- example_script = '''"""
3378
- Example Pointblank validation script.
3890
+ # Handle missing output_file with concise help
3891
+ if output_file is None:
3892
+ _show_concise_help("make-template", None)
3893
+ return
3379
3894
 
3380
- This script demonstrates how to create validation rules for your data.
3381
- Modify the data loading and validation rules below to match your requirements.
3895
+ # Detect file type based on extension
3896
+ file_path = Path(output_file)
3897
+ file_extension = file_path.suffix.lower()
3382
3898
 
3383
- When using 'pb run' with --data option, the CLI will automatically replace
3384
- the data source in your validation object with the provided data.
3899
+ is_yaml_file = file_extension in [".yaml", ".yml"]
3900
+ is_python_file = file_extension == ".py"
3901
+
3902
+ if not is_yaml_file and not is_python_file:
3903
+ console.print(
3904
+ f"[yellow]Warning:[/yellow] Unknown file extension '{file_extension}'. "
3905
+ "Creating Python template by default. Use .py, .yaml, or .yml extensions for specific template types."
3906
+ )
3907
+ is_python_file = True
3908
+
3909
+ if is_yaml_file:
3910
+ # Create YAML template
3911
+ example_yaml = """# Example Pointblank YAML validation configuration
3912
+ #
3913
+ # This YAML file demonstrates how to create validation rules for your data.
3914
+ # Modify the data source and validation steps below to match your requirements.
3915
+ #
3916
+ # When using 'pb run' with --data option, the CLI will automatically replace
3917
+ # the 'tbl' field with the provided data source.
3918
+
3919
+ # Data source configuration
3920
+ tbl: small_table # Replace with your data source
3921
+ # Can be: dataset name, CSV file, Parquet file, database connection, etc.
3922
+
3923
+ # Optional: Table name for reporting (defaults to filename if not specified)
3924
+ tbl_name: "Example Validation"
3925
+
3926
+ # Optional: Label for this validation run
3927
+ label: "Validation Template"
3928
+
3929
+ # Optional: Validation thresholds (defaults shown below)
3930
+ # thresholds:
3931
+ # warning: 0.05 # 5% failure rate triggers warning
3932
+ # error: 0.10 # 10% failure rate triggers error
3933
+ # critical: 0.15 # 15% failure rate triggers critical
3934
+
3935
+ # Validation steps to perform
3936
+ steps:
3937
+ # Check for duplicate rows across all columns
3938
+ - rows_distinct
3939
+
3940
+ # Check that required columns exist
3941
+ - col_exists:
3942
+ columns: [column1, column2] # Replace with your actual column names
3943
+
3944
+ # Check for null values in important columns
3945
+ - col_vals_not_null:
3946
+ columns: important_column # Replace with your actual column name
3947
+
3948
+ # Check value ranges (uncomment and modify as needed)
3949
+ # - col_vals_gt:
3950
+ # columns: amount
3951
+ # value: 0
3952
+
3953
+ # - col_vals_between:
3954
+ # columns: score
3955
+ # left: 0
3956
+ # right: 100
3957
+
3958
+ # Check string patterns (uncomment and modify as needed)
3959
+ # - col_vals_regex:
3960
+ # columns: email
3961
+ # pattern: "^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$"
3962
+
3963
+ # Check for unique values (uncomment and modify as needed)
3964
+ # - col_vals_unique:
3965
+ # columns: id
3966
+
3967
+ # Check values are in allowed set (uncomment and modify as needed)
3968
+ # - col_vals_in_set:
3969
+ # columns: status
3970
+ # set: [active, inactive, pending]
3971
+
3972
+ # Add more validation steps as needed
3973
+ # See the Pointblank documentation for the full list of available validation functions
3974
+ """
3975
+
3976
+ Path(output_file).write_text(example_yaml)
3977
+ console.print(f"[green]✓[/green] YAML validation template created: {output_file}")
3978
+ console.print("\nEdit the template to add your data source and validation rules, then run:")
3979
+ console.print(f"[cyan]pb run {output_file}[/cyan]")
3980
+ console.print(
3981
+ f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Override data source[/dim]"
3982
+ )
3983
+
3984
+ else:
3985
+ # Create Python template
3986
+ example_script = '''"""
3987
+ Example Pointblank validation script.
3988
+
3989
+ This script demonstrates how to create validation rules for your data.
3990
+ Modify the data loading and validation rules below to match your requirements.
3991
+
3992
+ When using 'pb run' with --data option, the CLI will automatically replace
3993
+ the data source in your validation object with the provided data.
3385
3994
  """
3386
3995
 
3387
3996
  import pointblank as pb
@@ -3427,21 +4036,23 @@ validation = (
3427
4036
  )
3428
4037
  '''
3429
4038
 
3430
- Path(output_file).write_text(example_script)
3431
- console.print(f"[green]✓[/green] Validation script template created: {output_file}")
3432
- console.print("\nEdit the template to add your data loading and validation rules, then run:")
3433
- console.print(f"[cyan]pb run {output_file}[/cyan]")
3434
- console.print(
3435
- f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Replace data source automatically[/dim]"
3436
- )
4039
+ Path(output_file).write_text(example_script)
4040
+ console.print(f"[green]✓[/green] Python validation template created: {output_file}")
4041
+ console.print(
4042
+ "\nEdit the template to add your data loading and validation rules, then run:"
4043
+ )
4044
+ console.print(f"[cyan]pb run {output_file}[/cyan]")
4045
+ console.print(
4046
+ f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Replace data source automatically[/dim]"
4047
+ )
3437
4048
 
3438
4049
 
3439
4050
  @cli.command()
3440
- @click.argument("validation_script", type=click.Path(exists=True))
4051
+ @click.argument("validation_file", type=click.Path(exists=True), required=False)
3441
4052
  @click.option(
3442
4053
  "--data",
3443
4054
  type=str,
3444
- help="Data source to replace in validation objects (single validation scripts only)",
4055
+ help="Data source to replace in validation objects (Python scripts and YAML configs)",
3445
4056
  )
3446
4057
  @click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
3447
4058
  @click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
@@ -3462,7 +4073,7 @@ validation = (
3462
4073
  help="Exit with non-zero code when validation reaches this threshold level",
3463
4074
  )
3464
4075
  def run(
3465
- validation_script: str,
4076
+ validation_file: str | None,
3466
4077
  data: str | None,
3467
4078
  output_html: str | None,
3468
4079
  output_json: str | None,
@@ -3472,16 +4083,19 @@ def run(
3472
4083
  fail_on: str | None,
3473
4084
  ):
3474
4085
  """
3475
- Run a Pointblank validation script.
4086
+ Run a Pointblank validation script or YAML configuration.
3476
4087
 
3477
- VALIDATION_SCRIPT should be a Python file that defines validation logic.
3478
- The script should load its own data and create validation objects.
4088
+ VALIDATION_FILE can be:
4089
+ - A Python file (.py) that defines validation logic
4090
+ - A YAML configuration file (.yaml, .yml) that defines validation steps
4091
+
4092
+ Python scripts should load their own data and create validation objects.
4093
+ YAML configurations define data sources and validation steps declaratively.
3479
4094
 
3480
4095
  If --data is provided, it will automatically replace the data source in your
3481
- validation objects. This works with scripts containing a single validation.
3482
- For scripts with multiple validations, use separate script files or remove --data.
4096
+ validation objects (Python scripts) or override the 'tbl' field (YAML configs).
3483
4097
 
3484
- To get started quickly, use 'pb make-template' to create a validation script template.
4098
+ To get started quickly, use 'pb make-template' to create templates.
3485
4099
 
3486
4100
  DATA can be:
3487
4101
 
@@ -3495,14 +4109,34 @@ def run(
3495
4109
  Examples:
3496
4110
 
3497
4111
  \b
3498
- pb make-template my_validation.py # Create a template first
4112
+ pb make-template my_validation.py # Create a Python template
3499
4113
  pb run validation_script.py
4114
+ pb run validation_config.yaml
3500
4115
  pb run validation_script.py --data data.csv
3501
- pb run validation_script.py --data small_table --output-html report.html
4116
+ pb run validation_config.yaml --data small_table --output-html report.html
3502
4117
  pb run validation_script.py --show-extract --fail-on error
3503
- pb run validation_script.py --write-extract extracts_folder --fail-on critical
4118
+ pb run validation_config.yaml --write-extract extracts_folder --fail-on critical
3504
4119
  """
3505
4120
  try:
4121
+ # Handle missing validation_file with concise help
4122
+ if validation_file is None:
4123
+ _show_concise_help("run", None)
4124
+ return
4125
+
4126
+ # Detect file type based on extension
4127
+ file_path = Path(validation_file)
4128
+ file_extension = file_path.suffix.lower()
4129
+
4130
+ is_yaml_file = file_extension in [".yaml", ".yml"]
4131
+ is_python_file = file_extension == ".py"
4132
+
4133
+ if not is_yaml_file and not is_python_file:
4134
+ console.print(
4135
+ f"[red]Error:[/red] Unsupported file type '{file_extension}'. "
4136
+ "Only .py (Python scripts) and .yaml/.yml (YAML configs) are supported."
4137
+ )
4138
+ sys.exit(1)
4139
+
3506
4140
  # Load optional data override if provided
3507
4141
  cli_data = None
3508
4142
  if data:
@@ -3510,60 +4144,94 @@ def run(
3510
4144
  cli_data = _load_data_source(data)
3511
4145
  console.print(f"[green]✓[/green] Loaded data override: {data}")
3512
4146
 
3513
- # Execute the validation script
3514
- with console.status("[bold green]Running validation script..."):
3515
- # Read and execute the validation script
3516
- script_content = Path(validation_script).read_text()
4147
+ # Process based on file type
4148
+ validations = []
3517
4149
 
3518
- # Create a namespace with pointblank and optional CLI data
3519
- namespace = {
3520
- "pb": pb,
3521
- "pointblank": pb,
3522
- "cli_data": cli_data, # Available if --data was provided
3523
- "__name__": "__main__",
3524
- "__file__": str(Path(validation_script).resolve()),
3525
- }
4150
+ if is_yaml_file:
4151
+ # Handle YAML configuration file
4152
+ from pointblank.yaml import YAMLValidationError, YAMLValidator, yaml_interrogate
3526
4153
 
3527
- # Execute the script
3528
- try:
3529
- exec(script_content, namespace)
3530
- except Exception as e:
3531
- console.print(f"[red]Error executing validation script:[/red] {e}")
3532
- sys.exit(1)
4154
+ with console.status("[bold green]Running YAML validation..."):
4155
+ try:
4156
+ if cli_data is not None:
4157
+ # Load and modify YAML config to use CLI data
4158
+ console.print(
4159
+ "[yellow]Replacing data source in YAML config with CLI data[/yellow]"
4160
+ )
3533
4161
 
3534
- # Look for validation objects in the namespace
3535
- validations = []
4162
+ validator = YAMLValidator()
4163
+ config = validator.load_config(validation_file)
3536
4164
 
3537
- # Look for the 'validation' variable specifically first
3538
- if "validation" in namespace:
3539
- validations.append(namespace["validation"])
4165
+ # Replace the 'tbl' field with our CLI data
4166
+ # Note: We pass the CLI data object directly instead of a string
4167
+ config["tbl"] = cli_data
3540
4168
 
3541
- # Also look for any other validation objects
3542
- for key, value in namespace.items():
3543
- if (
3544
- key != "validation"
3545
- and hasattr(value, "interrogate")
3546
- and hasattr(value, "validation_info")
3547
- ):
3548
- validations.append(value)
3549
- # Also check if it's a Validate object that has been interrogated
3550
- elif key != "validation" and str(type(value)).find("Validate") != -1:
3551
- validations.append(value)
3552
-
3553
- if not validations:
3554
- raise ValueError(
3555
- "No validation objects found in script. "
3556
- "Script should create Validate objects and call .interrogate() on them."
3557
- )
4169
+ # Build and execute validation with modified config
4170
+ validation = validator.execute_workflow(config)
4171
+
4172
+ else:
4173
+ # Use YAML config as-is
4174
+ validation = yaml_interrogate(validation_file)
4175
+
4176
+ validations.append(validation)
4177
+
4178
+ except YAMLValidationError as e:
4179
+ console.print(f"[red]YAML validation error:[/red] {e}")
4180
+ sys.exit(1)
4181
+
4182
+ else:
4183
+ # Handle Python script file
4184
+ with console.status("[bold green]Running Python validation script..."):
4185
+ # Read and execute the validation script
4186
+ script_content = Path(validation_file).read_text()
4187
+
4188
+ # Create a namespace with pointblank and optional CLI data
4189
+ namespace = {
4190
+ "pb": pb,
4191
+ "pointblank": pb,
4192
+ "cli_data": cli_data, # Available if --data was provided
4193
+ "__name__": "__main__",
4194
+ "__file__": str(Path(validation_file).resolve()),
4195
+ }
4196
+
4197
+ # Execute the script
4198
+ try:
4199
+ exec(script_content, namespace)
4200
+ except Exception as e:
4201
+ console.print(f"[red]Error executing validation script:[/red] {e}")
4202
+ sys.exit(1)
4203
+
4204
+ # Look for validation objects in the namespace
4205
+ # Look for the 'validation' variable specifically first
4206
+ if "validation" in namespace:
4207
+ validations.append(namespace["validation"])
4208
+
4209
+ # Also look for any other validation objects
4210
+ for key, value in namespace.items():
4211
+ if (
4212
+ key != "validation"
4213
+ and hasattr(value, "interrogate")
4214
+ and hasattr(value, "validation_info")
4215
+ ):
4216
+ validations.append(value)
4217
+ # Also check if it's a Validate object that has been interrogated
4218
+ elif key != "validation" and str(type(value)).find("Validate") != -1:
4219
+ validations.append(value)
4220
+
4221
+ if not validations:
4222
+ raise ValueError(
4223
+ "No validation objects found in script. "
4224
+ "Script should create Validate objects and call .interrogate() on them."
4225
+ )
3558
4226
 
3559
4227
  console.print(f"[green]✓[/green] Found {len(validations)} validation object(s)")
3560
4228
 
3561
- # Implement automatic data replacement for Validate objects if --data was provided
3562
- if cli_data is not None:
3563
- # Check if we have multiple validations (this is not supported)
4229
+ # Implement automatic data replacement for Python scripts only (YAML configs handle this differently)
4230
+ if cli_data is not None and is_python_file:
4231
+ # Check if we have multiple validations (this is not supported for Python scripts)
3564
4232
  if len(validations) > 1:
3565
4233
  console.print(
3566
- f"[red]Error: Found {len(validations)} validation objects in the script.[/red]"
4234
+ f"[red]Error: Found {len(validations)} validation objects in the Python script.[/red]"
3567
4235
  )
3568
4236
  console.print(
3569
4237
  "[yellow]The --data option replaces data in ALL validation objects,[/yellow]"
@@ -3902,3 +4570,768 @@ def _format_missing_percentage(value: float) -> str:
3902
4570
  return ">99%" # More than 99%
3903
4571
  else:
3904
4572
  return f"{int(round(value))}%" # Round to nearest integer with % sign
4573
+
4574
+
4575
def _pl_report(pipe: bool, plain_text: str, rich_text: str | None = None) -> None:
    """Emit a diagnostic for `pb pl` without corrupting piped output.

    Parameters
    ----------
    pipe
        Whether `--pipe` is active. When true the message goes to stderr as plain text, because
        stdout is reserved for the temp-file path handed to the next `pb` command and the built-in
        `print` does not interpret Rich markup.
    plain_text
        Markup-free message used on stderr (and on the console if `rich_text` is omitted).
    rich_text
        Optional Rich-markup variant printed on the console when not piping.
    """
    if pipe:
        print(plain_text, file=sys.stderr)
    else:
        console.print(plain_text if rich_text is None else rich_text)


def _pl_find_result(namespace: dict[str, Any]) -> Any:
    """Pick the most likely result object out of a namespace populated by `exec()`.

    The lookup order is: well-known variable names, then any public name bound to an object that
    exposes `collect` or `columns`, then the last public variable the user code defined. Returns
    `None` when nothing suitable is found.
    """
    result = None

    # 1. Conventional result names, in priority order
    for var_name in ("result", "df", "data", "table", "output"):
        if var_name in namespace:
            result = namespace[var_name]
            break

    # 2. Anything that looks like a DataFrame/LazyFrame
    if result is None:
        for key, value in namespace.items():
            if (hasattr(value, "collect") or hasattr(value, "columns")) and not key.startswith(
                "_"
            ):
                result = value
                break

    # 3. Heuristic: the last variable added by the user code (dicts keep insertion order)
    if result is None:
        user_vars = {
            k: v
            for k, v in namespace.items()
            if k not in ["pl", "polars", "__builtins__"] and not k.startswith("_")
        }
        if user_vars:
            result = user_vars[list(user_vars)[-1]]

    return result


@cli.command()
@click.argument("polars_expression", type=str, required=False)
@click.option("--edit", "-e", is_flag=True, help="Open editor for multi-line input")
@click.option("--file", "-f", type=click.Path(exists=True), help="Read query from file")
@click.option(
    "--editor", help="Editor to use for --edit mode (overrides $EDITOR and auto-detection)"
)
@click.option(
    "--output-format",
    "-o",
    type=click.Choice(["preview", "scan", "missing", "info"]),
    default="preview",
    help="Output format for the result",
)
@click.option("--preview-head", default=5, help="Number of head rows for preview")
@click.option("--preview-tail", default=5, help="Number of tail rows for preview")
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
@click.option(
    "--pipe", is_flag=True, help="Output data in a format suitable for piping to other pb commands"
)
@click.option(
    "--pipe-format",
    type=click.Choice(["parquet", "csv"]),
    default="parquet",
    help="Format for piped output (default: parquet)",
)
def pl(
    polars_expression: str | None,
    edit: bool,
    file: str | None,
    editor: str | None,
    output_format: str,
    preview_head: int,
    preview_tail: int,
    output_html: str | None,
    pipe: bool,
    pipe_format: str,
):
    """
    Execute Polars expressions and display results.

    Execute Polars DataFrame operations from the command line and display
    the results using Pointblank's visualization tools.

    POLARS_EXPRESSION should be a valid Polars expression that returns a DataFrame.
    The 'pl' module is automatically imported and available.

    Examples:

    \b
    # Direct expression
    pb pl "pl.read_csv('data.csv')"
    pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
    pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"

    # Multi-line with editor (supports multiple statements)
    pb pl --edit

    # Multi-statement code example in editor:
    # csv = pl.read_csv('data.csv')
    # result = csv.select(['name', 'age']).filter(pl.col('age') > 25)

    # Multi-line with a specific editor
    pb pl --edit --editor nano
    pb pl --edit --editor code
    pb pl --edit --editor micro

    # From file
    pb pl --file query.py

    # Piping to other pb commands
    pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)" --pipe | pb validate --check rows-distinct
    pb pl --edit --pipe | pb preview --head 10
    pb pl --edit --pipe | pb scan --output-html report.html
    pb pl --edit --pipe | pb missing --output-html missing_report.html

    Use --output-format to change how results are displayed:

    \b
    pb pl "pl.read_csv('data.csv')" --output-format scan
    pb pl "pl.read_csv('data.csv')" --output-format missing
    pb pl "pl.read_csv('data.csv')" --output-format info

    Note: For multi-statement code, assign your final result to a variable like
    'result', 'df', 'data', or ensure it's the last expression.
    """
    try:
        # Check if Polars is available
        if not _is_lib_present("polars"):
            _pl_report(
                pipe,
                "Error: Polars is not installed",
                "[red]Error:[/red] Polars is not installed",
            )
            _pl_report(pipe, "\nThe 'pb pl' command requires Polars to be installed.")
            _pl_report(
                pipe,
                "Install it with: pip install polars",
                "Install it with: [cyan]pip install polars[/cyan]",
            )
            _pl_report(
                pipe,
                "\nTo check all dependency status, run: pb requirements",
                "\nTo check all dependency status, run: [cyan]pb requirements[/cyan]",
            )
            sys.exit(1)

        # Local name `pl` deliberately shadows this command function inside its own body
        import polars as pl

        # Determine the source of the query: --file, --edit, positional argument, then stdin
        query_code = None

        if file:
            query_code = Path(file).read_text()
        elif edit:
            chosen_editor = editor or _get_best_editor()
            _pl_report(
                pipe,
                f"Using editor: {chosen_editor}",
                f"[dim]Using editor: {chosen_editor}[/dim]",
            )

            if chosen_editor == "code":
                # VS Code needs `--wait` handling, which lives in a dedicated helper
                query_code = _edit_with_vscode()
            else:
                # Use click.edit() for terminal editors
                query_code = click.edit(
                    "# Enter your Polars query here\n"
                    "# Example:\n"
                    "# pl.read_csv('data.csv').select(['name', 'age'])\n"
                    "# pl.read_csv('data.csv').filter(pl.col('age') > 25)\n"
                    "# \n"
                    "# The result should be a Polars DataFrame or LazyFrame\n"
                    "\n",
                    editor=chosen_editor,
                )

            if query_code is None:
                _pl_report(pipe, "No query entered", "[yellow]No query entered[/yellow]")
                sys.exit(1)
        elif polars_expression:
            query_code = polars_expression
        elif not sys.stdin.isatty():
            # Query text is being piped in on stdin
            query_code = sys.stdin.read().strip()
        else:
            # No input provided and stdin is a terminal - show concise help
            _show_concise_help("pl", None)
            return

        if not query_code or not query_code.strip():
            _pl_report(pipe, "Error: Empty query", "[red]Error:[/red] Empty query")
            sys.exit(1)

        # Execute the query.
        # NOTE: this runs arbitrary user-supplied Python via exec()/eval() by design; the text
        # comes from the invoking user's own command line, editor, file or stdin.
        with console.status("[bold green]Executing Polars expression..."):
            namespace = {
                "pl": pl,
                "polars": pl,
                "__builtins__": __builtins__,
            }

            try:
                # Heuristic: a newline or a statement keyword means "multiple statements"
                statement_markers = [
                    " = ",
                    "import",
                    "for ",
                    "if ",
                    "def ",
                    "class ",
                    "with ",
                    "try:",
                ]
                is_multi_statement = "\n" in query_code.strip() or any(
                    keyword in query_code for keyword in statement_markers
                )

                if is_multi_statement:
                    exec(query_code, namespace)
                    result = _pl_find_result(namespace)

                    if result is None:
                        _pl_report(
                            pipe,
                            "Error: Could not find result variable",
                            "[red]Error:[/red] Could not find result variable",
                        )
                        _pl_report(
                            pipe,
                            "Assign your final result to a variable like 'result', 'df', or 'data'",
                            "[dim]Assign your final result to a variable like 'result', 'df', or 'data'[/dim]",
                        )
                        _pl_report(
                            pipe,
                            "Or ensure your last line returns a DataFrame",
                            "[dim]Or ensure your last line returns a DataFrame[/dim]",
                        )
                        sys.exit(1)
                else:
                    # Single expression - use eval()
                    result = eval(query_code, namespace)

                # Validate result: it must look like a DataFrame (columns) or LazyFrame (collect)
                if not hasattr(result, "collect") and not hasattr(result, "columns"):
                    _pl_report(
                        pipe,
                        "Error: Expression must return a Polars DataFrame or LazyFrame",
                        "[red]Error:[/red] Expression must return a Polars DataFrame or LazyFrame",
                    )
                    _pl_report(
                        pipe,
                        f"Got: {type(result)}",
                        f"[dim]Got: {type(result)}[/dim]",
                    )
                    sys.exit(1)

            except Exception as e:
                is_multi_line = "\n" in query_code.strip()

                if pipe:
                    # When piping, send errors to stderr so they don't interfere with the pipe
                    print(f"Error executing Polars expression: {e}", file=sys.stderr)
                    print(file=sys.stderr)
                    if is_multi_line:
                        print(f"Expression(s) provided:\n{query_code}", file=sys.stderr)
                    else:
                        print(f"Expression provided: {query_code}", file=sys.stderr)
                else:
                    # Normal error handling when not piping
                    console.print(f"[red]Error executing Polars expression:[/red] {e}")
                    console.print()

                    # Show the offending code in a panel for better readability
                    console.print(
                        Panel(
                            query_code,
                            title=(
                                "Expression(s) provided" if is_multi_line else "Expression provided"
                            ),
                            border_style="red",
                            expand=False,
                            title_align="left",
                        )
                    )

                sys.exit(1)

        # Only print success message when not piping (so it doesn't interfere with pipe output)
        if not pipe:
            console.print("[green]✓[/green] Polars expression executed successfully")

        # Process output; `output_format` is restricted by click.Choice to the four cases below
        if pipe:
            _handle_pl_pipe(result, pipe_format)
        elif output_format == "preview":
            _handle_pl_preview(result, preview_head, preview_tail, output_html)
        elif output_format == "scan":
            _handle_pl_scan(result, query_code, output_html)
        elif output_format == "missing":
            _handle_pl_missing(result, query_code, output_html)
        elif output_format == "info":
            _handle_pl_info(result, query_code, output_html)

    except Exception as e:
        _pl_report(pipe, f"Error: {e}", f"[red]Error:[/red] {e}")
        sys.exit(1)
4891
+
4892
+
4893
def _handle_pl_preview(result: Any, head: int, tail: int, output_html: str | None) -> None:
    """Show (or save as HTML) a head/tail preview of a `pb pl` result.

    Parameters
    ----------
    result
        The table produced by the user's Polars code.
    head, tail
        Number of leading and trailing rows passed to `pb.preview()`.
    output_html
        When given, the preview is written to this path as raw HTML instead of being printed.

    Any failure is reported on the console and terminates the process with exit status 1.
    """
    try:
        preview_tbl = pb.preview(data=result, n_head=head, n_tail=tail, show_row_numbers=True)

        if not output_html:
            # Terminal output: try the metadata-enriched rendering first
            try:
                n_rows = pb.get_row_count(result)
                n_cols = pb.get_column_count(result)
                tbl_kind = _get_tbl_type(result)

                _rich_print_gt_table(
                    preview_tbl,
                    {
                        "total_rows": n_rows,
                        "total_columns": n_cols,
                        "head_rows": head,
                        "tail_rows": tail,
                        "is_complete": n_rows <= (head + tail),
                        "source_type": "Polars expression",
                        "table_type": tbl_kind,
                    },
                )
            except Exception:
                # Best effort: fall back to the plain table when metadata is unavailable
                _rich_print_gt_table(preview_tbl)
            return

        Path(output_html).write_text(preview_tbl.as_raw_html(), encoding="utf-8")
        console.print(f"[green]✓[/green] HTML saved to: {output_html}")

    except Exception as e:
        console.print(f"[red]Error creating preview:[/red] {e}")
        sys.exit(1)
4933
+
4934
+
4935
def _handle_pl_scan(result: Any, expression: str, output_html: str | None) -> None:
    """Show (or save as HTML) a column-summary scan of a `pb pl` result.

    Parameters
    ----------
    result
        The table produced by the user's Polars code.
    expression
        The query text; passed to the terminal renderer as the data-source label.
    output_html
        When given, the scan report is written to this path instead of being printed.

    A failure to build the scan terminates the process with exit status 1; a failure while
    rendering it in the terminal only prints a warning.
    """
    try:
        summary_tbl = pb.col_summary_tbl(data=result)

        if not output_html:
            try:
                n_rows = pb.get_row_count(result)
                n_cols = pb.get_column_count(result)
                tbl_kind = _get_tbl_type(result)

                _rich_print_scan_table(
                    summary_tbl, expression, "Polars expression", tbl_kind, n_rows, n_cols
                )
            except Exception as e:
                console.print(f"[yellow]Could not display scan summary: {e}[/yellow]")
            return

        Path(output_html).write_text(summary_tbl.as_raw_html(), encoding="utf-8")
        console.print(f"[green]✓[/green] Data scan report saved to: {output_html}")

    except Exception as e:
        console.print(f"[red]Error creating scan:[/red] {e}")
        sys.exit(1)
4965
+
4966
+
4967
def _handle_pl_missing(result: Any, expression: str, output_html: str | None) -> None:
    """Show (or save as HTML) the missing-values report for a `pb pl` result.

    Parameters
    ----------
    result
        The table produced by the user's Polars code.
    expression
        The query text. Accepted for signature parity with the other `_handle_pl_*` helpers;
        this function does not use it.
    output_html
        When given, the report is written to this path instead of being printed.

    Any failure is reported on the console and terminates the process with exit status 1.
    """
    try:
        report = pb.missing_vals_tbl(data=result)

        if not output_html:
            _rich_print_missing_table(report, result)
            return

        Path(output_html).write_text(report.as_raw_html(), encoding="utf-8")
        console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")

    except Exception as e:
        console.print(f"[red]Error creating missing values report:[/red] {e}")
        sys.exit(1)
4982
+
4983
+
4984
def _handle_pl_info(result: Any, expression: str, output_html: str | None) -> None:
    """Show (or save as HTML) basic information about a `pb pl` result.

    Reports the table type, row/column counts and per-column dtypes.

    Parameters
    ----------
    result
        The table produced by the user's Polars code.
    expression
        The query text, echoed in the report.
    output_html
        When given, a small standalone HTML page is written to this path; otherwise a Rich table
        plus the first ten column dtypes are printed on the console.

    Any failure is reported on the console and terminates the process with exit status 1.
    """
    try:
        # Get basic info
        tbl_type = _get_tbl_type(result)
        row_count = pb.get_row_count(result)
        col_count = pb.get_column_count(result)

        # Get column names and types
        if hasattr(result, "columns"):
            columns = list(result.columns)
        elif hasattr(result, "schema"):
            columns = list(result.schema.names)
        else:
            columns = []

        dtypes_dict = _get_column_dtypes(result, columns)

        if output_html:
            from html import escape

            # Everything interpolated below is user-controlled text (the expression routinely
            # contains '<', '>' or '&'), so it must be escaped to keep the page well-formed.
            column_items = "".join(
                f"<li>{escape(str(col))}: {escape(str(dtypes_dict.get(col, '?')))}</li>"
                for col in columns
            )

            # Create a simple HTML info page
            # TODO: Implement an improved version of this in the Python API and then
            # use that here
            html_content = f"""
            <html><body>
            <h2>Polars Expression Info</h2>
            <p><strong>Expression:</strong> {escape(expression)}</p>
            <p><strong>Table Type:</strong> {escape(str(tbl_type))}</p>
            <p><strong>Rows:</strong> {row_count:,}</p>
            <p><strong>Columns:</strong> {col_count:,}</p>
            <h3>Column Details</h3>
            <ul>
            {column_items}
            </ul>
            </body></html>
            """
            Path(output_html).write_text(html_content, encoding="utf-8")
            console.print(f"[green]✓[/green] HTML info saved to: {output_html}")
        else:
            # Display info table
            from rich.box import SIMPLE_HEAD

            info_table = Table(
                title="Polars Expression Info",
                show_header=True,
                header_style="bold magenta",
                box=SIMPLE_HEAD,
                title_style="bold cyan",
                title_justify="left",
            )
            info_table.add_column("Property", style="cyan", no_wrap=True)
            info_table.add_column("Value", style="green")

            info_table.add_row("Expression", expression)
            # Capitalize "polars" to "Polars" for consistency with pb info command
            display_tbl_type = (
                tbl_type.replace("polars", "Polars") if "polars" in tbl_type.lower() else tbl_type
            )
            info_table.add_row("Table Type", display_tbl_type)
            info_table.add_row("Rows", f"{row_count:,}")
            info_table.add_row("Columns", f"{col_count:,}")

            console.print()
            console.print(info_table)

            # Show column details
            if columns:
                console.print("\n[bold cyan]Column Details:[/bold cyan]")
                for col in columns[:10]:  # Show first 10 columns
                    dtype = dtypes_dict.get(col, "?")
                    console.print(f" • {col}: [yellow]{dtype}[/yellow]")

                if len(columns) > 10:
                    console.print(f" ... and {len(columns) - 10} more columns")

    except Exception as e:
        console.print(f"[red]Error creating info:[/red] {e}")
        sys.exit(1)
5061
+
5062
+
5063
def _handle_pl_pipe(result: Any, pipe_format: str) -> None:
    """Write a `pb pl` result to a temp file and print its path for the next `pb` command.

    Parameters
    ----------
    result
        The table to hand over. Objects with `write_parquet`/`write_csv` (Polars-style) or
        `to_parquet`/`to_csv` (pandas-style) are written directly; a lazy object that only offers
        `collect()` is collected first; anything else is converted through `pandas.DataFrame`.
    pipe_format
        `"parquet"` selects Parquet output; any other value selects CSV.

    On success the temp-file path (prefix `pb_pipe_`) is the only thing printed to stdout. On
    failure the partially written file is removed, a plain-text error is written to stderr and the
    process exits with status 1.
    """
    import tempfile

    temp_path = None
    try:
        # A LazyFrame has collect() but no eager writer; materialize it so it is not pushed
        # through the pandas fallback below.
        writer_name = "write_parquet" if pipe_format == "parquet" else "write_csv"
        if not hasattr(result, writer_name) and hasattr(result, "collect"):
            result = result.collect()

        # Reserve a uniquely named file, then close the handle before writing by path
        # (writing to a still-open NamedTemporaryFile fails on Windows).
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=f".{pipe_format}", prefix="pb_pipe_", delete=False
        ) as temp_file:
            temp_path = temp_file.name

        if pipe_format == "parquet":
            if hasattr(result, "write_parquet"):
                # Polars
                result.write_parquet(temp_path)
            elif hasattr(result, "to_parquet"):
                # Pandas
                result.to_parquet(temp_path)
            else:
                # Convert to pandas and write
                import pandas as pd

                pd.DataFrame(result).to_parquet(temp_path)
        else:  # CSV
            if hasattr(result, "write_csv"):
                # Polars
                result.write_csv(temp_path)
            elif hasattr(result, "to_csv"):
                # Pandas
                result.to_csv(temp_path, index=False)
            else:
                # Convert to pandas and write
                import pandas as pd

                pd.DataFrame(result).to_csv(temp_path, index=False)

        # Output the temporary file path to stdout for the next command
        print(temp_path)

    except Exception as e:
        # Do not leave a useless temp file behind
        if temp_path is not None:
            Path(temp_path).unlink(missing_ok=True)
        # Plain text: the built-in print() would emit Rich markup tags literally
        print(f"Error creating pipe output: {e}", file=sys.stderr)
        sys.exit(1)
5109
+
5110
+
5111
def _get_best_editor() -> str:
    """Pick the editor command to use for `--edit` mode.

    Returns
    -------
    str
        The value of `$EDITOR` when it is set to a non-blank string; otherwise the first of
        `code`, `micro`, `nano`, `vim`, `vi` found on `PATH`; otherwise `"nano"`.
    """
    # An empty or whitespace-only $EDITOR is treated as unset: returning it verbatim would hand
    # the caller an unusable editor command.
    env_editor = os.environ.get("EDITOR", "").strip()
    if env_editor:
        return env_editor

    # Common editors in order of preference
    candidates = (
        "code",  # VS Code
        "micro",  # Modern terminal editor
        "nano",  # User-friendly terminal editor
        "vim",  # Vim
        "vi",  # Vi (fallback)
    )

    # Ultimate fallback is "nano" when nothing is found on PATH
    return next((name for name in candidates if shutil.which(name)), "nano")
5133
+
5134
+
5135
def _edit_with_vscode() -> str | None:
    """Let the user write a Polars query in VS Code and return the cleaned-up code.

    A temporary `.py` file pre-filled with instructions is opened with `code --wait` (five-minute
    timeout). After the editor returns, blank lines, `#` comment lines and `import polars...`
    lines are dropped from the file's content.

    Returns
    -------
    str | None
        The remaining lines joined with newlines, or `None` if nothing remains, VS Code timed
        out, or the `code` executable could not be launched. The temp file is always removed.
    """
    import subprocess
    import tempfile

    template = (
        "import polars as pl\n",
        "\n",
        "# Enter your Polars query here\n",
        "# Examples:\n",
        "# \n",
        "# Single expression:\n",
        "# pl.read_csv('data.csv').select(['name', 'age'])\n",
        "# \n",
        "# Multiple statements:\n",
        "# csv = pl.read_csv('data.csv')\n",
        "# result = csv.select(['name', 'age']).filter(pl.col('age') > 25)\n",
        "# \n",
        "# For multi-statement code, assign your final result to a variable\n",
        "# like 'result', 'df', 'data', or just ensure it's the last line\n",
        "# \n",
        "# Save and then close this file in VS Code to execute the query\n",
        "\n",
    )

    # Seed a scratch Python file with the instructions above
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", prefix="pb_pl_", delete=False
    ) as handle:
        handle.write("".join(template))
        temp_file = handle.name

    try:
        # `--wait` blocks until the file's tab is closed in VS Code
        proc = subprocess.run(
            ["code", "--wait", temp_file], capture_output=True, text=True, timeout=300
        )

        if proc.returncode != 0:
            console.print(f"[yellow]VS Code exited with code {proc.returncode}[/yellow]")

        edited = Path(temp_file).read_text()

        # Keep only real code: no blank lines, no comments, no polars import
        kept = [
            raw
            for raw in edited.split("\n")
            if raw.strip() and not raw.strip().startswith(("#", "import polars"))
        ]

        if not kept:
            return None
        return "\n".join(kept)

    except subprocess.TimeoutExpired:
        console.print("[red]Timeout:[/red] VS Code took too long to respond")
        return None
    except subprocess.CalledProcessError as e:
        console.print(f"[red]Error:[/red] Could not open VS Code: {e}")
        return None
    except FileNotFoundError:
        console.print("[red]Error:[/red] VS Code not found in PATH")
        return None
    finally:
        # Clean up the scratch file on every path
        Path(temp_file).unlink(missing_ok=True)
5200
+
5201
+
5202
def _show_concise_help(command_name: str, ctx: click.Context) -> None:
    """Print a short usage summary for `command_name`, then exit with status 1.

    Parameters
    ----------
    command_name
        One of "info", "preview", "scan", "missing", "validate", "run", "make-template" or "pl".
        Any other value prints nothing.
    ctx
        The Click context used to exit. Callers in this file also pass `None`, in which case
        `sys.exit(1)` is used instead.
    """
    usage_header = "[bold yellow]Usage:[/bold yellow]"
    options_header = "[bold yellow]Key Options:[/bold yellow]"

    # One entry per console.print() call; None stands for an empty console.print()
    help_pages = {
        "info": (
            "[bold cyan]pb info[/bold cyan] - Display information about a data source",
            None,
            usage_header,
            " pb info data.csv",
            " pb info small_table",
            None,
            "[dim]Shows table type, dimensions, column names, and data types[/dim]",
            None,
            "[dim]Use [bold]pb info --help[/bold] for complete options and examples[/dim]",
        ),
        "preview": (
            "[bold cyan]pb preview[/bold cyan] - Preview a data table showing head and tail rows",
            None,
            usage_header,
            " pb preview data.csv",
            " pb preview data.parquet --head 10 --tail 5",
            None,
            options_header,
            " --head N Number of rows from the top (default: 5)",
            " --tail N Number of rows from the bottom (default: 5)",
            " --columns LIST Comma-separated list of columns to display",
            " --output-html Save HTML output to file",
            None,
            "[dim]Use [bold]pb preview --help[/bold] for complete options and examples[/dim]",
        ),
        "scan": (
            "[bold cyan]pb scan[/bold cyan] - Generate a comprehensive data profile report",
            None,
            usage_header,
            " pb scan data.csv",
            " pb scan data.parquet --output-html report.html",
            None,
            options_header,
            " --output-html Save HTML scan report to file",
            " --columns LIST Comma-separated list of columns to scan",
            None,
            "[dim]Use [bold]pb scan --help[/bold] for complete options and examples[/dim]",
        ),
        "missing": (
            "[bold cyan]pb missing[/bold cyan] - Generate a missing values report",
            None,
            usage_header,
            " pb missing data.csv",
            " pb missing data.parquet --output-html missing_report.html",
            None,
            options_header,
            " --output-html Save HTML output to file",
            None,
            "[dim]Use [bold]pb missing --help[/bold] for complete options and examples[/dim]",
        ),
        "validate": (
            "[bold cyan]pb validate[/bold cyan] - Perform data validation checks",
            None,
            usage_header,
            " pb validate data.csv",
            " pb validate data.csv --check col-vals-not-null --column email",
            None,
            options_header,
            " --check TYPE Validation check type (default: rows-distinct)",
            " --column COL Column name for column-specific checks",
            " --show-extract Show failing rows if validation fails",
            " --list-checks List all available validation checks",
            None,
            "[dim]Use [bold]pb validate --help[/bold] for complete options and examples[/dim]",
        ),
        "run": (
            "[bold cyan]pb run[/bold cyan] - Run a Pointblank validation script",
            None,
            usage_header,
            " pb run validation_script.py",
            " pb run validation_script.py --data data.csv",
            None,
            options_header,
            " --data SOURCE Replace data source in validation objects",
            " --output-html Save HTML validation report to file",
            " --show-extract Show failing rows if validation fails",
            " --fail-on LEVEL Exit with error on critical/error/warning/any",
            None,
            "[dim]Use [bold]pb run --help[/bold] for complete options and examples[/dim]",
        ),
        "make-template": (
            "[bold cyan]pb make-template[/bold cyan] - Create a validation script or YAML template",
            None,
            usage_header,
            " pb make-template my_validation.py # Python script template",
            " pb make-template my_validation.yaml # YAML config template",
            None,
            "[dim]Creates sample templates with validation examples[/dim]",
            "[dim]Edit the template and run with [bold]pb run[/bold][/dim]",
            None,
            "[dim]Use [bold]pb make-template --help[/bold] for complete options and examples[/dim]",
        ),
        "pl": (
            "[bold cyan]pb pl[/bold cyan] - Execute Polars expressions and display results",
            None,
            usage_header,
            " pb pl \"pl.read_csv('data.csv')\"",
            " pb pl --edit",
            None,
            options_header,
            " --edit Open editor for multi-line input",
            " --file FILE Read query from file",
            " --output-format Output format: preview, scan, missing, info",
            " --pipe Output for piping to other pb commands",
            None,
            "[dim]Use [bold]pb pl --help[/bold] for complete options and examples[/dim]",
        ),
    }

    for entry in help_pages.get(command_name, ()):
        if entry is None:
            console.print()
        else:
            console.print(entry)

    # Exit through Click when a context is available, otherwise directly
    if ctx is None:
        sys.exit(1)
    ctx.exit(1)