tab-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tab_cli/__init__.py ADDED
@@ -0,0 +1,3 @@
1
"""Tab CLI - A CLI tool for tabular data."""

# Keep in sync with the distribution version in pyproject.toml.
# The released wheel is 0.1.1, but this constant still said 0.1.0.
__version__ = "0.1.1"
tab_cli/cli.py ADDED
@@ -0,0 +1,171 @@
1
+ """Main CLI entry point using Typer."""
2
+
3
+ import sys
4
+ from typing import Annotated, Optional
5
+
6
+ from loguru import logger
7
+ import polars as pl
8
+ import typer
9
+ from rich.console import Console
10
+ from rich.logging import RichHandler
11
+
12
+ from tab_cli import config
13
+ from tab_cli.handlers import TableWriter, infer_reader, infer_writer
14
+
15
+ app = typer.Typer(
16
+ help="A CLI tool for viewing and manipulating tabular data.",
17
+ no_args_is_help=True,
18
+ )
19
+
20
+
21
@app.callback()
def main_callback(
    az_url_authority_is_account: Annotated[
        bool,
        typer.Option(
            "--az-url-authority-is-account",
            help="Interpret az:// URL authority as storage account name instead of container name",
        ),
    ] = False,
    log_level: Annotated[
        str,
        typer.Option("--log-level", help="Log level from {DEBUG, INFO, WARNING, ERROR, CRITICAL}"),
    ] = "INFO",
) -> None:
    """Global options for tab_cli CLI."""
    config.config.az_url_authority_is_account = az_url_authority_is_account
    # Swap loguru's default sink for a Rich-backed one at the requested level.
    logger.remove()
    rich_sink = RichHandler(
        rich_tracebacks=True,
        tracebacks_show_locals=True,
        markup=True,
    )
    logger.add(rich_sink, format="{message}", level=log_level.upper())
47
+
48
+
49
def _output(
    lf: pl.LazyFrame,
    limit: int | None,
    skip: int,
    output: str | None,
) -> None:
    """Apply skip/limit to *lf* and stream it to stdout via the inferred writer.

    When neither a limit nor an output format is given (interactive CLI
    display), the result is capped at a default of 20 rows and the writer
    is told whether rows were actually cut off.

    Args:
        lf: The lazy frame to emit.
        limit: Maximum number of rows, or None for the interactive default.
        skip: Number of leading rows to drop.
        output: Output format name, or None for the CLI table writer.
    """
    show_truncation = limit is None and output is None
    actual_limit = 20 if show_truncation else limit

    truncated = False
    if show_truncation:
        assert actual_limit is not None  # internal invariant, set just above
        # Fetch one extra row so we can tell whether the data was truncated.
        lf = lf.slice(skip, length=actual_limit + 1)
        df = lf.collect()
        truncated = len(df) > actual_limit
        if truncated:
            df = df.head(actual_limit)
        lf = df.lazy()
    elif skip > 0 or actual_limit is not None:
        lf = lf.slice(skip, length=actual_limit)

    # `truncated` can only be True when show_truncation is True, so the
    # previous `show_truncation and truncated` guard was redundant.
    writer = infer_writer(output, truncated=truncated)

    for chunk in writer.write(lf):
        sys.stdout.buffer.write(chunk)
75
+
76
+
77
@app.command()
def view(
    path: Annotated[str, typer.Argument(help="Path to the data file or directory")],
    limit: Annotated[Optional[int], typer.Option("--limit", help="Maximum number of rows to display")] = None,
    skip: Annotated[int, typer.Option("--skip", help="Number of rows to skip")] = 0,
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
    output: Annotated[Optional[str], typer.Option("-o", "--output-format", help="Output format")] = None,
) -> None:
    """View tabular data from a file."""
    # Scan lazily and hand off slicing/formatting to the shared output path.
    data = infer_reader(path, format=input).read(path)
    _output(data, limit=limit, skip=skip, output=output)
89
+
90
@app.command()
def schema(
    path: Annotated[str, typer.Argument(help="Path to the data file or directory")],
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
) -> None:
    """Display the schema of a tabular data file."""
    reader = infer_reader(path, format=input)
    # force_terminal keeps Rich styling even when stdout is piped.
    Console(force_terminal=True).print(reader.schema(path))
100
+
101
+
102
@app.command()
def sql(
    query: Annotated[str, typer.Argument(help="SQL query to execute (table is available as 't')")],
    path: Annotated[str, typer.Argument(help="Path to the data file or directory")],
    limit: Annotated[Optional[int], typer.Option("--limit", help="Maximum number of rows to display")] = None,
    skip: Annotated[int, typer.Option("--skip", help="Number of rows to skip")] = 0,
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
    output: Annotated[Optional[str], typer.Option("-o", "--output-format", help="Output format")] = None,
) -> None:
    """Run a SQL query on tabular data. The table is available as 't'."""
    source = infer_reader(path, format=input).read(path)
    # Register the scanned frame under the alias 't'; keep execution lazy so
    # _output can apply skip/limit before collecting.
    result = pl.SQLContext(t=source, eager=False).execute(query)
    _output(result, limit=limit, skip=skip, output=output)
117
+
118
+
119
@app.command()
def summary(
    path: Annotated[str, typer.Argument(help="Path to the data file or directory")],
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
) -> None:
    """Display summary information about a tabular data file."""
    reader = infer_reader(path, format=input)
    # force_terminal keeps Rich styling even when stdout is piped.
    Console(force_terminal=True).print(reader.summary(path))
129
+
130
+
131
@app.command()
def convert(
    src: Annotated[str, typer.Argument(help="Path to the source file or directory")],
    dst: Annotated[str, typer.Argument(help="Path to the destination file or directory")],
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
    output: Annotated[Optional[str], typer.Option("-o", "--output-format", help="Output format")] = None,
    num_partitions: Annotated[Optional[int], typer.Option("-n", "--num-partitions", help="Number of output partitions")] = None,
) -> None:
    """Convert tabular data from one format to another."""
    reader = infer_reader(src, format=input)
    # Determine output format: use -o if specified, else inherit from input
    if output is not None:
        writer = infer_writer(format=output)
    elif input is not None:
        writer = infer_writer(format=input)
    else:
        # Fall back to the inferred reader, which for file-format readers is
        # expected to also implement the writer interface.
        writer = reader
    # Validate explicitly instead of `assert`, which is stripped under
    # `python -O` and would let a non-writer slip through.
    if not isinstance(writer, TableWriter):
        raise typer.BadParameter(
            "Cannot infer an output format; pass one explicitly with -o/--output-format."
        )
    lf = reader.read(src)
    writer.write_to_path(lf, dst, partitions=num_partitions)
151
+
152
+
153
@app.command()
def cat(
    paths: Annotated[list[str], typer.Argument(help="Paths to the data files or directories")],
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
    output: Annotated[Optional[str], typer.Option("-o", "--output-format", help="Output format")] = None,
) -> None:
    """Concatenate tabular data from multiple files."""
    # The format is inferred from the first path and reused for every input.
    reader = infer_reader(paths[0], format=input)
    frames = [reader.read(p) for p in paths]
    _output(pl.concat(frames, how="vertical"), limit=None, skip=0, output=output)
164
+
165
+
166
def main() -> None:
    """Console-script entry point: dispatch to the Typer application."""
    app()


if __name__ == "__main__":
    main()
tab_cli/config.py ADDED
@@ -0,0 +1,14 @@
1
+ """Global configuration for the tab-cli package."""
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
@dataclass
class Config:
    """Global configuration settings."""

    # When True, the authority part of an az:// URL names the storage
    # account rather than the container.
    az_url_authority_is_account: bool = False


# Shared, mutable config instance; the CLI callback populates it at startup.
config: Config = Config()
@@ -0,0 +1,15 @@
1
+ """Format handlers for reading and writing tabular data."""
2
+
3
+ from tab_cli.formats.base import FormatHandler
4
+ from tab_cli.formats.avro import AvroFormat
5
+ from tab_cli.formats.csv import CsvFormat
6
+ from tab_cli.formats.jsonl import JsonlFormat
7
+ from tab_cli.formats.parquet import ParquetFormat
8
+
9
+ __all__ = [
10
+ "FormatHandler",
11
+ "AvroFormat",
12
+ "CsvFormat",
13
+ "JsonlFormat",
14
+ "ParquetFormat",
15
+ ]
@@ -0,0 +1,47 @@
1
+ """Avro format handler using polars-fastavro."""
2
+
3
+ from collections.abc import Iterable
4
+ from io import BytesIO
5
+ from typing import BinaryIO
6
+
7
+ import polars as pl
8
+ import polars_fastavro
9
+
10
+ from tab_cli.formats.base import FormatHandler
11
+
12
+
13
class AvroFormat(FormatHandler):
    """Handler for Avro files, backed by the polars-fastavro plugin."""

    def extension(self) -> str:
        return ".avro"

    def supports_glob(self) -> bool:
        # polars_fastavro doesn't support glob patterns
        return False

    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        # polars_fastavro doesn't support storage_options, so cloud URIs
        # need to be accessed through fsspec first
        return polars_fastavro.scan_avro(url)

    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        return polars_fastavro.read_avro(stream)

    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        # storage_options is accepted for interface compatibility but ignored.
        schema = polars_fastavro.scan_avro(url).collect_schema()
        return list(schema.items())

    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        # storage_options is accepted for interface compatibility but ignored.
        counted = polars_fastavro.scan_avro(url).select(pl.len()).collect()
        return counted.item()

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        # Avro output is not streamed: the frame is materialized and
        # serialized into one in-memory buffer, yielded as a single chunk.
        buffer = BytesIO()
        polars_fastavro.write_avro(lf.collect(), buffer)
        yield buffer.getvalue()

    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        polars_fastavro.write_avro(lf.collect(), path)
@@ -0,0 +1,63 @@
1
+ """Base format handler interface."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Iterable
5
+ from typing import BinaryIO
6
+
7
+ import polars as pl
8
+
9
+
10
class FormatHandler(ABC):
    """Abstract interface for reading and writing one tabular file format."""

    @abstractmethod
    def extension(self) -> str:
        """Return the canonical file extension, including the dot (e.g. '.parquet')."""

    def supports_glob(self) -> bool:
        """Whether scan() accepts glob patterns.

        Formats with native Polars scanners (Parquet, CSV, JSONL) can expand
        globs and scan directories directly; others (Avro) require the caller
        to iterate files manually.
        """
        return False

    @abstractmethod
    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        """Lazily scan *url* (a local path or cloud URL).

        Args:
            url: The URL to scan from.
            storage_options: Optional storage options for cloud access.
        """

    @abstractmethod
    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        """Eagerly read an open binary stream into a DataFrame."""

    @abstractmethod
    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        """Return the schema as an ordered list of (column name, dtype) pairs."""

    @abstractmethod
    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        """Return the number of rows stored at *url*."""

    def extra_summary(self, url: str) -> dict[str, str | int | float] | None:
        """Return format-specific summary metadata, or None if there is none."""
        return None

    @abstractmethod
    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Serialize *lf* as a stream of byte chunks (for streaming output)."""

    @abstractmethod
    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        """Serialize *lf* into exactly one file at *path*."""
tab_cli/formats/csv.py ADDED
@@ -0,0 +1,45 @@
1
+ """CSV/TSV format handler."""
2
+
3
+ from collections.abc import Iterable
4
+ from io import BytesIO
5
+ from typing import BinaryIO
6
+
7
+ import polars as pl
8
+
9
+ from tab_cli.formats.base import FormatHandler
10
+
11
+
12
class CsvFormat(FormatHandler):
    """Handler for delimiter-separated files (CSV and TSV)."""

    def __init__(self, separator: str = ","):
        # Single-character field delimiter: "," for CSV, "\t" for TSV.
        self.separator = separator

    def extension(self) -> str:
        return ".csv" if self.separator == "," else ".tsv"

    def supports_glob(self) -> bool:
        return True

    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        return pl.scan_csv(url, separator=self.separator, storage_options=storage_options)

    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        return pl.read_csv(stream, separator=self.separator)

    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        lazy = self.scan(url, storage_options=storage_options)
        return list(lazy.collect_schema().items())

    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        lazy = self.scan(url, storage_options=storage_options)
        return lazy.select(pl.len()).collect().item()

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        # Emit the header only with the first batch so the concatenated
        # chunks form one valid CSV document.
        include_header = True
        for batch in lf.collect_batches():
            buffer = BytesIO()
            batch.write_csv(buffer, separator=self.separator, include_header=include_header)
            include_header = False
            yield buffer.getvalue()

    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        lf.sink_csv(path, separator=self.separator)
@@ -0,0 +1,41 @@
1
+ """JSONL (newline-delimited JSON) format handler."""
2
+
3
+ import json
4
+ from collections.abc import Iterable
5
+ from typing import BinaryIO
6
+
7
+ import polars as pl
8
+
9
+ from tab_cli.formats.base import FormatHandler
10
+
11
+
12
class JsonlFormat(FormatHandler):
    """Handler for newline-delimited JSON (JSONL / NDJSON) files."""

    def extension(self) -> str:
        return ".jsonl"

    def supports_glob(self) -> bool:
        return True

    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        return pl.scan_ndjson(url, storage_options=storage_options)

    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        return pl.read_ndjson(stream)

    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        lazy = self.scan(url, storage_options=storage_options)
        return list(lazy.collect_schema().items())

    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        lazy = self.scan(url, storage_options=storage_options)
        return lazy.select(pl.len()).collect().item()

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        # Serialize row by row; default=str renders non-JSON-native values
        # (dates, decimals, ...) as strings instead of raising.
        for batch in lf.collect_batches():
            for record in batch.iter_rows(named=True):
                line = json.dumps(record, default=str, ensure_ascii=False)
                yield f"{line}\n".encode("utf-8")

    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        with open(path, "wb") as sink:
            sink.writelines(self.write(lf))
@@ -0,0 +1,57 @@
1
+ """Parquet format handler."""
2
+
3
+ from collections.abc import Iterable
4
+ from io import BytesIO
5
+ from typing import BinaryIO
6
+
7
+ import polars as pl
8
+
9
+ from tab_cli.formats.base import FormatHandler
10
+
11
+
12
class ParquetFormat(FormatHandler):
    """Handler for Parquet files."""

    def extension(self) -> str:
        return ".parquet"

    def supports_glob(self) -> bool:
        return True

    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        return pl.scan_parquet(url, storage_options=storage_options)

    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        return pl.read_parquet(stream)

    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        lazy = self.scan(url, storage_options=storage_options)
        return list(lazy.collect_schema().items())

    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        lazy = self.scan(url, storage_options=storage_options)
        return lazy.select(pl.len()).collect().item()

    def extra_summary(self, url: str) -> dict[str, str | int | float] | None:
        # Imported lazily so pyarrow is only needed for the summary command.
        import pyarrow.parquet as pq

        # NOTE(review): pq.ParquetFile takes the URL directly, without the
        # storage backend — presumably only local paths reach here; confirm.
        metadata = pq.ParquetFile(url).metadata

        # Gather every compression codec used across all column chunks.
        codecs: set[str] = set()
        for rg_idx in range(metadata.num_row_groups):
            row_group = metadata.row_group(rg_idx)
            codecs.update(
                row_group.column(col_idx).compression
                for col_idx in range(row_group.num_columns)
            )

        extra: dict[str, str | int | float] = {"Row groups": metadata.num_row_groups}
        if codecs:
            extra["Compression"] = ", ".join(sorted(codecs))
        return extra

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        buffer = BytesIO()
        lf.sink_parquet(buffer)
        yield buffer.getvalue()

    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        lf.sink_parquet(path)
@@ -0,0 +1,87 @@
1
+ """Handler registration and inference."""
2
+
3
+ import os
4
+
5
+ from tab_cli.formats import AvroFormat, CsvFormat, JsonlFormat, ParquetFormat
6
+ from tab_cli.formats.base import FormatHandler
7
+ from tab_cli.handlers.base import FormatWriter, TableReader, TableWriter
8
+ from tab_cli.handlers.cli_table import CliTableFormatter
9
+ from tab_cli.storage import get_backend
10
+
11
# Registry of format handlers, keyed by user-facing format name.
_FORMAT_MAP: dict[str, FormatHandler] = {
    "csv": CsvFormat(separator=","),
    "tsv": CsvFormat(separator="\t"),
    "parquet": ParquetFormat(),
    "jsonl": JsonlFormat(),
    "avro": AvroFormat(),
}
19
+
20
+
21
+ def _get_extension(path: str) -> str:
22
+ """Extract file extension from a path or URL."""
23
+ # Handle URIs by getting the path component
24
+ if "://" in path:
25
+ path = path.split("://", 1)[1]
26
+ # Get extension from basename
27
+ basename = os.path.basename(path.rstrip("/"))
28
+ return os.path.splitext(basename)[1][1:].lower()
29
+
30
+
31
def infer_reader(path: str, format: str | None = None) -> TableReader:
    """Infer the reader for a file.

    Args:
        path: Path to the file or directory (local or cloud URL).
        format: Explicit format override. If None, inferred from extension.

    Returns:
        TableReader configured for the format and storage backend.
    """
    backend = get_backend(path)

    # An explicit format wins over any extension-based inference.
    if format is not None:
        fmt = _FORMAT_MAP.get(format.lower())
        if fmt is None:
            raise ValueError(f"Unknown format: {format}. Supported: {', '.join(_FORMAT_MAP)}")
        return TableReader(backend, fmt)

    if backend.is_directory(path):
        # Use the first file in the directory that carries an extension.
        extension = ""
        for file_info in backend.list_files(path, ""):
            extension = _get_extension(file_info.url)
            if extension:
                break
        if not extension:
            raise ValueError(f"No files found in directory: {path}")
    else:
        extension = _get_extension(path)

    fmt = _FORMAT_MAP.get(extension)
    if fmt is None:
        raise ValueError(f"Unknown extension: {extension}. Supported: {', '.join(_FORMAT_MAP)}")

    return TableReader(backend, fmt)
66
+
67
+
68
def infer_writer(format: str | None = None, truncated: bool = False) -> TableWriter:
    """Infer the writer for a format.

    Args:
        format: Output format. If None, returns CLI table formatter.
        truncated: Whether the output is truncated (for CLI display).

    Returns:
        TableWriter for the format.
    """
    # No format means interactive display; "table-svg" is the SVG-capturing
    # variant of the same CLI table formatter.
    if format is None:
        return CliTableFormatter(truncated=truncated)
    if format == "table-svg":
        return CliTableFormatter(truncated=truncated, svg_capture=True)

    fmt = _FORMAT_MAP.get(format.lower())
    if fmt is not None:
        return FormatWriter(fmt)
    raise ValueError(f"Unknown format: {format}. Supported: {', '.join(_FORMAT_MAP)}")