tab-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .env
25
+ .venv
26
+ env/
27
+ venv/
28
+ ENV/
29
+
30
+ # uv
31
+ .uv/
32
+ uv.lock
33
+
34
+ # PyCharm
35
+ .idea/
36
+ *.iml
37
+ *.ipr
38
+ *.iws
39
+ out/
40
+
41
+ # Testing
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ htmlcov/
47
+ .pytest_cache/
48
+ .hypothesis/
49
+
50
+ # Mypy
51
+ .mypy_cache/
52
+
53
+ # Ruff
54
+ .ruff_cache/
55
+
56
+ # Distribution
57
+ *.manifest
58
+ *.spec
59
+
60
+ # Misc
61
+ *.log
62
+ .DS_Store
tab_cli-0.1.0/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright 2026-- Tongfei Chen
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
tab_cli-0.1.0/Makefile ADDED
@@ -0,0 +1,32 @@
1
# Every target below is a command, not a file; `typecheck` was missing from this list.
.PHONY: install dev clean lint format typecheck test build publish publish-test
2
+
3
+ install:
4
+ uv tool install . --force
5
+
6
+ dev:
7
+ uv sync --dev
8
+
9
+ clean:
10
+ rm -rf dist/ build/ *.egg-info .pytest_cache .mypy_cache .ruff_cache
11
+ find . -type d -name __pycache__ -exec rm -rf {} +
12
+
13
+ lint:
14
+ uv run ruff check tab_cli/
15
+
16
+ format:
17
+ uv run ruff format tab_cli/
18
+
19
+ typecheck:
20
+ uv run ty check tab_cli/
21
+
22
+ test:
23
+ uv run pytest
24
+
25
+ build: clean
26
+ uv build
27
+
28
+ publish: build
29
+ uv publish
30
+
31
+ publish-test: build
32
+ uv publish --publish-url https://test.pypi.org/legacy/
tab_cli-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: tab-cli
3
+ Version: 0.1.0
4
+ Summary: A CLI tool for tabular data
5
+ Author-email: Tongfei Chen <tongfei@pm.me>
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.10
8
+ Requires-Dist: blobfile>=3.0
9
+ Requires-Dist: fire>=0.5
10
+ Requires-Dist: polars>=1.0
11
+ Requires-Dist: pyarrow>=15.0
12
+ Requires-Dist: rich>=13.0
13
+ Description-Content-Type: text/markdown
14
+
15
+ # tab
16
+
17
+ A fast CLI tool for viewing, querying, and converting tabular data files.
18
+
19
+ ## Supported Formats
20
+ - Parquet
21
+ - CSV
22
+ - TSV
23
+ - JSONL
24
+
25
+ ## Usage
26
+
27
+ ### View data
28
+
29
+ Display rows from a tabular data file:
30
+
31
+ ```bash
32
+ tab view data.parquet
33
+ tab view data.csv --limit 20
34
+ tab view data.tsv --skip 100 --limit 50
35
+ ```
36
+
37
+ Output to different formats:
38
+
39
+ ```bash
40
+ tab view data.parquet -o jsonl
41
+ tab view data.parquet -o csv
42
+ ```
43
+
44
+ ### Schema
45
+
46
+ Display the schema (column names and types):
47
+
48
+ ```bash
49
+ tab schema data.parquet
50
+ ```
51
+
52
+ ### Summary
53
+
54
+ Display summary information about a file:
55
+
56
+ ```bash
57
+ tab summary data.parquet
58
+ ```
59
+
60
+ ### SQL queries
61
+
62
+ Run SQL queries on your data. The table is referenced as `t`:
63
+
64
+ ```bash
65
+ tab sql "SELECT * FROM t WHERE age > 30" data.parquet
66
+ tab sql "SELECT name, COUNT(*) FROM t GROUP BY name" data.csv
67
+ ```
68
+
69
+ ### Convert
70
+
71
+ Convert between formats:
72
+
73
+ ```bash
74
+ tab convert data.csv data.parquet
75
+ tab convert data.parquet data.jsonl -o jsonl
76
+ ```
77
+
78
+ Write partitioned output:
79
+
80
+ ```bash
81
+ tab convert data.csv output_dir/ -o parquet -n 4
82
+ ```
83
+
84
+ ## Options
85
+
86
+ ### Common options
87
+
88
+ | Option | Description |
89
+ |-----------|-------------------------------------------------------------------------------|
90
+ | `-i` | Input format (`parquet`, `csv`, `tsv`, `jsonl`). Auto-detected from extension. |
91
+ | `-o` | Output format (`parquet`, `csv`, `tsv`, `jsonl`). |
92
+ | `--limit` | Maximum number of rows to display. |
93
+ | `--skip` | Number of rows to skip from the beginning. |
94
+
95
+ ### Convert options
96
+
97
+ | Option | Description |
98
+ |--------|-------------|
99
+ | `-n` | Number of output partitions. Creates a directory with part files. |
100
+
@@ -0,0 +1,86 @@
1
+ # tab
2
+
3
+ A fast CLI tool for viewing, querying, and converting tabular data files.
4
+
5
+ ## Supported Formats
6
+ - Parquet
7
+ - CSV
8
+ - TSV
9
+ - JSONL
10
+
11
+ ## Usage
12
+
13
+ ### View data
14
+
15
+ Display rows from a tabular data file:
16
+
17
+ ```bash
18
+ tab view data.parquet
19
+ tab view data.csv --limit 20
20
+ tab view data.tsv --skip 100 --limit 50
21
+ ```
22
+
23
+ Output to different formats:
24
+
25
+ ```bash
26
+ tab view data.parquet -o jsonl
27
+ tab view data.parquet -o csv
28
+ ```
29
+
30
+ ### Schema
31
+
32
+ Display the schema (column names and types):
33
+
34
+ ```bash
35
+ tab schema data.parquet
36
+ ```
37
+
38
+ ### Summary
39
+
40
+ Display summary information about a file:
41
+
42
+ ```bash
43
+ tab summary data.parquet
44
+ ```
45
+
46
+ ### SQL queries
47
+
48
+ Run SQL queries on your data. The table is referenced as `t`:
49
+
50
+ ```bash
51
+ tab sql "SELECT * FROM t WHERE age > 30" data.parquet
52
+ tab sql "SELECT name, COUNT(*) FROM t GROUP BY name" data.csv
53
+ ```
54
+
55
+ ### Convert
56
+
57
+ Convert between formats:
58
+
59
+ ```bash
60
+ tab convert data.csv data.parquet
61
+ tab convert data.parquet data.jsonl -o jsonl
62
+ ```
63
+
64
+ Write partitioned output:
65
+
66
+ ```bash
67
+ tab convert data.csv output_dir/ -o parquet -n 4
68
+ ```
69
+
70
+ ## Options
71
+
72
+ ### Common options
73
+
74
+ | Option | Description |
75
+ |-----------|-------------------------------------------------------------------------------|
76
+ | `-i` | Input format (`parquet`, `csv`, `tsv`, `jsonl`). Auto-detected from extension. |
77
+ | `-o` | Output format (`parquet`, `csv`, `tsv`, `jsonl`). |
78
+ | `--limit` | Maximum number of rows to display. |
79
+ | `--skip` | Number of rows to skip from the beginning. |
80
+
81
+ ### Convert options
82
+
83
+ | Option | Description |
84
+ |--------|-------------|
85
+ | `-n` | Number of output partitions. Creates a directory with part files. |
86
+
@@ -0,0 +1,28 @@
1
[project]
name = "tab-cli"
version = "0.1.0"
description = "A CLI tool for tabular data"
authors = [{name = "Tongfei Chen", email = "tongfei@pm.me"}]
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "fire>=0.5",
    "rich>=13.0",
    "polars>=1.0",
    "pyarrow>=15.0",
    "blobfile>=3.0",
]

# `repository` is not a recognised key of the [project] table; project links
# belong in [project.urls] so they end up in the package metadata.
[project.urls]
Repository = "https://github.com/tongfei/tab"
16
+
17
+ [project.scripts]
18
+ tab = "tab_cli.cli:main"
19
+
20
+ [build-system]
21
+ requires = ["hatchling"]
22
+ build-backend = "hatchling.build"
23
+
24
+ [dependency-groups]
25
+ dev = [
26
+ "ruff>=0.14.14",
27
+ "ty>=0.0.14",
28
+ ]
@@ -0,0 +1,3 @@
1
+ """Tab CLI - A CLI tool for tabular data."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,147 @@
1
+ """Main CLI entry point using Fire."""
2
+
3
+ import sys
4
+
5
+ import fire
6
+ import polars as pl
7
+ from rich.console import Console
8
+
9
+ from tab_cli.handlers import infer_reader, infer_writer, TableWriter
10
+
11
+
12
class Tab:
    """Command group for the ``tab`` CLI.

    Fire exposes every public method as a sub-command; ``_output`` is a
    shared helper used by the commands that print rows.
    """

    def _output(
        self,
        lf: pl.LazyFrame,
        limit: int | None,
        skip: int,
        output: str | None,
    ) -> None:
        """Slice ``lf`` and stream it to stdout in the requested format.

        Args:
            lf: Frame to print.
            limit: Maximum number of rows, or None for no explicit limit.
            skip: Number of leading rows to drop.
            output: Output format name. None selects the rich table.
        """
        # With neither a limit nor an output format, show a short preview
        # and mark it as truncated when more rows exist.
        preview = limit is None and output is None
        truncated = False

        if preview:
            preview_rows = 20
            # Fetch one extra row so we can tell whether anything was cut off.
            df = lf.slice(skip, length=preview_rows + 1).collect()
            truncated = len(df) > preview_rows
            if truncated:
                df = df.head(preview_rows)
            lf = df.lazy()
        elif skip > 0 or limit is not None:
            lf = lf.slice(skip, length=limit)

        writer = infer_writer(output, truncated=truncated)

        for chunk in writer.write(lf):
            sys.stdout.buffer.write(chunk)

    def view(
        self,
        path: str,
        limit: int | None = None,
        skip: int = 0,
        output: str | None = None,
        input: str | None = None,
    ) -> None:
        """View tabular data from a file.

        Args:
            path: Path to the data file.
            limit: Maximum number of rows to display. When omitted, the rich
                table shows the first 20 rows; other output formats emit all rows.
            skip: Number of rows to skip from the beginning (default: 0).
            output: Output format ('jsonl', 'csv', 'tsv', 'parquet'). Default is a rich table.
            input: Input format ('parquet', 'csv', 'tsv', 'jsonl'). Default is inferred from extension.
        """
        reader = infer_reader(path, format=input)
        lf = reader.read(path)
        self._output(lf, limit=limit, skip=skip, output=output)

    def schema(self, path: str, input: str | None = None) -> None:
        """Display the schema of a tabular data file.

        Args:
            path: Path to the data file.
            input: Input format ('parquet', 'csv', 'tsv', 'jsonl'). Default is inferred from extension.
        """
        reader = infer_reader(path, format=input)
        table_schema = reader.schema(path)
        console = Console(force_terminal=True)
        console.print(table_schema)

    def sql(
        self,
        query: str,
        path: str,
        limit: int | None = None,
        skip: int = 0,
        output: str | None = None,
        input: str | None = None,
    ) -> None:
        """Run a SQL query on tabular data. The table is available as `t`.

        Args:
            query: SQL query to execute. Reference the data as table `t`.
            path: Path to the data file.
            limit: Maximum number of rows to display. When omitted, the rich
                table shows the first 20 rows; other output formats emit all rows.
            skip: Number of rows to skip from the beginning (default: 0).
            output: Output format ('jsonl', 'csv', 'tsv', 'parquet'). Default is a rich table.
            input: Input format ('parquet', 'csv', 'tsv', 'jsonl'). Default is inferred from extension.
        """
        reader = infer_reader(path, format=input)
        lf = reader.read(path)
        ctx = pl.SQLContext(t=lf, eager=False)
        result_lf = ctx.execute(query)
        self._output(result_lf, limit=limit, skip=skip, output=output)

    def summary(self, path: str, input: str | None = None) -> None:
        """Display summary information about a tabular data file.

        Args:
            path: Path to the data file.
            input: Input format ('parquet', 'csv', 'tsv', 'jsonl'). Default is inferred from extension.
        """
        handler = infer_reader(path, format=input)
        table_summary = handler.summary(path)
        console = Console(force_terminal=True)
        console.print(table_summary)

    def convert(
        self,
        src: str,
        dst: str,
        input: str | None = None,
        output: str | None = None,
        num_partitions: int | None = None,
    ) -> None:
        """Convert tabular data from one format to another.

        The output format is chosen in this order: the explicit ``output``
        argument, the extension of ``dst`` when it names a known format,
        the explicit ``input`` format, and finally the format of the source.

        Args:
            src: Path to the source data file.
            dst: Path to the destination file or directory.
            input: Input format ('parquet', 'csv', 'tsv', 'jsonl'). Default is inferred from src extension.
            output: Output format ('parquet', 'csv', 'tsv', 'jsonl'). Default is inferred
                from the dst extension, falling back to the input format.
            num_partitions: Number of output partitions. If not specified, writes to a single file.

        Raises:
            ValueError: If a format name is unknown or no output format can be determined.
        """
        import os  # local import: this module's header does not import os

        reader = infer_reader(src, format=input)

        writer = None
        if output is not None:
            writer = infer_writer(format=output)
        else:
            # Honour the destination extension so that e.g.
            # `convert data.csv data.parquet` really writes Parquet.
            dst_extension = os.path.splitext(dst)[1][1:]
            if dst_extension:
                try:
                    writer = infer_writer(format=dst_extension)
                except ValueError:
                    # Unrecognised extension: fall back to the input format.
                    writer = None
            if writer is None and input is not None:
                writer = infer_writer(format=input)
            if writer is None and isinstance(reader, TableWriter):
                writer = reader
            if writer is None:
                # Readers that cannot write themselves (a directory reader)
                # may still carry the extension of the files they read.
                reader_extension = getattr(reader, "extension", None)
                if isinstance(reader_extension, str) and reader_extension:
                    writer = infer_writer(format=reader_extension.lstrip("."))

        if writer is None:
            raise ValueError(
                f"Cannot determine the output format for {dst}; pass --output explicitly."
            )

        lf = reader.read(src)
        writer.write_to_path(lf, dst, partitions=num_partitions)
140
+
141
+
142
def main() -> None:
    """Console-script entry point: hand the ``Tab`` command group to Fire."""
    fire.Fire(Tab)


# Allow running the module directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,55 @@
1
+ import os
2
+
3
+ from tab_cli.handlers.base import TableReader, TableWriter
4
+ from tab_cli.handlers.cli_table import CliTableFormatter
5
+ from tab_cli.handlers.csv import CsvHandler
6
+ from tab_cli.handlers.directory import DirectoryReader
7
+ from tab_cli.handlers.jsonl import JsonlHandler
8
+ from tab_cli.handlers.parquet import ParquetHandler
9
+
10
# Format name -> handler used for reading. The keys are also the file
# extensions (without the dot) recognised by infer_reader_from_extension.
_READER_MAP = dict(
    csv=CsvHandler(","),
    tsv=CsvHandler("\t"),
    parquet=ParquetHandler(),
    jsonl=JsonlHandler(),
)

# Format name -> handler used for writing. These are separate handler
# instances from the ones in _READER_MAP.
_WRITER_MAP = dict(
    csv=CsvHandler(","),
    tsv=CsvHandler("\t"),
    parquet=ParquetHandler(),
    jsonl=JsonlHandler(),
)
23
+
24
def infer_reader(path: str, format: str | None = None) -> TableReader:
    """Infer the reader for a path.

    Args:
        path: File or directory to read.
        format: Explicit format name. When given it takes precedence over
            the extension and is looked up case-insensitively.

    Returns:
        The reader registered for the format. For a directory without an
        explicit format, a DirectoryReader wrapping the reader of the first
        file with a recognised extension.

    Raises:
        ValueError: If the format or extension is unknown, or a directory
            contains no file with a supported extension.
    """
    if format is not None:
        handler = _READER_MAP.get(format.lower())
        if handler is None:
            raise ValueError(f"Unknown format: {format}. Supported: {', '.join(_READER_MAP)}")
        return handler

    if os.path.isdir(path):
        # Look for the first file with a supported extension, in a
        # deterministic order. Taking os.listdir(path)[0] blindly fails on an
        # empty directory and may pick a hidden file, a marker file or a
        # subdirectory instead of a data file.
        for _root, dirnames, filenames in os.walk(path):
            dirnames.sort()
            for filename in sorted(filenames):
                extension = os.path.splitext(filename)[1][1:].lower()
                if extension in _READER_MAP:
                    return DirectoryReader(extension, infer_reader_from_extension(extension))
        raise ValueError(
            f"No files with a supported extension found in {path}. "
            f"Supported: {', '.join(_READER_MAP)}"
        )

    extension = os.path.splitext(path)[1][1:].lower()
    return infer_reader_from_extension(extension)
38
+
39
+
40
def infer_reader_from_extension(extension: str) -> TableReader:
    """Return the reader registered for ``extension``.

    The extension is looked up as given (no dot, no case folding).

    Raises:
        ValueError: If no reader is registered for the extension.
    """
    if extension not in _READER_MAP:
        raise ValueError(f"Unknown extension: {extension}. Supported: {', '.join(_READER_MAP)}")
    return _READER_MAP[extension]
46
+
47
+
48
def infer_writer(format: str | None = None, truncated: bool = False) -> TableWriter:
    """Return the writer for ``format``.

    Args:
        format: Format name, matched case-insensitively. None selects the
            rich table formatter.
        truncated: Forwarded to the table formatter; unused for named formats.

    Raises:
        ValueError: If ``format`` is not a registered writer format.
    """
    if format is None:
        return CliTableFormatter(truncated=truncated)

    key = format.lower()
    if key in _WRITER_MAP:
        return _WRITER_MAP[key]
    raise ValueError(f"Unknown format: {format}. Supported: {', '.join(_WRITER_MAP)}")
@@ -0,0 +1,126 @@
1
+ """Base reader interface for tabular data."""
2
+
3
+ import os
4
+ from abc import ABC, abstractmethod
5
+ from collections.abc import Iterable
6
+ from dataclasses import dataclass
7
+ from rich.table import Table
8
+ from rich.progress import Progress
9
+ from rich import box
10
+
11
+ import polars as pl
12
+
13
+ from tab_cli.style import _KEY_STYLE, _VAL_STYLE, _ALT_ROW_STYLE
14
+
15
+
16
@dataclass
class TableSchema:
    """Schema information for a table."""

    # (column name, polars dtype) pairs.
    columns: list[tuple[str, pl.DataType]]

    def __rich__(self) -> Table:
        """Build a headerless two-column Rich table: column name, dtype."""
        # Local import so this block does not depend on a header change.
        from rich.markup import escape

        table = Table(
            show_header=False,
            box=box.SIMPLE_HEAD,
            row_styles=["", _ALT_ROW_STYLE],
        )
        table.add_column(style=_KEY_STYLE)
        table.add_column(style=_VAL_STYLE)
        for name, dtype in self.columns:
            # Column names are user data: escape them so text such as
            # "[red]" is shown literally instead of being read as markup.
            table.add_row(escape(name), escape(str(dtype)))
        return table
35
+
36
+
37
@dataclass
class TableSummary:
    """Summary information for a table."""

    file_size: int  # size in bytes
    num_rows: int
    num_columns: int
    # Optional extra rows appended to the rendered summary.
    extra: dict[str, str | int | float] | None = None

    def __rich__(self) -> Table:
        """Build a headerless two-column Rich table of summary facts."""

        def format_size(size: int) -> str:
            # Plain byte counts are printed as integers; everything larger
            # is scaled by 1024 per step and shown with one decimal.
            if size < 1024:
                return f"{int(size)} B"
            scaled: float = size / 1024
            for unit in ("KiB", "MiB", "GiB", "TiB"):
                if scaled < 1024:
                    return f"{scaled:.1f} {unit}"
                scaled /= 1024
            return f"{scaled:.1f} PiB"

        table = Table(
            show_header=False,
            box=box.SIMPLE_HEAD,
            row_styles=["", _ALT_ROW_STYLE],
        )
        table.add_column(style=_KEY_STYLE)
        table.add_column(style=_VAL_STYLE)

        fixed_rows = [
            ("File size", format_size(self.file_size)),
            ("Rows", f"{self.num_rows:,}"),
            ("Columns", str(self.num_columns)),
        ]
        for label, text in fixed_rows:
            table.add_row(label, text)

        for key, value in (self.extra or {}).items():
            table.add_row(key, str(value))

        return table
74
+
75
+
76
class TableReader(ABC):
    """Interface implemented by every input format."""

    @abstractmethod
    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Return the data at ``path`` as a LazyFrame, optionally sliced by ``offset``/``limit``."""

    @abstractmethod
    def schema(self, path: str) -> TableSchema:
        """Return the column names and types of the data at ``path``."""

    @abstractmethod
    def summary(self, path: str) -> TableSummary:
        """Return size, row-count and column-count information for ``path``."""
89
+
90
+
91
class TableWriter(ABC):
    """Interface implemented by every output format."""

    @abstractmethod
    def extension(self) -> str:
        """Return the file extension for this format (e.g., '.parquet', '.csv')."""

    @abstractmethod
    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Yield the serialised form of ``lf`` as one or more byte chunks."""

    @abstractmethod
    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Write a LazyFrame to a single file."""

    def write_to_path(self, lf: pl.LazyFrame, path: str, partitions: int | None = None) -> None:
        """Write a LazyFrame to a file or partitioned directory.

        Args:
            lf: Frame to write.
            path: Destination file, or destination directory when
                ``partitions`` is given (created if missing).
            partitions: Upper bound on the number of part files. Rows are
                split into equally sized contiguous slices, so fewer files
                are written when there are fewer rows than partitions.

        Raises:
            ValueError: If ``partitions`` is given but smaller than 1.
        """
        if partitions is None:
            with Progress() as progress:
                task = progress.add_task("Writing...", total=1)
                self.write_single(lf, path)
                progress.update(task, completed=1)
            return

        # Reject non-positive counts up front: 0 would otherwise fail with a
        # ZeroDivisionError and negative values would silently write nothing.
        if partitions < 1:
            raise ValueError(f"partitions must be a positive integer, got {partitions}")

        os.makedirs(path, exist_ok=True)
        row_count = lf.select(pl.len()).collect().item()
        rows_per_part = (row_count + partitions - 1) // partitions  # ceiling division

        if row_count == 0:
            # Still write one (empty) part so the output keeps the schema
            # and the directory is not left without any data file.
            offsets = [0]
        else:
            offsets = list(range(0, row_count, rows_per_part))

        with Progress() as progress:
            task = progress.add_task("Writing partitions...", total=len(offsets))
            for i, offset in enumerate(offsets):
                part_lf = lf.slice(offset, rows_per_part)
                part_path = os.path.join(path, f"part-{i:05d}{self.extension()}")
                self.write_single(part_lf, part_path)
                progress.update(task, advance=1)
@@ -0,0 +1,48 @@
1
+ from collections.abc import Iterable
2
+
3
+ from rich.table import Table
4
+ from rich import box
5
+ from rich.console import Console
6
+ import polars as pl
7
+
8
+ from tab_cli.handlers.base import TableWriter
9
+ from tab_cli.style import _ALT_ROW_STYLE, _KEY_STYLE
10
+
11
+
12
class CliTableFormatter(TableWriter):
    """Writer that renders a frame as a Rich table with ANSI styling."""

    def __init__(self, truncated: bool = False):
        # When True, a final row of "..." signals that more rows exist.
        self.truncated = truncated

    def extension(self) -> str:
        """Return the extension used when the table is written to a file."""
        return ".txt"

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Render ``lf`` as a table and yield it as one UTF-8 encoded chunk."""
        # Local import so this block does not depend on a header change.
        from rich.markup import escape

        # Resolve the schema once; it is needed for headers and the "..." row.
        column_names = lf.collect_schema().names()

        table = Table(
            show_header=True,
            header_style=_KEY_STYLE,
            box=box.SIMPLE_HEAD,
            row_styles=["default", _ALT_ROW_STYLE],
        )

        # Headers and cells are user data. Escape them so text such as
        # "[bold]" or "[/x]" is printed literally instead of being parsed as
        # Rich markup (which would drop it or raise a markup error).
        for col in column_names:
            table.add_column(escape(col))

        for batch in lf.collect_batches():
            for row in batch.iter_rows():
                table.add_row(*["" if v is None else escape(str(v)) for v in row])

        if self.truncated:
            ellipsis_row = ["..."] * len(column_names)
            table.add_row(*ellipsis_row)

        # force_terminal keeps the styling even though output is captured.
        console = Console(force_terminal=True)
        with console.capture() as capture:
            console.print(table)

        yield capture.get().encode("utf-8")

    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Write a LazyFrame to a single text file."""
        with open(path, "wb") as f:
            for chunk in self.write(lf):
                f.write(chunk)
@@ -0,0 +1,56 @@
1
+ """CSV file handler using Polars."""
2
+
3
+ import os
4
+ from collections.abc import Iterable
5
+ from io import BytesIO
6
+
7
+ import polars as pl
8
+
9
+ from tab_cli.handlers.base import TableReader, TableWriter, TableSchema, TableSummary
10
+
11
+
12
class CsvHandler(TableReader, TableWriter):
    """Reader/writer for delimiter-separated text (CSV and TSV)."""

    def __init__(self, separator: str = ","):
        self.separator = separator

    def _scan(self, path: str) -> pl.LazyFrame:
        """Lazily scan ``path`` using this handler's separator."""
        return pl.scan_csv(path, separator=self.separator)

    def extension(self) -> str:
        """Return '.csv' for a comma separator and '.tsv' for anything else."""
        if self.separator == ",":
            return ".csv"
        return ".tsv"

    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Scan the file lazily, applying ``offset`` and ``limit`` if given."""
        scanned = self._scan(path)
        if offset > 0:
            return scanned.slice(offset, length=limit)
        if limit is not None:
            return scanned.head(limit)
        return scanned

    def schema(self, path: str) -> TableSchema:
        """Return the column names and dtypes resolved by the scan."""
        resolved = self._scan(path).collect_schema()
        return TableSchema(columns=list(resolved.items()))

    def summary(self, path: str) -> TableSummary:
        """Return file size, row count and column count."""
        size_on_disk = os.path.getsize(path)
        scanned = self._scan(path)
        column_count = len(scanned.collect_schema())
        row_count = scanned.select(pl.len()).collect().item()
        return TableSummary(
            file_size=size_on_disk,
            num_rows=row_count,
            num_columns=column_count,
        )

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Yield one encoded chunk per batch; only the first carries the header."""
        for index, batch in enumerate(lf.collect_batches()):
            buffer = BytesIO()
            batch.write_csv(buffer, separator=self.separator, include_header=index == 0)
            yield buffer.getvalue()

    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Stream the frame into a single CSV/TSV file at ``path``."""
        lf.sink_csv(path, separator=self.separator)
@@ -0,0 +1,96 @@
1
+ """Directory handler for partitioned datasets."""
2
+
3
+ import os
4
+ from collections.abc import Iterable
5
+ from glob import glob
6
+
7
+ import polars as pl
8
+
9
+ from tab_cli.handlers.base import TableReader, TableSchema, TableSummary
10
+
11
+
12
class DirectoryReader(TableReader):
    """Handler wrapper for partitioned datasets (directories of files)."""

    def __init__(self, extension: str, file_handler: TableReader) -> None:
        # Extension of the part files, with or without the leading dot.
        self.extension = extension
        # Reader applied to each individual part file.
        self.file_handler = file_handler

    def _get_files(self, path: str) -> list[str]:
        """Return the sorted part files under ``path`` (searched recursively).

        The extension is matched as a real suffix (``*.csv``). Without the
        dot, any name that merely ends in the same letters would match.
        Directories whose names happen to match are skipped.
        """
        suffix = f".{self.extension.lstrip('.')}" if self.extension else ""
        pattern = os.path.join(path, "**", f"*{suffix}")
        return sorted(f for f in glob(pattern, recursive=True) if os.path.isfile(f))

    def _require_files(self, path: str) -> list[str]:
        """Like ``_get_files`` but raise ValueError when nothing matches."""
        files = self._get_files(path)
        if not files:
            raise ValueError(f"No {self.extension} files found in {path}")
        return files

    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Read all part files and concatenate them vertically.

        Raises:
            ValueError: If the directory contains no matching files.
        """
        files = self._require_files(path)

        frames = [self.file_handler.read(file) for file in files]
        lf = pl.concat(frames, how="vertical")

        if offset > 0:
            lf = lf.slice(offset, length=limit)
        elif limit is not None:
            lf = lf.head(limit)

        return lf

    def schema(self, path: str) -> TableSchema:
        """Return the schema of the first part file (in sorted order).

        Raises:
            ValueError: If the directory contains no matching files.
        """
        files = self._require_files(path)
        return self.file_handler.schema(files[0])

    def summary(self, path: str) -> TableSummary:
        """Aggregate the summaries of all part files.

        File sizes and row counts are summed. Numeric ``extra`` values are
        summed per key, other values are collected and joined. A
        "Partitions" entry records the number of files.

        Raises:
            ValueError: If no files match or the files disagree on the
                number of columns.
        """
        files = self._require_files(path)

        file_size = 0
        num_rows = 0
        num_columns: int | None = None

        extra_numeric: dict[str, float] = {}
        extra_strings: dict[str, set[str]] = {}

        for file in files:
            file_summary = self.file_handler.summary(file)
            file_size += file_summary.file_size
            num_rows += file_summary.num_rows

            if num_columns is None:
                num_columns = file_summary.num_columns
            elif file_summary.num_columns != num_columns:
                raise ValueError(f"Inconsistent column counts in {path}")

            for key, value in (file_summary.extra or {}).items():
                if isinstance(value, (int, float)):
                    extra_numeric[key] = extra_numeric.get(key, 0) + value
                else:
                    extra_strings.setdefault(key, set()).add(str(value))

        extra: dict[str, str | int | float] = {"Partitions": len(files)}
        for key, total in extra_numeric.items():
            # Show whole-number totals without a trailing ".0".
            extra[key] = int(total) if float(total).is_integer() else total

        for key, values in extra_strings.items():
            extra[key] = ", ".join(sorted(values))

        return TableSummary(
            file_size=file_size,
            num_rows=num_rows,
            num_columns=num_columns or 0,
            extra=extra,
        )
96
+
@@ -0,0 +1,47 @@
1
+ import os
2
+ from collections.abc import Iterable
3
+ import json
4
+ import polars as pl
5
+ from tab_cli.handlers.base import TableReader, TableWriter, TableSchema, TableSummary
6
+
7
+
8
class JsonlHandler(TableReader, TableWriter):
    """Reader/writer for newline-delimited JSON files."""

    def extension(self) -> str:
        """Return the extension used for JSONL part files."""
        return ".jsonl"

    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Scan the file lazily, applying ``offset`` and ``limit`` if given."""
        scanned = pl.scan_ndjson(path)
        if offset > 0:
            return scanned.slice(offset, length=limit)
        if limit is not None:
            return scanned.head(limit)
        return scanned

    def schema(self, path: str) -> TableSchema:
        """Return the column names and dtypes resolved by the scan."""
        resolved = pl.scan_ndjson(path).collect_schema()
        return TableSchema(columns=list(resolved.items()))

    def summary(self, path: str) -> TableSummary:
        """Return file size, row count and column count."""
        size_on_disk = os.path.getsize(path)
        scanned = pl.scan_ndjson(path)
        column_count = len(scanned.collect_schema())
        row_count = scanned.select(pl.len()).collect().item()
        return TableSummary(
            file_size=size_on_disk,
            num_rows=row_count,
            num_columns=column_count,
        )

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Yield one UTF-8 encoded JSON line per row.

        Values the json module cannot serialise are converted with ``str``.
        """
        for batch in lf.collect_batches():
            for record in batch.iter_rows(named=True):
                line = json.dumps(record, default=str, ensure_ascii=False)
                yield (line + "\n").encode("utf-8")

    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Write every encoded line of ``lf`` to a single file at ``path``."""
        with open(path, "wb") as sink:
            for encoded_line in self.write(lf):
                sink.write(encoded_line)
@@ -0,0 +1,75 @@
1
+ """Parquet file handler using Polars."""
2
+
3
+ import os
4
+ from collections.abc import Iterable
5
+ from io import BytesIO
6
+
7
+ import polars as pl
8
+
9
+ from tab_cli.handlers.base import TableReader, TableWriter, TableSchema, TableSummary
10
+
11
+
12
class ParquetHandler(TableReader, TableWriter):
    """Handler for Parquet files."""

    def extension(self) -> str:
        """Return the extension used for Parquet part files."""
        return ".parquet"

    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Scan the Parquet file lazily, applying ``offset`` and ``limit`` if given."""
        lf = pl.scan_parquet(path)
        if offset > 0:
            lf = lf.slice(offset, length=limit)
        elif limit is not None:
            lf = lf.head(limit)
        return lf

    def schema(self, path: str) -> TableSchema:
        """Get the schema of the Parquet file."""
        lf = pl.scan_parquet(path)
        columns = list(lf.collect_schema().items())
        return TableSchema(columns=columns)

    def summary(self, path: str) -> TableSummary:
        """Get summary information about the Parquet file.

        Besides size, rows and columns, the summary reports the number of
        row groups and the compression codecs found in the column chunks.
        """
        import pyarrow.parquet as pq

        file_size = os.path.getsize(path)
        lf = pl.scan_parquet(path)
        schema = lf.collect_schema()
        num_columns = len(schema)
        num_rows = lf.select(pl.len()).collect().item()

        # read_metadata returns only the footer metadata, so no ParquetFile
        # object (and its open file handle) is left dangling here.
        metadata = pq.read_metadata(path)

        extra: dict[str, str | int | float] = {}

        # Collect compression codecs from all column chunks
        codecs: set[str] = set()
        for rg_idx in range(metadata.num_row_groups):
            rg = metadata.row_group(rg_idx)
            for col_idx in range(rg.num_columns):
                codecs.add(rg.column(col_idx).compression)
        extra["Row groups"] = metadata.num_row_groups
        if codecs:
            extra["Compression"] = ", ".join(sorted(codecs))

        return TableSummary(
            file_size=file_size,
            num_rows=num_rows,
            num_columns=num_columns,
            extra=extra,
        )

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Serialise the whole frame to Parquet in memory and yield it as one chunk."""
        output = BytesIO()
        lf.sink_parquet(output)
        yield output.getvalue()

    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Write a LazyFrame to a single Parquet file."""
        lf.sink_parquet(path)
@@ -0,0 +1,3 @@
1
# Rich style strings shared by the table renderers.

# Style for key cells and table headers.
_KEY_STYLE = "sea_green3"
# Style for value cells.
_VAL_STYLE = "bold sky_blue3"
# Style applied to every second row.
_ALT_ROW_STYLE = "on #282828"