tab-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tab_cli-0.1.0/.gitignore +62 -0
- tab_cli-0.1.0/LICENSE +7 -0
- tab_cli-0.1.0/Makefile +32 -0
- tab_cli-0.1.0/PKG-INFO +100 -0
- tab_cli-0.1.0/README.md +86 -0
- tab_cli-0.1.0/pyproject.toml +28 -0
- tab_cli-0.1.0/tab_cli/__init__.py +3 -0
- tab_cli-0.1.0/tab_cli/cli.py +147 -0
- tab_cli-0.1.0/tab_cli/handlers/__init__.py +55 -0
- tab_cli-0.1.0/tab_cli/handlers/base.py +126 -0
- tab_cli-0.1.0/tab_cli/handlers/cli_table.py +48 -0
- tab_cli-0.1.0/tab_cli/handlers/csv.py +56 -0
- tab_cli-0.1.0/tab_cli/handlers/directory.py +96 -0
- tab_cli-0.1.0/tab_cli/handlers/jsonl.py +47 -0
- tab_cli-0.1.0/tab_cli/handlers/parquet.py +75 -0
- tab_cli-0.1.0/tab_cli/style.py +3 -0
tab_cli-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.env
|
|
25
|
+
.venv
|
|
26
|
+
env/
|
|
27
|
+
venv/
|
|
28
|
+
ENV/
|
|
29
|
+
|
|
30
|
+
# uv
|
|
31
|
+
.uv/
|
|
32
|
+
uv.lock
|
|
33
|
+
|
|
34
|
+
# PyCharm
|
|
35
|
+
.idea/
|
|
36
|
+
*.iml
|
|
37
|
+
*.ipr
|
|
38
|
+
*.iws
|
|
39
|
+
out/
|
|
40
|
+
|
|
41
|
+
# Testing
|
|
42
|
+
.tox/
|
|
43
|
+
.nox/
|
|
44
|
+
.coverage
|
|
45
|
+
.coverage.*
|
|
46
|
+
htmlcov/
|
|
47
|
+
.pytest_cache/
|
|
48
|
+
.hypothesis/
|
|
49
|
+
|
|
50
|
+
# Mypy
|
|
51
|
+
.mypy_cache/
|
|
52
|
+
|
|
53
|
+
# Ruff
|
|
54
|
+
.ruff_cache/
|
|
55
|
+
|
|
56
|
+
# Distribution
|
|
57
|
+
*.manifest
|
|
58
|
+
*.spec
|
|
59
|
+
|
|
60
|
+
# Misc
|
|
61
|
+
*.log
|
|
62
|
+
.DS_Store
|
tab_cli-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2026-- Tongfei Chen
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
tab_cli-0.1.0/Makefile
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
.PHONY: install dev clean lint format test build publish publish-test
|
|
2
|
+
|
|
3
|
+
install:
|
|
4
|
+
uv tool install . --force
|
|
5
|
+
|
|
6
|
+
dev:
|
|
7
|
+
uv sync --dev
|
|
8
|
+
|
|
9
|
+
clean:
|
|
10
|
+
rm -rf dist/ build/ *.egg-info .pytest_cache .mypy_cache .ruff_cache
|
|
11
|
+
find . -type d -name __pycache__ -exec rm -rf {} +
|
|
12
|
+
|
|
13
|
+
lint:
|
|
14
|
+
uv run ruff check tab_cli/
|
|
15
|
+
|
|
16
|
+
format:
|
|
17
|
+
uv run ruff format tab_cli/
|
|
18
|
+
|
|
19
|
+
typecheck:
|
|
20
|
+
uv run ty check tab_cli/
|
|
21
|
+
|
|
22
|
+
test:
|
|
23
|
+
uv run pytest
|
|
24
|
+
|
|
25
|
+
build: clean
|
|
26
|
+
uv build
|
|
27
|
+
|
|
28
|
+
publish: build
|
|
29
|
+
uv publish
|
|
30
|
+
|
|
31
|
+
publish-test: build
|
|
32
|
+
uv publish --publish-url https://test.pypi.org/legacy/
|
tab_cli-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tab-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A CLI tool for tabular data
|
|
5
|
+
Author-email: Tongfei Chen <tongfei@pm.me>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: blobfile>=3.0
|
|
9
|
+
Requires-Dist: fire>=0.5
|
|
10
|
+
Requires-Dist: polars>=1.0
|
|
11
|
+
Requires-Dist: pyarrow>=15.0
|
|
12
|
+
Requires-Dist: rich>=13.0
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# tab
|
|
16
|
+
|
|
17
|
+
A fast CLI tool for viewing, querying, and converting tabular data files.
|
|
18
|
+
|
|
19
|
+
## Supported Formats
|
|
20
|
+
- Parquet
|
|
21
|
+
- CSV
|
|
22
|
+
- TSV
|
|
23
|
+
- JSONL
|
|
24
|
+
|
|
25
|
+
## Usage
|
|
26
|
+
|
|
27
|
+
### View data
|
|
28
|
+
|
|
29
|
+
Display rows from a tabular data file:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
tab view data.parquet
|
|
33
|
+
tab view data.csv --limit 20
|
|
34
|
+
tab view data.tsv --skip 100 --limit 50
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Output to different formats:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
tab view data.parquet -o jsonl
|
|
41
|
+
tab view data.parquet -o csv
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Schema
|
|
45
|
+
|
|
46
|
+
Display the schema (column names and types):
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
tab schema data.parquet
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Summary
|
|
53
|
+
|
|
54
|
+
Display summary information about a file:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
tab summary data.parquet
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### SQL queries
|
|
61
|
+
|
|
62
|
+
Run SQL queries on your data. The table is referenced as `t`:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
tab sql "SELECT * FROM t WHERE age > 30" data.parquet
|
|
66
|
+
tab sql "SELECT name, COUNT(*) FROM t GROUP BY name" data.csv
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Convert
|
|
70
|
+
|
|
71
|
+
Convert between formats:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
tab convert data.csv data.parquet
|
|
75
|
+
tab convert data.parquet data.jsonl -o jsonl
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Write partitioned output:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
tab convert data.csv output_dir/ -o parquet -n 4
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Options
|
|
85
|
+
|
|
86
|
+
### Common options
|
|
87
|
+
|
|
88
|
+
| Option | Description |
|
|
89
|
+
|-----------|-------------------------------------------------------------------------------|
|
|
90
|
+
| `-i` | Input format (`parquet`, `csv`, `tsv`, `jsonl`). Auto-detected from extension. |
|
|
91
|
+
| `-o` | Output format (`parquet`, `csv`, `tsv`, `jsonl`). |
|
|
92
|
+
| `--limit` | Maximum number of rows to display. |
|
|
93
|
+
| `--skip` | Number of rows to skip from the beginning. |
|
|
94
|
+
|
|
95
|
+
### Convert options
|
|
96
|
+
|
|
97
|
+
| Option | Description |
|
|
98
|
+
|--------|-------------|
|
|
99
|
+
| `-n` | Number of output partitions. Creates a directory with part files. |
|
|
100
|
+
|
tab_cli-0.1.0/README.md
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# tab
|
|
2
|
+
|
|
3
|
+
A fast CLI tool for viewing, querying, and converting tabular data files.
|
|
4
|
+
|
|
5
|
+
## Supported Formats
|
|
6
|
+
- Parquet
|
|
7
|
+
- CSV
|
|
8
|
+
- TSV
|
|
9
|
+
- JSONL
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
### View data
|
|
14
|
+
|
|
15
|
+
Display rows from a tabular data file:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
tab view data.parquet
|
|
19
|
+
tab view data.csv --limit 20
|
|
20
|
+
tab view data.tsv --skip 100 --limit 50
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Output to different formats:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
tab view data.parquet -o jsonl
|
|
27
|
+
tab view data.parquet -o csv
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Schema
|
|
31
|
+
|
|
32
|
+
Display the schema (column names and types):
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
tab schema data.parquet
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Summary
|
|
39
|
+
|
|
40
|
+
Display summary information about a file:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
tab summary data.parquet
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### SQL queries
|
|
47
|
+
|
|
48
|
+
Run SQL queries on your data. The table is referenced as `t`:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
tab sql "SELECT * FROM t WHERE age > 30" data.parquet
|
|
52
|
+
tab sql "SELECT name, COUNT(*) FROM t GROUP BY name" data.csv
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Convert
|
|
56
|
+
|
|
57
|
+
Convert between formats:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
tab convert data.csv data.parquet
|
|
61
|
+
tab convert data.parquet data.jsonl -o jsonl
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Write partitioned output:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
tab convert data.csv output_dir/ -o parquet -n 4
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Options
|
|
71
|
+
|
|
72
|
+
### Common options
|
|
73
|
+
|
|
74
|
+
| Option | Description |
|
|
75
|
+
|-----------|-------------------------------------------------------------------------------|
|
|
76
|
+
| `-i` | Input format (`parquet`, `csv`, `tsv`, `jsonl`). Auto-detected from extension. |
|
|
77
|
+
| `-o` | Output format (`parquet`, `csv`, `tsv`, `jsonl`). |
|
|
78
|
+
| `--limit` | Maximum number of rows to display. |
|
|
79
|
+
| `--skip` | Number of rows to skip from the beginning. |
|
|
80
|
+
|
|
81
|
+
### Convert options
|
|
82
|
+
|
|
83
|
+
| Option | Description |
|
|
84
|
+
|--------|-------------|
|
|
85
|
+
| `-n` | Number of output partitions. Creates a directory with part files. |
|
|
86
|
+
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "tab-cli"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A CLI tool for tabular data"
|
|
5
|
+
authors = [{name = "Tongfei Chen", email = "tongfei@pm.me"}]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
repository = "https://github.com/tongfei/tab"
|
|
8
|
+
requires-python = ">=3.10"
|
|
9
|
+
dependencies = [
|
|
10
|
+
"fire>=0.5",
|
|
11
|
+
"rich>=13.0",
|
|
12
|
+
"polars>=1.0",
|
|
13
|
+
"pyarrow>=15.0",
|
|
14
|
+
"blobfile>=3.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.scripts]
|
|
18
|
+
tab = "tab_cli.cli:main"
|
|
19
|
+
|
|
20
|
+
[build-system]
|
|
21
|
+
requires = ["hatchling"]
|
|
22
|
+
build-backend = "hatchling.build"
|
|
23
|
+
|
|
24
|
+
[dependency-groups]
|
|
25
|
+
dev = [
|
|
26
|
+
"ruff>=0.14.14",
|
|
27
|
+
"ty>=0.0.14",
|
|
28
|
+
]
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Main CLI entry point using Fire."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
import fire
|
|
6
|
+
import polars as pl
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
|
|
9
|
+
from tab_cli.handlers import infer_reader, infer_writer, TableWriter
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Tab:
    """Fire-dispatched CLI exposing the `view`, `schema`, `sql`, `summary`,
    and `convert` subcommands."""

    def _output(
        self,
        lf: pl.LazyFrame,
        limit: int | None,
        skip: int,
        output: str | None,
    ) -> None:
        """Render `lf` to stdout in the requested output format.

        Args:
            lf: The frame to render.
            limit: Maximum rows to emit, or None for the default window.
            skip: Rows to skip from the start before emitting.
            output: Output format name, or None for the default rich table.
        """
        # When the user gave neither --limit nor -o, show a 20-row window and
        # probe one extra row to decide whether to render a truncation marker.
        show_truncation = limit is None and output is None
        actual_limit = 20 if show_truncation else limit

        if show_truncation:
            assert actual_limit is not None
            # Fetch limit+1 rows: the extra row only signals that more data
            # exists; it is dropped again before rendering.
            lf = lf.slice(skip, length=actual_limit + 1)
            df = lf.collect()
            truncated = len(df) > actual_limit
            if truncated:
                df = df.head(actual_limit)
            lf = df.lazy()
        else:
            if skip > 0 or actual_limit is not None:
                lf = lf.slice(skip, length=actual_limit)
            truncated = False

        writer = infer_writer(output, truncated=show_truncation and truncated)

        # Stream raw bytes straight to stdout; no text-layer re-encoding.
        for chunk in writer.write(lf):
            sys.stdout.buffer.write(chunk)

    def view(
        self,
        path: str,
        limit: int | None = None,
        skip: int = 0,
        output: str | None = None,
        input: str | None = None,
    ) -> None:
        """View tabular data from a file.

        Args:
            path: Path to the data file.
            limit: Maximum number of rows to display (default: a 20-row window
                with a truncation marker when printing to the terminal).
            skip: Number of rows to skip from the beginning (default: 0).
            output: Output format ('jsonl', 'csv', 'tsv', 'parquet'). Default is a rich table.
            input: Input format ('parquet', 'csv', 'tsv'). Default is inferred from extension.
        """
        reader = infer_reader(path, format=input)
        lf = reader.read(path)
        self._output(lf, limit=limit, skip=skip, output=output)

    def schema(self, path: str, input: str | None = None) -> None:
        """Display the schema of a tabular data file.

        Args:
            path: Path to the data file.
            input: Input format ('parquet', 'csv', 'tsv'). Default is inferred from extension.
        """
        reader = infer_reader(path, format=input)
        table_schema = reader.schema(path)
        # force_terminal keeps colored output even when stdout is piped.
        console = Console(force_terminal=True)
        console.print(table_schema)

    def sql(
        self,
        query: str,
        path: str,
        limit: int | None = None,
        skip: int = 0,
        output: str | None = None,
        input: str | None = None,
    ) -> None:
        """Run a SQL query on tabular data. The table is available as `t`.

        Args:
            query: SQL query to execute. Reference the data as table `t`.
            path: Path to the data file.
            limit: Maximum number of rows to display (default: a 20-row window
                with a truncation marker when printing to the terminal).
            skip: Number of rows to skip from the beginning (default: 0).
            output: Output format ('jsonl', 'csv', 'tsv', 'parquet'). Default is a rich table.
            input: Input format ('parquet', 'csv', 'tsv'). Default is inferred from extension.
        """
        reader = infer_reader(path, format=input)
        lf = reader.read(path)
        # eager=False keeps the query lazy so _output can slice before collect.
        ctx = pl.SQLContext(t=lf, eager=False)
        result_lf = ctx.execute(query)
        self._output(result_lf, limit=limit, skip=skip, output=output)

    def summary(self, path: str, input: str | None = None) -> None:
        """Display summary information about a tabular data file.

        Args:
            path: Path to the data file.
            input: Input format ('parquet', 'csv', 'tsv'). Default is inferred from extension.
        """
        handler = infer_reader(path, format=input)
        table_summary = handler.summary(path)
        console = Console(force_terminal=True)
        console.print(table_summary)

    def convert(
        self,
        src: str,
        dst: str,
        input: str | None = None,
        output: str | None = None,
        num_partitions: int | None = None,
    ) -> None:
        """Convert tabular data from one format to another.

        Args:
            src: Path to the source data file.
            dst: Path to the destination file or directory.
            input: Input format ('parquet', 'csv', 'tsv'). Default is inferred from src extension.
            output: Output format ('parquet', 'csv', 'tsv'). Default is same as input format.
            num_partitions: Number of output partitions. If not specified, writes to a single file.
        """
        reader = infer_reader(src, format=input)
        # Determine output format: use -o if specified, else inherit from input
        if output is not None:
            writer = infer_writer(format=output)
        elif input is not None:
            writer = infer_writer(format=input)
        else:
            writer = reader
        # NOTE(review): when `src` is a directory and no format flag is given,
        # `reader` is a DirectoryReader, which is not a TableWriter, so this
        # assert fails with a bare AssertionError — confirm whether a friendly
        # "specify -o" error is intended here.
        assert isinstance(writer, TableWriter)
        lf = reader.read(src)
        writer.write_to_path(lf, dst, partitions=num_partitions)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def main() -> None:
    """CLI entry point: expose the `Tab` subcommands via Fire."""
    fire.Fire(Tab)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from tab_cli.handlers.base import TableReader, TableWriter
|
|
4
|
+
from tab_cli.handlers.cli_table import CliTableFormatter
|
|
5
|
+
from tab_cli.handlers.csv import CsvHandler
|
|
6
|
+
from tab_cli.handlers.directory import DirectoryReader
|
|
7
|
+
from tab_cli.handlers.jsonl import JsonlHandler
|
|
8
|
+
from tab_cli.handlers.parquet import ParquetHandler
|
|
9
|
+
|
|
10
|
+
# Registry of readers keyed by format name / file extension (dotless).
_READER_MAP = {
    "csv": CsvHandler(","),
    "tsv": CsvHandler("\t"),
    "parquet": ParquetHandler(),
    "jsonl": JsonlHandler(),
}

# Registry of writers; every built-in format supports both directions.
_WRITER_MAP = {
    "csv": CsvHandler(","),
    "tsv": CsvHandler("\t"),
    "parquet": ParquetHandler(),
    "jsonl": JsonlHandler(),
}
|
|
23
|
+
|
|
24
|
+
def infer_reader(path: str, format: str | None = None) -> TableReader:
    """Infer the handler for a file. If format is given, use that instead of extension.

    Args:
        path: Path to a data file or a directory of partition files.
        format: Explicit format name ('csv', 'tsv', 'parquet', 'jsonl');
            overrides extension-based inference when given.

    Raises:
        ValueError: If the format/extension is unknown, or a directory
            contains no file with a supported extension.
    """
    if format is not None:
        handler = _READER_MAP.get(format.lower())
        if handler is None:
            raise ValueError(f"Unknown format: {format}. Supported: {', '.join(_READER_MAP)}")
        return handler

    if os.path.isdir(path):
        # Pick the first entry with a supported extension rather than blindly
        # taking os.listdir(path)[0]: listdir order is arbitrary, the first
        # entry may be a hidden file (e.g. .DS_Store), and an empty directory
        # would raise IndexError.
        for entry in sorted(os.listdir(path)):
            extension = os.path.splitext(entry)[1][1:].lower()
            if extension in _READER_MAP:
                return DirectoryReader(extension, infer_reader_from_extension(extension))
        raise ValueError(
            f"No files with a supported extension found in {path}. "
            f"Supported: {', '.join(_READER_MAP)}"
        )

    extension = os.path.splitext(path)[1][1:].lower()
    return infer_reader_from_extension(extension)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def infer_reader_from_extension(extension: str) -> TableReader:
    """Infer the handler for a file based on its extension."""
    try:
        return _READER_MAP[extension]
    except KeyError:
        raise ValueError(
            f"Unknown extension: {extension}. Supported: {', '.join(_READER_MAP)}"
        ) from None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def infer_writer(format: str | None = None, truncated: bool = False) -> TableWriter:
    """Infer the writer for a format."""
    if format is None:
        # No explicit format: render as a rich table for the terminal.
        return CliTableFormatter(truncated=truncated)
    key = format.lower()
    if key not in _WRITER_MAP:
        raise ValueError(f"Unknown format: {format}. Supported: {', '.join(_WRITER_MAP)}")
    return _WRITER_MAP[key]
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Base reader interface for tabular data."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from collections.abc import Iterable
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
from rich.progress import Progress
|
|
9
|
+
from rich import box
|
|
10
|
+
|
|
11
|
+
import polars as pl
|
|
12
|
+
|
|
13
|
+
from tab_cli.style import _KEY_STYLE, _VAL_STYLE, _ALT_ROW_STYLE
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class TableSchema:
    """Schema information for a table."""

    # Ordered (column name, polars dtype) pairs.
    columns: list[tuple[str, pl.DataType]]

    def __rich__(self) -> Table:
        """Rich-formatted output for the schema."""
        grid = Table(
            show_header=False,
            box=box.SIMPLE_HEAD,
            row_styles=["", _ALT_ROW_STYLE],
        )
        grid.add_column(style=_KEY_STYLE)
        grid.add_column(style=_VAL_STYLE)
        for column_name, column_dtype in self.columns:
            grid.add_row(column_name, str(column_dtype))
        return grid
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class TableSummary:
    """Summary information for a table."""

    # Total size on disk, in bytes.
    file_size: int
    num_rows: int
    num_columns: int
    # Optional format-specific extras (e.g. compression codec, partitions).
    extra: dict[str, str | int | float] | None = None

    def __rich__(self) -> Table:
        """Rich-formatted output for the summary."""

        def format_size(size: int) -> str:
            # Human-readable binary units; bytes shown without decimals.
            amount: float = size
            for unit in ("B", "KiB", "MiB", "GiB", "TiB"):
                if amount < 1024:
                    return f"{int(amount)} {unit}" if unit == "B" else f"{amount:.1f} {unit}"
                amount /= 1024
            return f"{amount:.1f} PiB"

        grid = Table(
            show_header=False,
            box=box.SIMPLE_HEAD,
            row_styles=["", _ALT_ROW_STYLE],
        )
        grid.add_column(style=_KEY_STYLE)
        grid.add_column(style=_VAL_STYLE)

        grid.add_row("File size", format_size(self.file_size))
        grid.add_row("Rows", f"{self.num_rows:,}")
        grid.add_row("Columns", str(self.num_columns))

        if self.extra:
            for label, detail in self.extra.items():
                grid.add_row(label, str(detail))

        return grid
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class TableReader(ABC):
    """Abstract interface for reading tabular data from a path."""

    @abstractmethod
    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Return a LazyFrame over the data at `path`, optionally windowed
        by `offset`/`limit`."""
        pass

    @abstractmethod
    def schema(self, path: str) -> TableSchema:
        """Return the column names and dtypes of the data at `path`."""
        pass

    @abstractmethod
    def summary(self, path: str) -> TableSummary:
        """Return size/row/column summary information for the data at `path`."""
        pass
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class TableWriter(ABC):
    """Abstract interface for serializing a LazyFrame to bytes or files."""

    @abstractmethod
    def extension(self) -> str:
        """Return the file extension for this format (e.g., '.parquet', '.csv')."""
        pass

    @abstractmethod
    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Yield the serialized representation of `lf` as a stream of byte chunks."""
        pass

    @abstractmethod
    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Write a LazyFrame to a single file."""
        pass

    def write_to_path(self, lf: pl.LazyFrame, path: str, partitions: int | None = None) -> None:
        """Write a LazyFrame to a file or partitioned directory."""
        if partitions is None:
            # Single-file mode: one progress step around the whole write.
            with Progress() as progress:
                task = progress.add_task("Writing...", total=1)
                self.write_single(lf, path)
                progress.update(task, completed=1)
        else:
            # Partitioned mode: `path` is treated as a directory of part files.
            os.makedirs(path, exist_ok=True)
            row_count = lf.select(pl.len()).collect().item()
            # Ceiling division so every row lands in some partition.
            rows_per_part = (row_count + partitions - 1) // partitions
            with Progress() as progress:
                task = progress.add_task("Writing partitions...", total=partitions)
                for i in range(partitions):
                    offset = i * rows_per_part
                    # Trailing partitions past the end of the data are skipped
                    # (no empty part files), but the progress bar still advances.
                    if offset < row_count:
                        part_lf = lf.slice(offset, rows_per_part)
                        part_path = os.path.join(path, f"part-{i:05d}{self.extension()}")
                        self.write_single(part_lf, part_path)
                    progress.update(task, advance=1)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
|
|
3
|
+
from rich.table import Table
|
|
4
|
+
from rich import box
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
from tab_cli.handlers.base import TableWriter
|
|
9
|
+
from tab_cli.style import _ALT_ROW_STYLE, _KEY_STYLE
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CliTableFormatter(TableWriter):
    """Render a LazyFrame as a rich table for terminal display."""

    def __init__(self, truncated: bool = False):
        # When True, a trailing "..." row marks that output was cut short.
        self.truncated = truncated

    def extension(self) -> str:
        return ".txt"

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Yield the rendered table as a single UTF-8 chunk."""
        column_names = lf.collect_schema().names()

        rich_table = Table(
            show_header=True,
            header_style=_KEY_STYLE,
            box=box.SIMPLE_HEAD,
            row_styles=["default", _ALT_ROW_STYLE],
        )
        for name in column_names:
            rich_table.add_column(name)

        for batch in lf.collect_batches():
            for row in batch.iter_rows():
                cells = ["" if value is None else str(value) for value in row]
                rich_table.add_row(*cells)

        if self.truncated:
            rich_table.add_row(*["..."] * len(column_names))

        # force_terminal keeps ANSI styling even when stdout is not a TTY.
        console = Console(force_terminal=True)
        with console.capture() as capture:
            console.print(rich_table)

        yield capture.get().encode("utf-8")

    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Write a LazyFrame to a single text file."""
        with open(path, "wb") as sink:
            for chunk in self.write(lf):
                sink.write(chunk)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""CSV file handler using Polars."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from tab_cli.handlers.base import TableReader, TableWriter, TableSchema, TableSummary
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CsvHandler(TableReader, TableWriter):
    """Handler for CSV/TSV files."""

    def __init__(self, separator: str = ","):
        # Field delimiter: "," for CSV, "\t" for TSV.
        self.separator = separator

    def extension(self) -> str:
        if self.separator == ",":
            return ".csv"
        return ".tsv"

    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Lazily scan the file, optionally windowed by offset/limit."""
        frame = pl.scan_csv(path, separator=self.separator)
        if offset > 0:
            return frame.slice(offset, length=limit)
        if limit is not None:
            return frame.head(limit)
        return frame

    def schema(self, path: str) -> TableSchema:
        """Return column names and inferred dtypes."""
        frame = pl.scan_csv(path, separator=self.separator)
        return TableSchema(columns=list(frame.collect_schema().items()))

    def summary(self, path: str) -> TableSummary:
        """Return file size plus row/column counts."""
        size_on_disk = os.path.getsize(path)
        frame = pl.scan_csv(path, separator=self.separator)
        column_count = len(frame.collect_schema())
        row_count = frame.select(pl.len()).collect().item()
        return TableSummary(
            file_size=size_on_disk,
            num_rows=row_count,
            num_columns=column_count,
        )

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Yield CSV bytes batch by batch; header only on the first batch."""
        include_header = True
        for batch in lf.collect_batches():
            buffer = BytesIO()
            batch.write_csv(buffer, separator=self.separator, include_header=include_header)
            include_header = False
            yield buffer.getvalue()

    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Write a LazyFrame to a single CSV/TSV file."""
        lf.sink_csv(path, separator=self.separator)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Directory handler for partitioned datasets."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from glob import glob
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from tab_cli.handlers.base import TableReader, TableSchema, TableSummary
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DirectoryReader(TableReader):
    """Handler wrapper for partitioned datasets (directories of files)."""

    def __init__(self, extension: str, file_handler: TableReader) -> None:
        # `extension` is stored without a leading dot (e.g. "parquet").
        self.extension = extension
        # Per-file reader used for every partition.
        self.file_handler = file_handler

    def _get_files(self, path: str) -> list[str]:
        """Get all files with matching extension in the directory."""
        # Match "*.<ext>" rather than "*<ext>": the bare suffix pattern would
        # also pick up files like "mycsv" whose names merely end with the
        # extension characters but have no dot.
        pattern = os.path.join(path, "**", f"*.{self.extension}")
        return sorted(glob(pattern, recursive=True))

    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Read data from a partitioned dataset.

        Raises:
            ValueError: If no matching partition files are found.
        """
        files = self._get_files(path)
        if not files:
            raise ValueError(f"No {self.extension} files found in {path}")

        frames = [self.file_handler.read(file) for file in files]
        lf = pl.concat(frames, how="vertical")

        if offset > 0:
            lf = lf.slice(offset, length=limit)
        elif limit is not None:
            lf = lf.head(limit)

        return lf

    def schema(self, path: str) -> TableSchema:
        """Get the schema from the partitioned dataset.

        Uses the first partition file; partitions are assumed schema-consistent.
        """
        files = self._get_files(path)
        if not files:
            raise ValueError(f"No {self.extension} files found in {path}")
        return self.file_handler.schema(files[0])

    def summary(self, path: str) -> TableSummary:
        """Get aggregated summary from all partition files.

        Sums sizes and row counts; numeric extras are summed and string
        extras are deduplicated across partitions.

        Raises:
            ValueError: If no files are found or column counts disagree.
        """
        files = self._get_files(path)
        if not files:
            raise ValueError(f"No {self.extension} files found in {path}")

        file_size = 0
        num_rows = 0
        num_columns: int | None = None

        # Numeric extras are accumulated; string extras collect distinct values.
        extra_numeric: dict[str, float] = {}
        extra_strings: dict[str, set[str]] = {}

        for file in files:
            file_summary = self.file_handler.summary(file)
            file_size += file_summary.file_size
            num_rows += file_summary.num_rows

            if num_columns is None:
                num_columns = file_summary.num_columns
            elif file_summary.num_columns != num_columns:
                raise ValueError(f"Inconsistent column counts in {path}")

            if file_summary.extra:
                for key, value in file_summary.extra.items():
                    if isinstance(value, (int, float)):
                        extra_numeric[key] = extra_numeric.get(key, 0) + value
                    else:
                        extra_strings.setdefault(key, set()).add(str(value))

        extra: dict[str, str | int | float] = {"Partitions": len(files)}
        for key, value in extra_numeric.items():
            # Render whole-number sums as ints for cleaner display.
            if float(value).is_integer():
                extra[key] = int(value)
            else:
                extra[key] = value

        for key, values in extra_strings.items():
            if len(values) == 1:
                extra[key] = next(iter(values))
            else:
                extra[key] = ", ".join(sorted(values))

        return TableSummary(
            file_size=file_size,
            num_rows=num_rows,
            num_columns=num_columns or 0,
            extra=extra,
        )
|
|
96
|
+
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
import json
|
|
4
|
+
import polars as pl
|
|
5
|
+
from tab_cli.handlers.base import TableReader, TableWriter, TableSchema, TableSummary
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class JsonlHandler(TableReader, TableWriter):
    """Reader/writer for newline-delimited JSON (JSONL) files."""

    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Lazily scan the file, optionally windowed by offset/limit."""
        frame = pl.scan_ndjson(path)
        if offset > 0:
            return frame.slice(offset, length=limit)
        if limit is not None:
            return frame.head(limit)
        return frame

    def schema(self, path: str) -> TableSchema:
        """Return column names and inferred dtypes."""
        return TableSchema(columns=list(pl.scan_ndjson(path).collect_schema().items()))

    def summary(self, path: str) -> TableSummary:
        """Return file size plus row/column counts."""
        size_on_disk = os.path.getsize(path)
        frame = pl.scan_ndjson(path)
        column_count = len(frame.collect_schema())
        row_count = frame.select(pl.len()).collect().item()
        return TableSummary(
            file_size=size_on_disk,
            num_rows=row_count,
            num_columns=column_count,
        )

    def extension(self) -> str:
        return ".jsonl"

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Yield one UTF-8 encoded JSON line per row."""
        for batch in lf.collect_batches():
            for record in batch.iter_rows(named=True):
                # default=str covers non-JSON-native values (dates, etc.).
                line = json.dumps(record, default=str, ensure_ascii=False)
                yield (line + "\n").encode("utf-8")

    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Write a LazyFrame to a single JSONL file."""
        with open(path, "wb") as sink:
            for chunk in self.write(lf):
                sink.write(chunk)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Parquet file handler using Polars."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from tab_cli.handlers.base import TableReader, TableWriter, TableSchema, TableSummary
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ParquetHandler(TableReader, TableWriter):
    """Handler for Parquet files."""

    def extension(self) -> str:
        return ".parquet"

    def read(self, path: str, limit: int | None = None, offset: int = 0) -> pl.LazyFrame:
        """Read data from a Parquet file."""
        frame = pl.scan_parquet(path)
        if offset > 0:
            return frame.slice(offset, length=limit)
        if limit is not None:
            return frame.head(limit)
        return frame

    def schema(self, path: str) -> TableSchema:
        """Get the schema of the Parquet file."""
        return TableSchema(columns=list(pl.scan_parquet(path).collect_schema().items()))

    def summary(self, path: str) -> TableSummary:
        """Get summary information about the Parquet file."""
        import pyarrow.parquet as pq

        size_on_disk = os.path.getsize(path)
        frame = pl.scan_parquet(path)
        column_count = len(frame.collect_schema())
        row_count = frame.select(pl.len()).collect().item()

        # Parquet-level metadata (row groups, codecs) comes from pyarrow.
        metadata = pq.ParquetFile(path).metadata

        extra: dict[str, str | int | float] = {}

        # Collect the compression codecs used by every column chunk.
        codecs: set[str] = set()
        for group_index in range(metadata.num_row_groups):
            group = metadata.row_group(group_index)
            codecs.update(
                group.column(column_index).compression
                for column_index in range(group.num_columns)
            )
        extra["Row groups"] = metadata.num_row_groups
        if codecs:
            extra["Compression"] = ", ".join(sorted(codecs))

        return TableSummary(
            file_size=size_on_disk,
            num_rows=row_count,
            num_columns=column_count,
            extra=extra,
        )

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Write a LazyFrame to Parquet bytes."""
        buffer = BytesIO()
        lf.sink_parquet(buffer)
        yield buffer.getvalue()

    def write_single(self, lf: pl.LazyFrame, path: str) -> None:
        """Write a LazyFrame to a single Parquet file."""
        lf.sink_parquet(path)
|