tab-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tab_cli/__init__.py +3 -0
- tab_cli/cli.py +171 -0
- tab_cli/config.py +14 -0
- tab_cli/formats/__init__.py +15 -0
- tab_cli/formats/avro.py +47 -0
- tab_cli/formats/base.py +63 -0
- tab_cli/formats/csv.py +45 -0
- tab_cli/formats/jsonl.py +41 -0
- tab_cli/formats/parquet.py +57 -0
- tab_cli/handlers/__init__.py +87 -0
- tab_cli/handlers/base.py +259 -0
- tab_cli/handlers/cli_table.py +55 -0
- tab_cli/storage/__init__.py +83 -0
- tab_cli/storage/aws.py +223 -0
- tab_cli/storage/az.py +249 -0
- tab_cli/storage/base.py +36 -0
- tab_cli/storage/fsspec.py +60 -0
- tab_cli/storage/gcloud.py +215 -0
- tab_cli/storage/local.py +25 -0
- tab_cli/style.py +4 -0
- tab_cli/url_parser.py +97 -0
- tab_cli-0.1.1.dist-info/METADATA +27 -0
- tab_cli-0.1.1.dist-info/RECORD +26 -0
- tab_cli-0.1.1.dist-info/WHEEL +4 -0
- tab_cli-0.1.1.dist-info/entry_points.txt +2 -0
- tab_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
tab_cli/__init__.py
ADDED
tab_cli/cli.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""Main CLI entry point using Typer."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Annotated, Optional
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
7
|
+
import polars as pl
|
|
8
|
+
import typer
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.logging import RichHandler
|
|
11
|
+
|
|
12
|
+
from tab_cli import config
|
|
13
|
+
from tab_cli.handlers import TableWriter, infer_reader, infer_writer
|
|
14
|
+
|
|
15
|
+
# Top-level Typer application; invoking the CLI with no arguments prints help.
app = typer.Typer(
    help="A CLI tool for viewing and manipulating tabular data.",
    no_args_is_help=True,
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.callback()
def main_callback(
    az_url_authority_is_account: Annotated[
        bool,
        typer.Option(
            "--az-url-authority-is-account",
            help="Interpret az:// URL authority as storage account name instead of container name",
        ),
    ] = False,
    log_level: Annotated[
        str,
        typer.Option("--log-level", help="Log level from {DEBUG, INFO, WARNING, ERROR, CRITICAL}"),
    ] = "INFO",
) -> None:
    """Global options for tab_cli CLI."""
    # Record the az:// URL interpretation choice on the shared config object.
    config.config.az_url_authority_is_account = az_url_authority_is_account

    # Replace loguru's default sink with a Rich-backed one at the requested level.
    logger.remove()
    rich_sink = RichHandler(
        rich_tracebacks=True,
        tracebacks_show_locals=True,
        markup=True,
    )
    logger.add(rich_sink, format="{message}", level=log_level.upper())
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _output(
    lf: pl.LazyFrame,
    limit: int | None,
    skip: int,
    output: str | None,
) -> None:
    """Render *lf* to stdout, honoring row skipping and limiting.

    When neither --limit nor an output format is given, the default CLI
    table view is used with at most 20 rows and a truncation indicator.
    """
    # Default interactive view: no explicit limit, no explicit output format.
    interactive_default = limit is None and output is None
    effective_limit = 20 if interactive_default else limit

    truncated = False
    if interactive_default:
        assert effective_limit is not None
        # Fetch one extra row so we can tell whether the view was cut off.
        probe = lf.slice(skip, length=effective_limit + 1).collect()
        if len(probe) > effective_limit:
            truncated = True
            probe = probe.head(effective_limit)
        lf = probe.lazy()
    elif skip > 0 or effective_limit is not None:
        lf = lf.slice(skip, length=effective_limit)

    writer = infer_writer(output, truncated=interactive_default and truncated)

    stdout = sys.stdout.buffer
    for chunk in writer.write(lf):
        stdout.write(chunk)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@app.command()
def view(
    path: Annotated[str, typer.Argument(help="Path to the data file or directory")],
    limit: Annotated[Optional[int], typer.Option("--limit", help="Maximum number of rows to display")] = None,
    skip: Annotated[int, typer.Option("--skip", help="Number of rows to skip")] = 0,
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
    output: Annotated[Optional[str], typer.Option("-o", "--output-format", help="Output format")] = None,
) -> None:
    """View tabular data from a file."""
    frame = infer_reader(path, format=input).read(path)
    _output(frame, limit=limit, skip=skip, output=output)
|
|
89
|
+
|
|
90
|
+
@app.command()
def schema(
    path: Annotated[str, typer.Argument(help="Path to the data file or directory")],
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
) -> None:
    """Display the schema of a tabular data file."""
    table_schema = infer_reader(path, format=input).schema(path)
    # force_terminal keeps Rich styling even when stdout is piped.
    Console(force_terminal=True).print(table_schema)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@app.command()
def sql(
    query: Annotated[str, typer.Argument(help="SQL query to execute (table is available as 't')")],
    path: Annotated[str, typer.Argument(help="Path to the data file or directory")],
    limit: Annotated[Optional[int], typer.Option("--limit", help="Maximum number of rows to display")] = None,
    skip: Annotated[int, typer.Option("--skip", help="Number of rows to skip")] = 0,
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
    output: Annotated[Optional[str], typer.Option("-o", "--output-format", help="Output format")] = None,
) -> None:
    """Run a SQL query on tabular data. The table is available as 't'."""
    source = infer_reader(path, format=input).read(path)
    # eager=False keeps the result lazy so _output can slice before collecting.
    result = pl.SQLContext(t=source, eager=False).execute(query)
    _output(result, limit=limit, skip=skip, output=output)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@app.command()
def summary(
    path: Annotated[str, typer.Argument(help="Path to the data file or directory")],
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
) -> None:
    """Display summary information about a tabular data file."""
    table_summary = infer_reader(path, format=input).summary(path)
    # force_terminal keeps Rich styling even when stdout is piped.
    Console(force_terminal=True).print(table_summary)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@app.command()
def convert(
    src: Annotated[str, typer.Argument(help="Path to the source file or directory")],
    dst: Annotated[str, typer.Argument(help="Path to the destination file or directory")],
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
    output: Annotated[Optional[str], typer.Option("-o", "--output-format", help="Output format")] = None,
    num_partitions: Annotated[Optional[int], typer.Option("-n", "--num-partitions", help="Number of output partitions")] = None,
) -> None:
    """Convert tabular data from one format to another.

    The output format comes from -o if given, else -i, else it is inherited
    from the reader inferred from the source path.
    """
    reader = infer_reader(src, format=input)
    # Determine output format: use -o if specified, else inherit from input
    if output is not None:
        writer = infer_writer(format=output)
    elif input is not None:
        writer = infer_writer(format=input)
    else:
        writer = reader
    # Explicit check rather than `assert`: asserts are stripped under
    # `python -O`, which would let a non-writer slip through silently.
    if not isinstance(writer, TableWriter):
        raise typer.BadParameter(
            f"Cannot infer a writable output format for {src!r}; specify -o/--output-format"
        )
    lf = reader.read(src)
    writer.write_to_path(lf, dst, partitions=num_partitions)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@app.command()
def cat(
    paths: Annotated[list[str], typer.Argument(help="Paths to the data files or directories")],
    input: Annotated[Optional[str], typer.Option("-i", "--input-format", help="Input format")] = None,
    output: Annotated[Optional[str], typer.Option("-o", "--output-format", help="Output format")] = None,
) -> None:
    """Concatenate tabular data from multiple files."""
    # The reader (format + storage backend) is inferred from the first path
    # and reused for every input.
    reader = infer_reader(paths[0], format=input)
    frames = [reader.read(p) for p in paths]
    _output(pl.concat(frames, how="vertical"), limit=None, skip=0, output=output)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def main() -> None:
    """Console-script entry point; dispatches to the Typer app."""
    app()


if __name__ == "__main__":
    main()
|
tab_cli/config.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Global configuration for tab_cli-cli."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class Config:
    """Global configuration settings."""

    # When True, the authority component of an az:// URL is interpreted as the
    # storage account name rather than the container name (set from the CLI
    # flag --az-url-authority-is-account).
    az_url_authority_is_account: bool = False


# Global config instance
config: Config = Config()
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Format handlers for reading and writing tabular data."""
|
|
2
|
+
|
|
3
|
+
from tab_cli.formats.base import FormatHandler
|
|
4
|
+
from tab_cli.formats.avro import AvroFormat
|
|
5
|
+
from tab_cli.formats.csv import CsvFormat
|
|
6
|
+
from tab_cli.formats.jsonl import JsonlFormat
|
|
7
|
+
from tab_cli.formats.parquet import ParquetFormat
|
|
8
|
+
|
|
9
|
+
# Public API of the formats package (controls `from tab_cli.formats import *`).
__all__ = [
    "FormatHandler",
    "AvroFormat",
    "CsvFormat",
    "JsonlFormat",
    "ParquetFormat",
]
|
tab_cli/formats/avro.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Avro format handler using polars-fastavro."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from typing import BinaryIO
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
import polars_fastavro
|
|
9
|
+
|
|
10
|
+
from tab_cli.formats.base import FormatHandler
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AvroFormat(FormatHandler):
    """Handler for Avro files."""

    def extension(self) -> str:
        return ".avro"

    def supports_glob(self) -> bool:
        # polars_fastavro doesn't support glob patterns
        return False

    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        # polars_fastavro doesn't support storage_options, so cloud URIs
        # need to be accessed through fsspec first
        return polars_fastavro.scan_avro(url)

    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        return polars_fastavro.read_avro(stream)

    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        # storage_options is accepted for interface parity but not forwarded.
        avro_schema = polars_fastavro.scan_avro(url).collect_schema()
        return list(avro_schema.items())

    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        # storage_options is accepted for interface parity but not forwarded.
        counted = polars_fastavro.scan_avro(url).select(pl.len()).collect()
        return counted.item()

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        # No streaming Avro writer here: materialize and emit a single chunk.
        buffer = BytesIO()
        polars_fastavro.write_avro(lf.collect(), buffer)
        yield buffer.getvalue()

    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        polars_fastavro.write_avro(lf.collect(), path)
|
tab_cli/formats/base.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Base format handler interface."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from typing import BinaryIO
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FormatHandler(ABC):
    """Abstract interface for reading and writing one tabular file format."""

    @abstractmethod
    def extension(self) -> str:
        """Return the file extension for this format (e.g., '.parquet')."""
        ...

    def supports_glob(self) -> bool:
        """Report whether scan() accepts glob patterns.

        Formats with native Polars support (Parquet, CSV, JSONL) can scan
        directories directly. Others (Avro) need manual file iteration.
        """
        return False

    @abstractmethod
    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        """Lazily scan from a URL (local path or cloud URL).

        Args:
            url: The URL to scan from.
            storage_options: Optional storage options for cloud access.
        """
        ...

    @abstractmethod
    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        """Eagerly read an entire byte stream into a DataFrame."""
        ...

    @abstractmethod
    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        """Return the schema as (column name, dtype) pairs."""
        ...

    @abstractmethod
    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        """Return the number of rows in the file."""
        ...

    def extra_summary(self, url: str) -> dict[str, str | int | float] | None:
        """Return format-specific summary metadata, or None when there is none."""
        return None

    @abstractmethod
    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        """Serialize a LazyFrame into a stream of byte chunks."""
        ...

    @abstractmethod
    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        """Serialize a LazyFrame to a single output file."""
        ...
|
tab_cli/formats/csv.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""CSV/TSV format handler."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from typing import BinaryIO
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from tab_cli.formats.base import FormatHandler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CsvFormat(FormatHandler):
    """Handler for CSV/TSV files."""

    def __init__(self, separator: str = ","):
        # "," produces .csv semantics; "\t" produces .tsv.
        self.separator = separator

    def extension(self) -> str:
        if self.separator == ",":
            return ".csv"
        return ".tsv"

    def supports_glob(self) -> bool:
        return True

    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        return pl.scan_csv(url, separator=self.separator, storage_options=storage_options)

    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        return pl.read_csv(stream, separator=self.separator)

    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        return list(self.scan(url, storage_options).collect_schema().items())

    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        counted = self.scan(url, storage_options).select(pl.len()).collect()
        return counted.item()

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        # Emit the header only with the first batch.
        include_header = True
        for batch in lf.collect_batches():
            buffer = BytesIO()
            batch.write_csv(buffer, separator=self.separator, include_header=include_header)
            include_header = False
            yield buffer.getvalue()

    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        lf.sink_csv(path, separator=self.separator)
|
tab_cli/formats/jsonl.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""JSONL (newline-delimited JSON) format handler."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from typing import BinaryIO
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from tab_cli.formats.base import FormatHandler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class JsonlFormat(FormatHandler):
    """Handler for JSONL files."""

    def extension(self) -> str:
        return ".jsonl"

    def supports_glob(self) -> bool:
        return True

    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        return pl.scan_ndjson(url, storage_options=storage_options)

    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        return pl.read_ndjson(stream)

    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        return list(self.scan(url, storage_options).collect_schema().items())

    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        return self.scan(url, storage_options).select(pl.len()).collect().item()

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        # One JSON document per line; values json can't encode (dates, etc.)
        # fall back to str().
        for batch in lf.collect_batches():
            for record in batch.iter_rows(named=True):
                line = json.dumps(record, default=str, ensure_ascii=False)
                yield (line + "\n").encode("utf-8")

    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        # NOTE(review): open() assumes *path* is a local filesystem path —
        # confirm cloud destinations are routed elsewhere before reaching here.
        with open(path, "wb") as sink:
            sink.writelines(self.write(lf))
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Parquet format handler."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from typing import BinaryIO
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from tab_cli.formats.base import FormatHandler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ParquetFormat(FormatHandler):
    """Handler for Parquet files."""

    def extension(self) -> str:
        return ".parquet"

    def supports_glob(self) -> bool:
        return True

    def scan(self, url: str, storage_options: dict[str, str] | None = None) -> pl.LazyFrame:
        return pl.scan_parquet(url, storage_options=storage_options)

    def read_stream(self, stream: BinaryIO) -> pl.DataFrame:
        return pl.read_parquet(stream)

    def collect_schema(self, url: str, storage_options: dict[str, str] | None = None) -> list[tuple[str, pl.DataType]]:
        return list(self.scan(url, storage_options).collect_schema().items())

    def count_rows(self, url: str, storage_options: dict[str, str] | None = None) -> int:
        return self.scan(url, storage_options).select(pl.len()).collect().item()

    def extra_summary(self, url: str) -> dict[str, str | int | float] | None:
        import pyarrow.parquet as pq

        metadata = pq.ParquetFile(url).metadata

        # Gather the compression codecs used across every column chunk.
        codecs: set[str] = set()
        for rg_idx in range(metadata.num_row_groups):
            row_group = metadata.row_group(rg_idx)
            for col_idx in range(row_group.num_columns):
                codecs.add(row_group.column(col_idx).compression)

        extra: dict[str, str | int | float] = {"Row groups": metadata.num_row_groups}
        if codecs:
            extra["Compression"] = ", ".join(sorted(codecs))
        return extra

    def write(self, lf: pl.LazyFrame) -> Iterable[bytes]:
        # Parquet needs a seekable sink, so buffer in memory and emit one chunk.
        buffer = BytesIO()
        lf.sink_parquet(buffer)
        yield buffer.getvalue()

    def write_to_single_file(self, lf: pl.LazyFrame, path: str) -> None:
        lf.sink_parquet(path)
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Handler registration and inference."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from tab_cli.formats import AvroFormat, CsvFormat, JsonlFormat, ParquetFormat
|
|
6
|
+
from tab_cli.formats.base import FormatHandler
|
|
7
|
+
from tab_cli.handlers.base import FormatWriter, TableReader, TableWriter
|
|
8
|
+
from tab_cli.handlers.cli_table import CliTableFormatter
|
|
9
|
+
from tab_cli.storage import get_backend
|
|
10
|
+
|
|
11
|
+
# Format handlers, keyed by format name (which doubles as the file extension,
# lowercase). Lookups in infer_reader/infer_writer lowercase before indexing.
_FORMAT_MAP: dict[str, FormatHandler] = {
    "csv": CsvFormat(","),
    "tsv": CsvFormat("\t"),
    "parquet": ParquetFormat(),
    "jsonl": JsonlFormat(),
    "avro": AvroFormat(),
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _get_extension(path: str) -> str:
|
|
22
|
+
"""Extract file extension from a path or URL."""
|
|
23
|
+
# Handle URIs by getting the path component
|
|
24
|
+
if "://" in path:
|
|
25
|
+
path = path.split("://", 1)[1]
|
|
26
|
+
# Get extension from basename
|
|
27
|
+
basename = os.path.basename(path.rstrip("/"))
|
|
28
|
+
return os.path.splitext(basename)[1][1:].lower()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def infer_reader(path: str, format: str | None = None) -> TableReader:
    """Infer the reader for a file.

    Args:
        path: Path to the file or directory (local or cloud URL).
        format: Explicit format override. If None, inferred from extension.

    Returns:
        TableReader configured for the format and storage backend.

    Raises:
        ValueError: If the format is unknown or cannot be inferred.
    """
    backend = get_backend(path)

    if format is not None:
        fmt = _FORMAT_MAP.get(format.lower())
        if fmt is None:
            raise ValueError(f"Unknown format: {format}. Supported: {', '.join(_FORMAT_MAP)}")
        return TableReader(backend, fmt)

    # Infer format from path
    if backend.is_directory(path):
        # Use the first listed file that actually has an extension.
        for file_info in backend.list_files(path, ""):
            extension = _get_extension(file_info.url)
            if extension:
                break
        else:
            # Reached when the directory is empty OR when no listed file has
            # an extension — the message covers both cases.
            raise ValueError(
                f"Could not infer format: no files with a recognizable extension in directory: {path}"
            )
    else:
        extension = _get_extension(path)

    fmt = _FORMAT_MAP.get(extension)
    if fmt is None:
        raise ValueError(f"Unknown extension: {extension}. Supported: {', '.join(_FORMAT_MAP)}")

    return TableReader(backend, fmt)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def infer_writer(format: str | None = None, truncated: bool = False) -> TableWriter:
    """Infer the writer for a format.

    Args:
        format: Output format. If None, returns CLI table formatter.
        truncated: Whether the output is truncated (for CLI display).

    Returns:
        TableWriter for the format.

    Raises:
        ValueError: If the format name is not recognized.
    """
    if format is None:
        return CliTableFormatter(truncated=truncated)

    # Normalize once so "table-svg" matches case-insensitively, the same way
    # every _FORMAT_MAP name does.
    fmt_name = format.lower()
    if fmt_name == "table-svg":
        return CliTableFormatter(truncated=truncated, svg_capture=True)

    fmt = _FORMAT_MAP.get(fmt_name)
    if fmt is None:
        raise ValueError(f"Unknown format: {format}. Supported: {', '.join(_FORMAT_MAP)}")

    return FormatWriter(fmt)
|