parq-cli 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parq/__init__.py +10 -0
- parq/__main__.py +15 -0
- parq/cli.py +130 -0
- parq/output.py +176 -0
- parq/reader.py +164 -0
- parq_cli-0.0.1.dist-info/METADATA +226 -0
- parq_cli-0.0.1.dist-info/RECORD +10 -0
- parq_cli-0.0.1.dist-info/WHEEL +4 -0
- parq_cli-0.0.1.dist-info/entry_points.txt +2 -0
- parq_cli-0.0.1.dist-info/licenses/LICENSE +21 -0
parq/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""
|
|
2
|
+
parq-cli: A powerful command-line tool for inspecting and analyzing Apache Parquet files.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
__version__ = "0.1.0"
|
|
6
|
+
__author__ = "Jinfeng Sun"
|
|
7
|
+
|
|
8
|
+
# {{CHENGQI:
|
|
9
|
+
# Action: Created; Timestamp: 2025-10-14 16:12:00 +08:00; Reason: Package initialization file; Principle_Applied: KISS
|
|
10
|
+
# }}
|
parq/__main__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Entry point for running parq as a module.
|
|
3
|
+
Allows execution via: python -m parq
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from parq.cli import app
|
|
7
|
+
|
|
8
|
+
if __name__ == "__main__":
|
|
9
|
+
app()
|
|
10
|
+
|
|
11
|
+
# {{CHENGQI:
|
|
12
|
+
# Action: Modified; Timestamp: 2025-10-14 18:07:04 +08:00;
|
|
13
|
+
# Reason: Entry point using app() with command-based design;
|
|
14
|
+
# Principle_Applied: KISS, Typer best practices
|
|
15
|
+
# }}
|
parq/cli.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI application module.
|
|
3
|
+
Command-line interface for parq-cli tool.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
from typing_extensions import Annotated
|
|
11
|
+
|
|
12
|
+
from parq.output import OutputFormatter
|
|
13
|
+
from parq.reader import ParquetReader
|
|
14
|
+
|
|
15
|
+
app = typer.Typer(
|
|
16
|
+
name="parq",
|
|
17
|
+
help="A powerful command-line tool for inspecting Apache Parquet files ๐",
|
|
18
|
+
add_completion=False,
|
|
19
|
+
no_args_is_help=False,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
formatter = OutputFormatter()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@app.command()
|
|
26
|
+
def main(
|
|
27
|
+
file: Annotated[
|
|
28
|
+
Optional[Path],
|
|
29
|
+
typer.Argument(
|
|
30
|
+
help="Path to Parquet file",
|
|
31
|
+
),
|
|
32
|
+
] = None,
|
|
33
|
+
schema: Annotated[
|
|
34
|
+
bool, typer.Option("--schema", "-s", help="Display schema information")
|
|
35
|
+
] = False,
|
|
36
|
+
head: Annotated[Optional[int], typer.Option("--head", help="Display first N rows")] = None,
|
|
37
|
+
tail: Annotated[Optional[int], typer.Option("--tail", help="Display last N rows")] = None,
|
|
38
|
+
count: Annotated[bool, typer.Option("--count", "-c", help="Display total row count")] = False,
|
|
39
|
+
version: Annotated[
|
|
40
|
+
bool, typer.Option("--version", "-v", help="Show version information")
|
|
41
|
+
] = False,
|
|
42
|
+
) -> None:
|
|
43
|
+
"""
|
|
44
|
+
A powerful command-line tool for inspecting Apache Parquet files ๐
|
|
45
|
+
|
|
46
|
+
Examples:
|
|
47
|
+
|
|
48
|
+
# Show file metadata
|
|
49
|
+
parq data.parquet
|
|
50
|
+
|
|
51
|
+
# Show schema
|
|
52
|
+
parq data.parquet --schema
|
|
53
|
+
|
|
54
|
+
# Show first 10 rows
|
|
55
|
+
parq data.parquet --head 10
|
|
56
|
+
|
|
57
|
+
# Show last 5 rows
|
|
58
|
+
parq data.parquet --tail 5
|
|
59
|
+
|
|
60
|
+
# Show row count
|
|
61
|
+
parq data.parquet --count
|
|
62
|
+
|
|
63
|
+
# Show version
|
|
64
|
+
parq --version
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
# Handle version flag
|
|
68
|
+
if version:
|
|
69
|
+
from parq import __version__
|
|
70
|
+
|
|
71
|
+
typer.echo(f"parq-cli version {__version__}")
|
|
72
|
+
return
|
|
73
|
+
|
|
74
|
+
# File is required if not showing version
|
|
75
|
+
if file is None:
|
|
76
|
+
typer.echo("Error: Missing argument 'FILE'.")
|
|
77
|
+
typer.echo("Try 'parq --help' for help.")
|
|
78
|
+
raise typer.Exit(code=1)
|
|
79
|
+
|
|
80
|
+
try:
|
|
81
|
+
reader = ParquetReader(str(file))
|
|
82
|
+
|
|
83
|
+
# If no options specified, show metadata
|
|
84
|
+
if not any([schema, head is not None, tail is not None, count]):
|
|
85
|
+
metadata = reader.get_metadata_dict()
|
|
86
|
+
formatter.print_metadata(metadata)
|
|
87
|
+
return
|
|
88
|
+
|
|
89
|
+
# Show schema
|
|
90
|
+
if schema:
|
|
91
|
+
schema_info = reader.get_schema_info()
|
|
92
|
+
formatter.print_schema(schema_info)
|
|
93
|
+
|
|
94
|
+
# Show head
|
|
95
|
+
if head is not None:
|
|
96
|
+
table = reader.read_head(head)
|
|
97
|
+
formatter.print_table(table, f"First {head} Rows")
|
|
98
|
+
|
|
99
|
+
# Show tail
|
|
100
|
+
if tail is not None:
|
|
101
|
+
table = reader.read_tail(tail)
|
|
102
|
+
formatter.print_table(table, f"Last {tail} Rows")
|
|
103
|
+
|
|
104
|
+
# Show count
|
|
105
|
+
if count:
|
|
106
|
+
formatter.print_count(reader.num_rows)
|
|
107
|
+
|
|
108
|
+
except FileNotFoundError as e:
|
|
109
|
+
formatter.print_error(str(e))
|
|
110
|
+
raise typer.Exit(code=1)
|
|
111
|
+
except Exception as e:
|
|
112
|
+
formatter.print_error(f"Failed to read Parquet file: {e}")
|
|
113
|
+
raise typer.Exit(code=1)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
if __name__ == "__main__":
|
|
117
|
+
app()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# {{CHENGQI:
|
|
121
|
+
# Action: Modified; Timestamp: 2025-10-14 18:07:04 +08:00;
|
|
122
|
+
# Reason: Fixed CLI options parsing by using @app.command() instead of @app.callback();
|
|
123
|
+
# Principle_Applied: KISS, Typer best practices - single command app should use @app.command()
|
|
124
|
+
# }}
|
|
125
|
+
# {{START MODIFICATIONS}}
|
|
126
|
+
# - Changed @app.callback(invoke_without_command=True) to @app.command()
|
|
127
|
+
# - Removed ctx parameter and subcommand checking logic (lines 67-69)
|
|
128
|
+
# - This fixes the issue where options like --schema were incorrectly parsed as subcommands
|
|
129
|
+
# - Now 'parq file.parquet --schema' works correctly as expected
|
|
130
|
+
# {{END MODIFICATIONS}}
|
parq/output.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Output formatting module.
|
|
3
|
+
Handles pretty-printing of Parquet data and metadata.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Any, Dict, List
|
|
7
|
+
|
|
8
|
+
import pyarrow as pa
|
|
9
|
+
from rich import box
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.panel import Panel
|
|
12
|
+
from rich.table import Table
|
|
13
|
+
|
|
14
|
+
console = Console()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class OutputFormatter:
|
|
18
|
+
"""Formatter for displaying Parquet data and metadata."""
|
|
19
|
+
|
|
20
|
+
@staticmethod
|
|
21
|
+
def _format_file_size(size_bytes: int) -> str:
|
|
22
|
+
"""
|
|
23
|
+
Format file size in human-readable format.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
size_bytes: Size in bytes
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
Formatted string like "1.23 MB"
|
|
30
|
+
"""
|
|
31
|
+
if size_bytes < 1024:
|
|
32
|
+
return f"{size_bytes} B"
|
|
33
|
+
elif size_bytes < 1024 * 1024:
|
|
34
|
+
return f"{size_bytes / 1024:.2f} KB"
|
|
35
|
+
elif size_bytes < 1024 * 1024 * 1024:
|
|
36
|
+
return f"{size_bytes / (1024 * 1024):.2f} MB"
|
|
37
|
+
else:
|
|
38
|
+
return f"{size_bytes / (1024 * 1024 * 1024):.2f} GB"
|
|
39
|
+
|
|
40
|
+
@staticmethod
|
|
41
|
+
def print_metadata(metadata_dict: Dict[str, Any]) -> None:
|
|
42
|
+
"""
|
|
43
|
+
Print file metadata in a formatted panel.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
metadata_dict: Dictionary containing metadata
|
|
47
|
+
"""
|
|
48
|
+
# Special handling for specific fields
|
|
49
|
+
content_lines = []
|
|
50
|
+
for key, value in metadata_dict.items():
|
|
51
|
+
if key == "num_columns":
|
|
52
|
+
content_lines.append(
|
|
53
|
+
f"[cyan]{key}:[/cyan] [yellow]{value}[/yellow] [dim](logical)[/dim]"
|
|
54
|
+
)
|
|
55
|
+
elif key == "num_physical_columns":
|
|
56
|
+
content_lines.append(
|
|
57
|
+
f"[cyan]{key}:[/cyan] [yellow]{value}[/yellow] [dim](storage)[/dim]"
|
|
58
|
+
)
|
|
59
|
+
elif key == "file_size":
|
|
60
|
+
# Format file size in human-readable format
|
|
61
|
+
formatted_size = OutputFormatter._format_file_size(value)
|
|
62
|
+
content_lines.append(f"[cyan]{key}:[/cyan] [yellow]{formatted_size}[/yellow]")
|
|
63
|
+
else:
|
|
64
|
+
content_lines.append(f"[cyan]{key}:[/cyan] [yellow]{value}[/yellow]")
|
|
65
|
+
|
|
66
|
+
content = "\n".join(content_lines)
|
|
67
|
+
|
|
68
|
+
panel = Panel(
|
|
69
|
+
content,
|
|
70
|
+
title="[bold green]๐ Parquet File Metadata[/bold green]",
|
|
71
|
+
border_style="green",
|
|
72
|
+
box=box.ROUNDED,
|
|
73
|
+
)
|
|
74
|
+
console.print(panel)
|
|
75
|
+
|
|
76
|
+
@staticmethod
|
|
77
|
+
def print_schema(schema_info: List[Dict[str, Any]]) -> None:
|
|
78
|
+
"""
|
|
79
|
+
Print schema information as a table.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
schema_info: List of column information dictionaries
|
|
83
|
+
"""
|
|
84
|
+
table = Table(
|
|
85
|
+
title="[bold blue]๐ Schema Information[/bold blue]",
|
|
86
|
+
box=box.ROUNDED,
|
|
87
|
+
show_header=True,
|
|
88
|
+
header_style="bold magenta",
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
table.add_column("Column Name", style="cyan", no_wrap=True)
|
|
92
|
+
table.add_column("Data Type", style="green")
|
|
93
|
+
table.add_column("Nullable", style="yellow")
|
|
94
|
+
|
|
95
|
+
for col in schema_info:
|
|
96
|
+
table.add_row(col["name"], col["type"], "โ" if col["nullable"] else "โ")
|
|
97
|
+
|
|
98
|
+
console.print(table)
|
|
99
|
+
|
|
100
|
+
@staticmethod
|
|
101
|
+
def print_table(arrow_table: pa.Table, title: str = "Data Preview") -> None:
|
|
102
|
+
"""
|
|
103
|
+
Print PyArrow table as a Rich table.
|
|
104
|
+
|
|
105
|
+
Args:
|
|
106
|
+
arrow_table: PyArrow table to display
|
|
107
|
+
title: Title for the table
|
|
108
|
+
"""
|
|
109
|
+
# Convert to pandas for easier display
|
|
110
|
+
df = arrow_table.to_pandas()
|
|
111
|
+
|
|
112
|
+
table = Table(
|
|
113
|
+
title=f"[bold blue]๐ {title}[/bold blue]",
|
|
114
|
+
box=box.ROUNDED,
|
|
115
|
+
show_header=True,
|
|
116
|
+
header_style="bold magenta",
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# Add columns
|
|
120
|
+
for col in df.columns:
|
|
121
|
+
table.add_column(str(col), style="cyan")
|
|
122
|
+
|
|
123
|
+
# Add rows
|
|
124
|
+
for _, row in df.iterrows():
|
|
125
|
+
table.add_row(*[str(val) for val in row])
|
|
126
|
+
|
|
127
|
+
console.print(table)
|
|
128
|
+
|
|
129
|
+
@staticmethod
|
|
130
|
+
def print_count(count: int) -> None:
|
|
131
|
+
"""
|
|
132
|
+
Print row count.
|
|
133
|
+
|
|
134
|
+
Args:
|
|
135
|
+
count: Number of rows
|
|
136
|
+
"""
|
|
137
|
+
panel = Panel(
|
|
138
|
+
f"[bold yellow]{count:,}[/bold yellow] rows",
|
|
139
|
+
title="[bold green]๐ Total Rows[/bold green]",
|
|
140
|
+
border_style="green",
|
|
141
|
+
box=box.ROUNDED,
|
|
142
|
+
)
|
|
143
|
+
console.print(panel)
|
|
144
|
+
|
|
145
|
+
@staticmethod
|
|
146
|
+
def print_error(message: str) -> None:
|
|
147
|
+
"""
|
|
148
|
+
Print error message.
|
|
149
|
+
|
|
150
|
+
Args:
|
|
151
|
+
message: Error message to display
|
|
152
|
+
"""
|
|
153
|
+
console.print(f"[bold red]โ Error:[/bold red] {message}")
|
|
154
|
+
|
|
155
|
+
@staticmethod
|
|
156
|
+
def print_success(message: str) -> None:
|
|
157
|
+
"""
|
|
158
|
+
Print success message.
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
message: Success message to display
|
|
162
|
+
"""
|
|
163
|
+
console.print(f"[bold green]โ[/bold green] {message}")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# {{CHENGQI:
|
|
167
|
+
# Action: Modified; Timestamp: 2025-10-14 HH:MM:SS +08:00;
|
|
168
|
+
# Reason: Enhanced metadata display to show both logical and physical column counts;
|
|
169
|
+
# Principle_Applied: User-centric design - clear distinction between logical and physical columns
|
|
170
|
+
# }}
|
|
171
|
+
# {{START MODIFICATIONS}}
|
|
172
|
+
# - Enhanced print_metadata to distinguish logical vs physical columns
|
|
173
|
+
# - Added visual indicators: (logical) for num_columns, (storage) for num_physical_columns
|
|
174
|
+
# - Added informative note when physical columns differ from logical columns
|
|
175
|
+
# - Helps users understand nested structure impact on column count
|
|
176
|
+
# {{END MODIFICATIONS}}
|
parq/reader.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Parquet file reader module.
|
|
3
|
+
Provides functionality to read and inspect Parquet files.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
import pyarrow.parquet as pq
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ParquetReader:
|
|
14
|
+
"""Parquet file reader with metadata inspection capabilities."""
|
|
15
|
+
|
|
16
|
+
def __init__(self, file_path: str):
|
|
17
|
+
"""
|
|
18
|
+
Initialize ParquetReader with a file path.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
file_path: Path to the Parquet file
|
|
22
|
+
"""
|
|
23
|
+
self.file_path = Path(file_path)
|
|
24
|
+
if not self.file_path.exists():
|
|
25
|
+
raise FileNotFoundError(f"File not found: {file_path}")
|
|
26
|
+
|
|
27
|
+
self._parquet_file = pq.ParquetFile(self.file_path)
|
|
28
|
+
|
|
29
|
+
@property
|
|
30
|
+
def metadata(self) -> pq.FileMetaData:
|
|
31
|
+
"""Get file metadata."""
|
|
32
|
+
return self._parquet_file.metadata
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def schema(self) -> pa.Schema:
|
|
36
|
+
"""Get file schema."""
|
|
37
|
+
return self._parquet_file.schema_arrow
|
|
38
|
+
|
|
39
|
+
@property
|
|
40
|
+
def num_rows(self) -> int:
|
|
41
|
+
"""Get total number of rows."""
|
|
42
|
+
return self.metadata.num_rows
|
|
43
|
+
|
|
44
|
+
@property
|
|
45
|
+
def num_columns(self) -> int:
|
|
46
|
+
"""Get total number of columns (logical schema columns)."""
|
|
47
|
+
return len(self.schema)
|
|
48
|
+
|
|
49
|
+
@property
|
|
50
|
+
def num_physical_columns(self) -> int:
|
|
51
|
+
"""Get total number of physical columns (from metadata)."""
|
|
52
|
+
return self.metadata.num_columns
|
|
53
|
+
|
|
54
|
+
@property
|
|
55
|
+
def num_row_groups(self) -> int:
|
|
56
|
+
"""Get number of row groups."""
|
|
57
|
+
return self.metadata.num_row_groups
|
|
58
|
+
|
|
59
|
+
def get_metadata_dict(self) -> dict:
|
|
60
|
+
"""
|
|
61
|
+
Get metadata as a dictionary.
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
Dictionary containing file metadata
|
|
65
|
+
"""
|
|
66
|
+
metadata_dict = {
|
|
67
|
+
"file_path": str(self.file_path),
|
|
68
|
+
"num_rows": self.num_rows,
|
|
69
|
+
"num_columns": self.num_columns,
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
# Add physical columns right after logical columns if different
|
|
73
|
+
if self.num_physical_columns != self.num_columns:
|
|
74
|
+
metadata_dict["num_physical_columns"] = self.num_physical_columns
|
|
75
|
+
|
|
76
|
+
# Add file size
|
|
77
|
+
file_size = self.file_path.stat().st_size
|
|
78
|
+
metadata_dict["file_size"] = file_size
|
|
79
|
+
|
|
80
|
+
# Add compression type (from first row group, first column)
|
|
81
|
+
if self.num_row_groups > 0:
|
|
82
|
+
compression = self.metadata.row_group(0).column(0).compression
|
|
83
|
+
metadata_dict["compression_types"] = compression
|
|
84
|
+
|
|
85
|
+
# Add remaining metadata
|
|
86
|
+
metadata_dict.update(
|
|
87
|
+
{
|
|
88
|
+
"num_row_groups": self.num_row_groups,
|
|
89
|
+
"format_version": self.metadata.format_version,
|
|
90
|
+
"serialized_size": self.metadata.serialized_size,
|
|
91
|
+
"created_by": self.metadata.created_by,
|
|
92
|
+
}
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
return metadata_dict
|
|
96
|
+
|
|
97
|
+
def get_schema_info(self) -> List[dict]:
|
|
98
|
+
"""
|
|
99
|
+
Get schema information as a list of column details.
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
List of dictionaries with column information
|
|
103
|
+
"""
|
|
104
|
+
schema_info = []
|
|
105
|
+
for field in self.schema:
|
|
106
|
+
schema_info.append(
|
|
107
|
+
{
|
|
108
|
+
"name": field.name,
|
|
109
|
+
"type": str(field.type),
|
|
110
|
+
"nullable": field.nullable,
|
|
111
|
+
}
|
|
112
|
+
)
|
|
113
|
+
return schema_info
|
|
114
|
+
|
|
115
|
+
def read_head(self, n: int = 5) -> pa.Table:
|
|
116
|
+
"""
|
|
117
|
+
Read first n rows.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
n: Number of rows to read
|
|
121
|
+
|
|
122
|
+
Returns:
|
|
123
|
+
PyArrow table with first n rows
|
|
124
|
+
"""
|
|
125
|
+
table = self._parquet_file.read()
|
|
126
|
+
return table.slice(0, min(n, self.num_rows))
|
|
127
|
+
|
|
128
|
+
def read_tail(self, n: int = 5) -> pa.Table:
|
|
129
|
+
"""
|
|
130
|
+
Read last n rows.
|
|
131
|
+
|
|
132
|
+
Args:
|
|
133
|
+
n: Number of rows to read
|
|
134
|
+
|
|
135
|
+
Returns:
|
|
136
|
+
PyArrow table with last n rows
|
|
137
|
+
"""
|
|
138
|
+
table = self._parquet_file.read()
|
|
139
|
+
start = max(0, self.num_rows - n)
|
|
140
|
+
return table.slice(start, n)
|
|
141
|
+
|
|
142
|
+
def read_columns(self, columns: Optional[List[str]] = None) -> pa.Table:
|
|
143
|
+
"""
|
|
144
|
+
Read specific columns.
|
|
145
|
+
|
|
146
|
+
Args:
|
|
147
|
+
columns: List of column names to read. If None, read all columns.
|
|
148
|
+
|
|
149
|
+
Returns:
|
|
150
|
+
PyArrow table with selected columns
|
|
151
|
+
"""
|
|
152
|
+
return self._parquet_file.read(columns=columns)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# {{CHENGQI:
|
|
156
|
+
# Action: Modified; Timestamp: 2025-10-14 HH:MM:SS +08:00;
|
|
157
|
+
# Reason: Fixed num_columns to use schema length instead of metadata for accurate logical column count;
|
|
158
|
+
# Principle_Applied: User-centric design - show logical columns that users actually see
|
|
159
|
+
# }}
|
|
160
|
+
# {{START MODIFICATIONS}}
|
|
161
|
+
# - Changed num_columns from metadata.num_columns to len(self.schema)
|
|
162
|
+
# - Reason: metadata.num_columns may include physical columns from nested structures
|
|
163
|
+
# - Schema length represents logical columns that users expect to see
|
|
164
|
+
# {{END MODIFICATIONS}}
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: parq-cli
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A powerful command-line tool for inspecting and analyzing Apache Parquet files
|
|
5
|
+
Project-URL: Homepage, https://github.com/Tendo33/parq-cli
|
|
6
|
+
Project-URL: Repository, https://github.com/Tendo33/parq-cli
|
|
7
|
+
Project-URL: Issues, https://github.com/Tendo33/parq-cli/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/Tendo33/parq-cli#readme
|
|
9
|
+
Author-email: SimonSun <sjf19981112@gmail.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: analytics,apache-parquet,cli,data,data-tools,parquet
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Requires-Dist: pandas>=2.0.0
|
|
24
|
+
Requires-Dist: pyarrow>=18.0.0
|
|
25
|
+
Requires-Dist: rich>=13.0.0
|
|
26
|
+
Requires-Dist: typer>=0.15.0
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: black>=24.0.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.7.0; extra == 'dev'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# parq-cli
|
|
35
|
+
|
|
36
|
+
[](https://www.python.org/downloads/)
|
|
37
|
+
[](LICENSE)
|
|
38
|
+
|
|
39
|
+
ไธไธชๅผบๅคง็ Apache Parquet ๆไปถๅฝไปค่กๅทฅๅ
ท ๐
|
|
40
|
+
|
|
41
|
+
## โจ ็นๆง
|
|
42
|
+
|
|
43
|
+
- ๐ **ๅ
ๆฐๆฎๆฅ็**: ๅฟซ้ๆฅ็ Parquet ๆไปถ็ๅ
ๆฐๆฎไฟกๆฏ
|
|
44
|
+
- ๐ **Schema ๅฑ็คบ**: ็พ่งๅฐๅฑ็คบๆไปถ็ๅ็ปๆๅๆฐๆฎ็ฑปๅ
|
|
45
|
+
- ๐ **ๆฐๆฎ้ข่ง**: ๆฏๆๆฅ็ๆไปถ็ๅ N ่กๆๅ N ่ก
|
|
46
|
+
- ๐ข **่กๆฐ็ป่ฎก**: ๅฟซ้่ทๅๆไปถ็ๆป่กๆฐ
|
|
47
|
+
- ๐จ **็พ่ง่พๅบ**: ไฝฟ็จ Rich ๅบๆไพๅฝฉ่ฒใๆ ผๅผๅ็็ป็ซฏ่พๅบ
|
|
48
|
+
|
|
49
|
+
## ๐ฆ ๅฎ่ฃ
|
|
50
|
+
|
|
51
|
+
### ไปๆบ็ ๅฎ่ฃ
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
git clone https://github.com/Tendo33/parq-cli.git
|
|
55
|
+
cd parq-cli
|
|
56
|
+
pip install -e .
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### ไฝฟ็จ pip ๅฎ่ฃ
๏ผๅณๅฐๆฏๆ๏ผ
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install parq-cli
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## ๐ ๅฟซ้ๅผๅง
|
|
66
|
+
|
|
67
|
+
### ๅบๆฌ็จๆณ
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# ๆฅ็ๆไปถๅ
ๆฐๆฎ
|
|
71
|
+
parq data.parquet
|
|
72
|
+
|
|
73
|
+
# ๆพ็คบ schema ไฟกๆฏ
|
|
74
|
+
parq data.parquet --schema
|
|
75
|
+
|
|
76
|
+
# ๆพ็คบๅ 10 ่ก
|
|
77
|
+
parq data.parquet --head 10
|
|
78
|
+
|
|
79
|
+
# ๆพ็คบๅ 5 ่ก
|
|
80
|
+
parq data.parquet --tail 5
|
|
81
|
+
|
|
82
|
+
# ๆพ็คบๆป่กๆฐ
|
|
83
|
+
parq data.parquet --count
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### ็ปๅไฝฟ็จ
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# ๅๆถๆพ็คบ schema ๅ่กๆฐ
|
|
90
|
+
parq data.parquet --schema --count
|
|
91
|
+
|
|
92
|
+
# ๆพ็คบๅ 5 ่กๅ schema
|
|
93
|
+
parq data.parquet --head 5 --schema
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## ๐ ๅฝไปคๅ่
|
|
97
|
+
|
|
98
|
+
### ไธปๅฝไปค
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
parq [OPTIONS] FILE
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**ๅๆฐ:**
|
|
105
|
+
- `FILE`: Parquet ๆไปถ่ทฏๅพ๏ผๅฟ
้๏ผ
|
|
106
|
+
|
|
107
|
+
**้้กน:**
|
|
108
|
+
- `--schema, -s`: ๆพ็คบ schema ไฟกๆฏ
|
|
109
|
+
- `--head N`: ๆพ็คบๅ N ่ก
|
|
110
|
+
- `--tail N`: ๆพ็คบๅ N ่ก
|
|
111
|
+
- `--count, -c`: ๆพ็คบๆป่กๆฐ
|
|
112
|
+
- `--help`: ๆพ็คบๅธฎๅฉไฟกๆฏ
|
|
113
|
+
|
|
114
|
+
### ็ๆฌไฟกๆฏ
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
parq --version
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## ๐จ ่พๅบ็คบไพ
|
|
121
|
+
|
|
122
|
+
### ๅ
ๆฐๆฎๅฑ็คบ
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
โญโโโโโโโโโโโโโโโโโโโโโโโ ๐ Parquet File Metadata โโโโโโโโโโโโโโโโโโโโโโโโฎ
|
|
126
|
+
โ file_path: /path/to/data.parquet โ
|
|
127
|
+
โ num_rows: 1000 โ
|
|
128
|
+
โ num_columns: 5 โ
|
|
129
|
+
โ num_row_groups: 1 โ
|
|
130
|
+
โ format_version: 2.6 โ
|
|
131
|
+
โ serialized_size: 2048 โ
|
|
132
|
+
โ created_by: parquet-cpp-arrow version 18.0.0 โ
|
|
133
|
+
โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Schema ๅฑ็คบ
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
๐ Schema Information
|
|
140
|
+
โโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโ
|
|
141
|
+
โ Column Name โ Data Type โ Nullable โ
|
|
142
|
+
โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ
|
|
143
|
+
โ id โ int64 โ โ โ
|
|
144
|
+
โ name โ string โ โ โ
|
|
145
|
+
โ age โ int64 โ โ โ
|
|
146
|
+
โ city โ string โ โ โ
|
|
147
|
+
โ salary โ double โ โ โ
|
|
148
|
+
โโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโ
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## ๐ ๏ธ ๆๆฏๆ
|
|
152
|
+
|
|
153
|
+
- **[PyArrow](https://arrow.apache.org/docs/python/)**: ้ซๆง่ฝ็ Parquet ่ฏปๅๅผๆ
|
|
154
|
+
- **[Typer](https://typer.tiangolo.com/)**: ็ฐไปฃๅ็ CLI ๆกๆถ
|
|
155
|
+
- **[Rich](https://rich.readthedocs.io/)**: ็พ่ง็็ป็ซฏ่พๅบ
|
|
156
|
+
|
|
157
|
+
## ๐งช ๅผๅ
|
|
158
|
+
|
|
159
|
+
### ๅฎ่ฃ
ๅผๅไพ่ต
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
pip install -e ".[dev]"
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### ่ฟ่กๆต่ฏ
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
pytest
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### ่ฟ่กๆต่ฏ๏ผๅธฆ่ฆ็็๏ผ
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
pytest --cov=parq --cov-report=html
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### ไปฃ็ ๆ ผๅผๅ
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
# ไฝฟ็จ Black
|
|
181
|
+
black parq tests
|
|
182
|
+
|
|
183
|
+
# ไฝฟ็จ Ruff ๆฃๆฅ
|
|
184
|
+
ruff check parq tests
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## ๐บ๏ธ ่ทฏ็บฟๅพ
|
|
188
|
+
|
|
189
|
+
- [x] ๅบ็กๅ
ๆฐๆฎๆฅ็
|
|
190
|
+
- [x] Schema ๅฑ็คบ
|
|
191
|
+
- [x] ๆฐๆฎ้ข่ง๏ผhead/tail๏ผ
|
|
192
|
+
- [x] ่กๆฐ็ป่ฎก
|
|
193
|
+
- [ ] SQL ๆฅ่ฏขๆฏๆ
|
|
194
|
+
- [ ] ๆฐๆฎ็ป่ฎกๅๆ
|
|
195
|
+
- [ ] ๆ ผๅผ่ฝฌๆข๏ผCSV, JSON, Excel๏ผ
|
|
196
|
+
- [ ] ๆไปถๅฏนๆฏ
|
|
197
|
+
- [ ] ไบๅญๅจๆฏๆ๏ผS3, GCS, Azure๏ผ
|
|
198
|
+
|
|
199
|
+
## ๐ค ่ดก็ฎ
|
|
200
|
+
|
|
201
|
+
ๆฌข่ฟๆไบค Issue ๅ Pull Request๏ผ
|
|
202
|
+
|
|
203
|
+
1. Fork ๆฌไปๅบ
|
|
204
|
+
2. ๅๅปบ็นๆงๅๆฏ (`git checkout -b feature/AmazingFeature`)
|
|
205
|
+
3. ๆไบคๆดๆน (`git commit -m 'Add some AmazingFeature'`)
|
|
206
|
+
4. ๆจ้ๅฐๅๆฏ (`git push origin feature/AmazingFeature`)
|
|
207
|
+
5. ๅผๅฏ Pull Request
|
|
208
|
+
|
|
209
|
+
## ๐ ่ฎธๅฏ่ฏ
|
|
210
|
+
|
|
211
|
+
ๆฌ้กน็ฎ้็จ MIT ่ฎธๅฏ่ฏ - ่ฏฆ่ง [LICENSE](LICENSE) ๆไปถ
|
|
212
|
+
|
|
213
|
+
## ๐ ่ด่ฐข
|
|
214
|
+
|
|
215
|
+
- ็ตๆๆฅๆบไบ [parquet-cli](https://github.com/chhantyal/parquet-cli)
|
|
216
|
+
- ๆ่ฐข Apache Arrow ๅข้ๆไพๅผบๅคง็ Parquet ๆฏๆ
|
|
217
|
+
- ๆ่ฐข Rich ๅบไธบ็ป็ซฏ่พๅบๅขๆทป่ฒๅฝฉ
|
|
218
|
+
|
|
219
|
+
## ๐ฎ ่็ณปๆนๅผ
|
|
220
|
+
|
|
221
|
+
- ไฝ่
: Jinfeng Sun
|
|
222
|
+
- ้กน็ฎๅฐๅ: https://github.com/Tendo33/parq-cli
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
**โญ ๅฆๆ่ฟไธช้กน็ฎๅฏนไฝ ๆๅธฎๅฉ๏ผ่ฏท็ปไธช Star๏ผ**
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
parq/__init__.py,sha256=gwi-2yJ5Vi64or1d2PUmaNh5kqK3_Zk0NqNTR5Ehyos,286
|
|
2
|
+
parq/__main__.py,sha256=WGh0rg8ZQI3jEKQDY07_v9B4ZHhvfyF0bLQi-oQnpxY,338
|
|
3
|
+
parq/cli.py,sha256=cNeWoO1vuDGhIQxFjB6EyWtVZngZKa8uxKSXIUw-j5w,3694
|
|
4
|
+
parq/output.py,sha256=irl0ckCBWy15hJqy4QxGBLrRAdIWT-6AxFxjAoJac24,5415
|
|
5
|
+
parq/reader.py,sha256=7S0D4RVDz0ElB5XugbfI4AIAI--MFsg8JqEUb4zkJVI,4859
|
|
6
|
+
parq_cli-0.0.1.dist-info/METADATA,sha256=wBWuZf6ft07TfOTlp5aFwJcMeWGYCbLpzKQtXzZ9qug,6315
|
|
7
|
+
parq_cli-0.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
8
|
+
parq_cli-0.0.1.dist-info/entry_points.txt,sha256=reTENlFOrUkuoCs5VGNvxD1FWvnpyIL4CP2GucIY0Hw,38
|
|
9
|
+
parq_cli-0.0.1.dist-info/licenses/LICENSE,sha256=RCmRAGCROPvNK8H6jyCxN2CH3ka_-lb2_m8LpwjJl3w,1068
|
|
10
|
+
parq_cli-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Jinfeng Sun
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|