parqv 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parqv/app.py +87 -50
- parqv/handlers/__init__.py +13 -0
- parqv/handlers/base_handler.py +114 -0
- parqv/handlers/json.py +450 -0
- parqv/handlers/parquet.py +640 -0
- parqv/views/metadata_view.py +11 -4
- parqv/views/schema_view.py +147 -88
- parqv-0.2.0.dist-info/METADATA +104 -0
- parqv-0.2.0.dist-info/RECORD +17 -0
- {parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/WHEEL +1 -1
- parqv/parquet_handler.py +0 -389
- parqv/views/row_group_view.py +0 -33
- parqv-0.1.0.dist-info/METADATA +0 -91
- parqv-0.1.0.dist-info/RECORD +0 -15
- {parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/entry_points.txt +0 -0
- {parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/top_level.txt +0 -0
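
The file list above shows the main change in 0.2.0: the single Parquet-only backend `parqv/parquet_handler.py` is removed and replaced by a `parqv/handlers/` package (`base_handler.py`, `parquet.py`, `json.py`), i.e. a per-format handler layer. The new modules themselves are not reproduced in this section, so the sketch below only illustrates the general shape such a base-handler split usually takes; every name in it is illustrative rather than taken from parqv 0.2.0 (the method names mirror the 0.1.0 handler API shown further down).

```python
# Illustrative sketch only — not the actual parqv 0.2.0 base_handler.py.
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, Optional

import pandas as pd


class DataHandler(ABC):
    """Hypothetical common interface a per-format handler package could expose."""

    def __init__(self, file_path: Path):
        self.file_path = file_path

    @abstractmethod
    def get_metadata_summary(self) -> Dict[str, Any]:
        """Return file-level metadata as a flat dict."""

    @abstractmethod
    def get_schema_tree_data(self) -> Optional[Dict[str, Any]]:
        """Return a nested dict describing the schema."""

    @abstractmethod
    def get_data_preview(self, num_rows: int = 50) -> Optional[pd.DataFrame]:
        """Return the first num_rows rows as a DataFrame."""
```

Under this kind of split, a `ParquetHandler` and a `JsonHandler` would each subclass the shared base, which is consistent with the `parquet.py` and `json.py` files added in the listing.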
parqv/parquet_handler.py
DELETED
@@ -1,389 +0,0 @@
-import logging
-from pathlib import Path
-from typing import Any, Dict, List, Tuple, Optional, Union
-
-import pandas as pd
-import pyarrow as pa
-import pyarrow.compute as pc
-import pyarrow.parquet as pq
-
-log = logging.getLogger(__name__)
-
-
-class ParquetHandlerError(Exception):
-    pass
-
-
-class ParquetHandler:
-
-    def __init__(self, file_path: Path):
-        self.file_path = file_path
-        self.pq_file: Optional[pq.ParquetFile] = None
-        self.schema: Optional[pa.Schema] = None
-        self.metadata: Optional[pq.FileMetaData] = None
-        try:
-            self.pq_file = pq.ParquetFile(file_path)
-            self.schema = self.pq_file.schema_arrow
-            self.metadata = self.pq_file.metadata
-        except Exception as e:
-            log.exception("Error initializing ParquetHandler")
-            raise ParquetHandlerError(f"Failed to open or read Parquet file: {e}") from e
-
-    def get_metadata_summary(self) -> Dict[str, Any]:
-        if not self.metadata or not self.schema:
-            return {"error": "Metadata or schema not available"}
-
-        created_by = self._decode_metadata_bytes(self.metadata.created_by) or "N/A"
-        summary = {
-            "File Path": str(self.file_path.resolve()),
-            "Size": f"{self.file_path.stat().st_size:,} bytes",
-            "Total Rows": f"{self.metadata.num_rows:,}",
-            "Row Groups": self.metadata.num_row_groups,
-            "Columns": self.metadata.num_columns,
-            "Format Version": self.metadata.format_version,
-            "Creator": created_by,
-            "Schema Fields": len(self.schema.names),
-        }
-        kv_meta = self._decode_key_value_metadata(self.metadata.metadata)
-        if kv_meta:
-            summary["Key Value Metadata"] = kv_meta
-        return summary
-
-    def get_schema_tree_data(self) -> Optional[Dict[str, Any]]:
-        if not self.schema or not self.schema.names:
-            log.warning("Schema is not available or has no fields.")
-            return None
-
-        root_data: Dict[str, Any] = {}
-        for field in self.schema:
-            try:
-                label, children = self._build_schema_tree_nodes(field)
-                root_data[label] = children
-            except Exception as field_e:
-                log.error(f"Error processing schema field '{field.name}': {field_e}", exc_info=True)
-                root_data[f"[red]Error processing: {field.name}[/red]"] = None
-
-        if not root_data:
-            log.warning("Processed schema data resulted in an empty dictionary.")
-            return None
-
-        return root_data
-
-    def get_data_preview(self, num_rows: int = 50) -> Optional[pd.DataFrame]:
-        if not self.pq_file:
-            log.warning("ParquetFile handler not available for data preview.")
-            return None
-
-        tables_to_concat = []
-        rows_accumulated = 0
-        for i in range(self.pq_file.num_row_groups):
-            rg_meta = self.pq_file.metadata.row_group(i)
-            rows_to_read_from_group = min(rg_meta.num_rows, num_rows - rows_accumulated)
-
-            if rows_to_read_from_group <= 0:
-                log.debug(f"Limit {num_rows} reached after {i} groups.")
-                break
-
-            rg_table = self.pq_file.read_row_group(i)
-            tables_to_concat.append(rg_table)
-            rows_accumulated += rg_meta.num_rows
-
-        if not tables_to_concat:
-            log.warning("No row groups read or file is empty.")
-            return pd.DataFrame()
-
-        combined_table = pa.concat_tables(tables_to_concat)
-        preview_table = combined_table.slice(0, num_rows)
-
-        df = preview_table.to_pandas(
-            split_blocks=True, self_destruct=True, date_as_object=False, types_mapper=pd.ArrowDtype
-        )
-        return df
-
-    def get_column_stats(self, column_name: str) -> Dict[str, Any]:
-        if not self.pq_file or not self.schema or not column_name:
-            log.warning("Prerequisites not met for get_column_stats.")
-            return {"error": "File, schema, or column name not available"}
-
-        error_msg: Optional[str] = None
-        field: Optional[pa.Field] = None
-        calculated_stats: Dict[str, Any] = {}
-
-        field = self.schema.field(column_name)
-        col_type = field.type
-
-        table = self.pq_file.read(columns=[column_name])
-        if table.num_rows == 0:
-            log.warning(f"Column '{column_name}' is empty.")
-            return self._create_stats_result(column_name, field, msg="Column is empty (0 rows).")
-
-        column_data = table.column(column_name)
-        # Basic counts
-        total_count = len(column_data)
-        null_count = column_data.null_count
-        valid_count = total_count - null_count
-        calculated_stats["Total Count"] = f"{total_count:,}"
-        calculated_stats["Valid Count"] = f"{valid_count:,}"
-        calculated_stats["Null Count"] = f"{null_count:,}"
-        calculated_stats[
-            "Null Percentage"] = f"{(null_count / total_count * 100):.2f}%" if total_count > 0 else "N/A"
-
-        # Type-specific calculations
-        if valid_count > 0:
-            valid_data = column_data.drop_null()
-            if pa.types.is_integer(col_type) or pa.types.is_floating(col_type):
-                calculated_stats.update(self._calculate_numeric_stats(valid_data))
-            elif pa.types.is_temporal(col_type):
-                calculated_stats.update(self._calculate_temporal_stats(valid_data))
-            elif pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
-                calculated_stats.update(self._calculate_string_stats(valid_data))
-            elif pa.types.is_boolean(col_type):
-                calculated_stats.update(self._calculate_boolean_stats(valid_data))
-        else:
-            log.debug("No valid data points for calculation.")
-
-        metadata_stats, metadata_stats_error = self._get_stats_from_metadata(column_name)
-        return self._create_stats_result(
-            column_name, field, calculated_stats, metadata_stats, metadata_stats_error, error_msg
-        )
-
-    def get_row_group_info(self) -> List[Dict[str, Any]]:
-        if not self.metadata:
-            log.warning("Metadata not available for row group info.")
-            return []
-        groups = []
-        num_groups = self.metadata.num_row_groups
-
-        for i in range(num_groups):
-            try:
-                rg_meta = self.metadata.row_group(i)
-                num_rows = getattr(rg_meta, 'num_rows', 'N/A')
-                size = getattr(rg_meta, 'total_byte_size', 'N/A')
-                comp_size_val = getattr(rg_meta, 'total_compressed_size', -1)
-                comp_size = f"{comp_size_val:,}" if isinstance(comp_size_val, int) and comp_size_val > 0 else "N/A"
-
-                groups.append({
-                    "Group": i,
-                    "Rows": f"{num_rows:,}" if isinstance(num_rows, int) else num_rows,
-                    "Size (bytes)": f"{size:,}" if isinstance(size, int) else size,
-                    "Size (comp.)": comp_size,
-                })
-            except Exception as e:
-                log.error(f"Error reading metadata for row group {i}", exc_info=True)
-                groups.append({"Group": i, "Rows": "Error", "Size (bytes)": "Error", "Size (comp.)": "Error"})
-        return groups
-
-    def _decode_metadata_bytes(self, value: Optional[bytes]) -> Optional[str]:
-        if isinstance(value, bytes):
-            try:
-                return value.decode('utf-8', errors='replace')
-            except Exception as e:
-                log.warning(f"Could not decode metadata bytes: {e}")
-                return repr(value)
-        return value
-
-    def _decode_key_value_metadata(self, kv_meta: Optional[Dict[bytes, bytes]]) -> Optional[Dict[str, str]]:
-        if not kv_meta:
-            return None
-        decoded_kv = {}
-        try:
-            for k, v in kv_meta.items():
-                key_str = self._decode_metadata_bytes(k) or repr(k)
-                val_str = self._decode_metadata_bytes(v) or repr(v)
-                decoded_kv[key_str] = val_str
-            return decoded_kv
-        except Exception as e:
-            log.warning(f"Could not decode key-value metadata: {e}")
-            return {"error": f"Error decoding: {e}"}
-
-    def _format_pyarrow_type(self, field_type: pa.DataType) -> str:
-        if pa.types.is_timestamp(field_type):
-            return f"TIMESTAMP(unit={field_type.unit}, tz={field_type.tz})"
-
-        if pa.types.is_decimal128(field_type) or pa.types.is_decimal256(field_type):
-            return f"DECIMAL({field_type.precision}, {field_type.scale})"
-
-        if pa.types.is_list(field_type) or pa.types.is_large_list(field_type):
-            return f"LIST<{self._format_pyarrow_type(field_type.value_type)}>"
-
-        if pa.types.is_struct(field_type):
-            return f"STRUCT<{field_type.num_fields} fields>"
-
-        if pa.types.is_map(field_type):
-            return f"MAP<{self._format_pyarrow_type(field_type.key_type)}, {self._format_pyarrow_type(field_type.item_type)}>"
-
-        return str(field_type).upper()
-
-    def _build_schema_tree_nodes(self, field: pa.Field) -> Tuple[str, Optional[Dict]]:
-        node_label = f"[bold]{field.name}[/] ({self._format_pyarrow_type(field.type)})"
-        if not field.nullable:
-            node_label += " [red]REQUIRED[/]"
-
-        children_data: Dict[str, Any] = {}
-        field_type = field.type
-
-        if pa.types.is_struct(field_type):
-            for i in range(field_type.num_fields):
-                child_label, grandchild_data = self._build_schema_tree_nodes(field_type.field(i))
-                children_data[child_label] = grandchild_data
-
-        elif pa.types.is_list(field_type) or pa.types.is_large_list(field_type):
-            element_field = pa.field("item", field_type.value_type, nullable=True)
-            child_label, grandchild_data = self._build_schema_tree_nodes(element_field)
-            children_data[child_label] = grandchild_data
-
-        elif pa.types.is_map(field_type):
-            key_field = pa.field("key", field_type.key_type, nullable=False)
-            value_field = pa.field("value", field_type.item_type, nullable=True)
-            key_label, _ = self._build_schema_tree_nodes(key_field)
-            value_label, value_grandchild = self._build_schema_tree_nodes(value_field)
-            children_data[key_label] = None
-            children_data[value_label] = value_grandchild
-
-        return node_label, children_data if children_data else None
-
-    def _calculate_numeric_stats(self, valid_data: pa.ChunkedArray) -> Dict[str, Any]:
-        stats: Dict[str, Any] = {}
-        try:
-            stats["Min"] = pc.min(valid_data).as_py()
-        except Exception as e:
-            log.warning(f"Min calc error: {e}");
-            stats["Min"] = "Error"
-        try:
-            stats["Max"] = pc.max(valid_data).as_py()
-        except Exception as e:
-            log.warning(f"Max calc error: {e}");
-            stats["Max"] = "Error"
-        try:
-            stats["Mean"] = f"{pc.mean(valid_data).as_py():.4f}"
-        except Exception as e:
-            log.warning(f"Mean calc error: {e}");
-            stats["Mean"] = "Error"
-        try:
-            stats["StdDev"] = f"{pc.stddev(valid_data, ddof=1).as_py():.4f}"
-        except Exception as e:
-            log.warning(f"StdDev calc error: {e}");
-            stats["StdDev"] = "Error"
-        return stats
-
-    def _calculate_temporal_stats(self, valid_data: pa.ChunkedArray) -> Dict[str, Any]:
-        stats: Dict[str, Any] = {}
-        try:
-            stats["Min"] = pc.min(valid_data).as_py()
-        except Exception as e:
-            log.warning(f"Min calc error (temporal): {e}");
-            stats["Min"] = "Error"
-        try:
-            stats["Max"] = pc.max(valid_data).as_py()
-        except Exception as e:
-            log.warning(f"Max calc error (temporal): {e}");
-            stats["Max"] = "Error"
-        return stats
-
-    def _calculate_string_stats(self, valid_data: pa.ChunkedArray) -> Dict[str, Any]:
-        stats: Dict[str, Any] = {}
-        try:
-            stats["Distinct Count"] = f"{pc.count_distinct(valid_data).as_py():,}"
-        except Exception as e:
-            log.warning(f"Distinct count error: {e}");
-            stats["Distinct Count"] = "Error"
-        # TopN removed as requested
-        return stats
-
-    def _calculate_boolean_stats(self, valid_data: pa.ChunkedArray) -> Dict[str, Any]:
-        stats: Dict[str, Any] = {}
-        try:
-            value_counts_table = valid_data.value_counts()
-            if isinstance(value_counts_table, pa.Table):
-                counts_df = value_counts_table.to_pandas()
-            elif isinstance(value_counts_table, pa.StructArray):
-                try:
-                    counts_df = value_counts_table.flatten().to_pandas()
-                except NotImplementedError:
-                    counts_df = pd.DataFrame(value_counts_table.to_pylist())  # Fallback
-
-            else:
-                counts_df = pd.DataFrame(value_counts_table.to_pylist())
-
-            if 'values' in counts_df.columns and 'counts' in counts_df.columns:
-                stats["Value Counts"] = counts_df.set_index('values')['counts'].to_dict()
-            elif len(counts_df.columns) == 2:  # Assume first is value, second is count
-                stats["Value Counts"] = counts_df.set_index(counts_df.columns[0])[counts_df.columns[1]].to_dict()
-            else:
-                log.warning("Could not parse boolean value counts structure.")
-                stats["Value Counts"] = "Could not parse structure"
-
-        except Exception as vc_e:
-            log.warning(f"Boolean value count error: {vc_e}");
-            stats["Value Counts"] = "Error calculating"
-        return stats
-
-    def _extract_stats_for_single_group(
-            self, rg_meta: pq.RowGroupMetaData, col_index: int
-    ) -> Union[str, Dict[str, Any]]:
-
-        if col_index >= rg_meta.num_columns:
-            log.warning(
-                f"Column index {col_index} out of bounds for row group "
-                f"with {rg_meta.num_columns} columns."
-            )
-            return "Index Error"
-
-        col_chunk_meta = rg_meta.column(col_index)
-        stats = col_chunk_meta.statistics
-
-        if not stats:
-            return "No stats"
-
-        has_min_max = getattr(stats, 'has_min_max', False)
-        has_distinct = getattr(stats, 'has_distinct_count', False)
-
-        return {
-            "min": getattr(stats, 'min', 'N/A') if has_min_max else "N/A",
-            "max": getattr(stats, 'max', 'N/A') if has_min_max else "N/A",
-            "nulls": getattr(stats, 'null_count', 'N/A'),
-            "distinct": getattr(stats, 'distinct_count', 'N/A') if has_distinct else "N/A",
-        }
-
-    def _get_stats_from_metadata(self, column_name: str) -> Tuple[Dict[str, Any], Optional[str]]:
-        metadata_stats: Dict[str, Any] = {}
-        if not self.metadata or not self.schema:
-            log.warning("Metadata or Schema not available for _get_stats_from_metadata.")
-            return {}, "Metadata or Schema not available"
-        col_index = self.schema.get_field_index(column_name)
-
-        for i in range(self.metadata.num_row_groups):
-            group_key = f"RG {i}"
-            try:
-                rg_meta = self.metadata.row_group(i)
-                metadata_stats[group_key] = self._extract_stats_for_single_group(
-                    rg_meta, col_index
-                )
-            except Exception as e:
-                log.warning(
-                    f"Error processing metadata for row group {i}, column '{column_name}': {e}"
-                )
-                metadata_stats[group_key] = f"Read Error: {e}"
-
-        return metadata_stats, None
-
-    def _create_stats_result(
-            self,
-            column_name: str,
-            field: Optional[pa.Field],
-            calculated_stats: Optional[Dict] = None,
-            metadata_stats: Optional[Dict] = None,
-            metadata_stats_error: Optional[str] = None,
-            calculation_error: Optional[str] = None,
-            message: Optional[str] = None
-    ) -> Dict[str, Any]:
-        return {
-            "column": column_name,
-            "type": self._format_pyarrow_type(field.type) if field else "Unknown",
-            "nullable": field.nullable if field else "Unknown",
-            "calculated": calculated_stats if calculated_stats else None,
-            "basic_metadata_stats": metadata_stats if metadata_stats else None,
-            "metadata_stats_error": metadata_stats_error,
-            "error": calculation_error,
-            "message": message
-        }
parqv/views/row_group_view.py
DELETED
@@ -1,33 +0,0 @@
-from textual.containers import VerticalScroll
-from textual.widgets import DataTable, Static
-
-
-class RowGroupView(VerticalScroll):
-
-    def on_mount(self) -> None:
-        self.load_row_groups()
-
-    def load_row_groups(self):
-        try:
-            if self.app.handler:
-                rg_info_list = self.app.handler.get_row_group_info()
-
-                if rg_info_list:
-                    table = DataTable(id="rowgroup-table")
-                    table.cursor_type = "row"
-
-                    columns = list(rg_info_list[0].keys())
-                    table.add_columns(*columns)
-
-                    rows_data = [
-                        tuple(str(rg.get(col, '')) for col in columns)
-                        for rg in rg_info_list
-                    ]
-                    table.add_rows(rows_data)
-                    self.mount(table)
-                else:
-                    self.mount(Static("No row group information available or file has no row groups."))
-            else:
-                self.mount(Static("Parquet handler not available.", classes="error-content"))
-        except Exception as e:
-            self.mount(Static(f"Error loading row group info: {e}", classes="error-content"))
parqv-0.1.0.dist-info/METADATA
DELETED
@@ -1,91 +0,0 @@
-Metadata-Version: 2.4
-Name: parqv
-Version: 0.1.0
-Summary: An interactive Python TUI for visualizing, exploring, and analyzing Parquet files directly in your terminal.
-Author-email: Sangmin Yoon <sanspareilsmyn@gmail.com>
-License-Expression: Apache-2.0
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: textual>=1.0.0
-Requires-Dist: pyarrow>=16.0.0
-Requires-Dist: pandas>=2.0.0
-Requires-Dist: numpy>=1.20.0
-Dynamic: license-file
-
-# parqv
-
-[](https://www.python.org/)
-[](LICENSE)
-[](https://badge.fury.io/py/parqv) <!-- Link after PyPI release -->
-[](https://textual.textualize.io/)
-<!-- Optional: Add BuyMeACoffee or other badges later if desired -->
-
-**`parqv` is a Python-based interactive TUI (Text User Interface) tool designed to explore, analyze, and understand Parquet files directly within your terminal.** Forget juggling multiple commands; `parqv` provides a unified, visual experience.
-
-## 💻 Demo (Placeholder)
-
-
-
-## 🤔 Why `parqv`?
-1. **Unified Interface:** Launch `parqv <file.parquet>` to access **metadata, schema, data preview, column statistics, and row group details** all within a single, navigable terminal window. No more memorizing different commands.
-2. **Interactive Exploration:**
-   * **🖱️ Keyboard & Mouse Driven:** Navigate using familiar keys (arrows, `hjkl`, Tab) or even your mouse (thanks to `Textual`).
-   * **📜 Scrollable Views:** Easily scroll through large schemas, data tables, or row group lists.
-   * **🌲 Expandable Schema:** Visualize and navigate complex nested structures (Structs, Lists) effortlessly.
-   * **📊 Dynamic Stats:** Select a column and instantly see its detailed statistics and distribution.
-3. **Enhanced Analysis & Visualization:**
-   * **🎨 Rich Display:** Leverages `rich` and `Textual` for colorful, readable tables and syntax-highlighted schema.
-   * **📈 Quick Stats:** Go beyond min/max/nulls. See means, medians, quantiles, distinct counts, frequency distributions, and even text-based histograms.
-   * **🔬 Row Group Deep Dive:** Inspect individual row groups to understand compression, encoding, and potential data skew.
-
-## ✨ Features (TUI Mode)
-* **Interactive TUI:** Run `parqv <file.parquet>` to launch the main interface.
-* **Metadata Panel:** Displays key file information (path, creator, total rows, row groups, compression, etc.).
-* **Schema Explorer:**
-   * Interactive, collapsible tree view for schemas.
-   * Clearly shows column names, data types (including nested types), and nullability.
-   * Syntax highlighting for better readability.
-* **Data Table Viewer:**
-   * Scrollable table preview of the file's data.
-   * Handles large files by loading data pages on demand.
-   * (Planned) Column selection/reordering.
-* **Row Group Inspector:**
-   * List all row groups with key stats (row count, compressed/uncompressed size).
-   * Select a row group to view per-column details (encoding, size, stats within the group).
-
-## 🚀 Getting Started
-
-**1. Prerequisites:**
-*   **Python:** Version 3.10 or higher.
-*   **pip:** The Python package installer.
-
-**2. Install `parqv`:**
-*   Open your terminal and run:
-    ```bash
-    pip install parqv
-    ```
-*   **Updating `parqv`:**
-    ```bash
-    pip install --upgrade parqv
-    ```
-
-**3. Run `parqv`:**
-*   Point `parqv` to your Parquet file:
-    ```bash
-    parqv /path/to/your/data.parquet
-    ```
-*   The interactive TUI will launch. Use your keyboard (and mouse, if supported by your terminal) to navigate:
-    *   **Arrow Keys / `h`,`j`,`k`,`l`:** Move focus within lists, tables, trees.
-    *   **`Tab` / `Shift+Tab`:** Cycle focus between different panes/widgets.
-    *   **`Enter`:** Select items, expand/collapse tree nodes.
-    *   **View Switching Keys (Examples - check help):** `m` (Metadata), `s` (Schema), `d` (Data), `t` (Stats), `g` (Row Groups).
-    *   **`PageUp` / `PageDown` / `Home` / `End`:** Scroll long lists or tables.
-    *   **`?`:** Show help screen with keybindings.
-    *   **`q` / `Ctrl+C`:** Quit `parqv`.
-
----
-
-## 📄 License
-
-Licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE) for the full license text.
parqv-0.1.0.dist-info/RECORD
DELETED
@@ -1,15 +0,0 @@
-parqv/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parqv/app.py,sha256=akN-tLb0QpYrG9A_1-Sa2zg--MOZpSGMbVYEZDFpyY0,4587
-parqv/parquet_handler.py,sha256=Jh210_neAhVZU4vcz8N4gVbV0NqilkYZ0IlQTMcrrkc,16589
-parqv/parqv.css,sha256=C42ZXUwMX1ZXfGo0AmixbHxz0CWKzWBHZ_hkhq5aehg,2920
-parqv/views/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parqv/views/data_view.py,sha256=T_LPbXdxm_KOmwtTQWAxlUeNTlSGfIgQIg853WDMAME,2355
-parqv/views/metadata_view.py,sha256=KgLOjefGyY50zTs-H99PwvJ9IDPE1jfq6--qmD5zp48,656
-parqv/views/row_group_view.py,sha256=4BkaUM2Q2HQ-bvdRYpiYIl5T4oGJGReoaJOcttUd-8s,1210
-parqv/views/schema_view.py,sha256=IYsxkIV9kvOKGfylOQhqeUyakvymlGlgwXWMibhslmI,7880
-parqv-0.1.0.dist-info/licenses/LICENSE,sha256=Ewl2wCa8r6ncxHlpf-ZZXb77c82zdfxHuEeKzBbm6nM,11324
-parqv-0.1.0.dist-info/METADATA,sha256=TxX4NHmXdfu4O5bK6-UiMqlncusxfxj1KvZQaGVLPR0,4525
-parqv-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-parqv-0.1.0.dist-info/entry_points.txt,sha256=8Tm8rTiIB-tbVItoOA4M7seEmFnrtK25BMH9UKzqfXg,44
-parqv-0.1.0.dist-info/top_level.txt,sha256=_t3_49ZluJbvl0QU_P3GNVuXxCffqiTp37dzZIa2GEw,6
-parqv-0.1.0.dist-info/RECORD,,
{parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/entry_points.txt
File without changes

{parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/licenses/LICENSE
File without changes

{parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/top_level.txt
File without changes