parqv 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parqv/app.py +87 -50
- parqv/handlers/__init__.py +13 -0
- parqv/handlers/base_handler.py +114 -0
- parqv/handlers/json.py +450 -0
- parqv/handlers/parquet.py +640 -0
- parqv/views/metadata_view.py +11 -4
- parqv/views/schema_view.py +147 -88
- parqv-0.2.0.dist-info/METADATA +104 -0
- parqv-0.2.0.dist-info/RECORD +17 -0
- {parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/WHEEL +1 -1
- parqv/parquet_handler.py +0 -389
- parqv/views/row_group_view.py +0 -33
- parqv-0.1.0.dist-info/METADATA +0 -91
- parqv-0.1.0.dist-info/RECORD +0 -15
- {parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/entry_points.txt +0 -0
- {parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {parqv-0.1.0.dist-info → parqv-0.2.0.dist-info}/top_level.txt +0 -0
parqv/handlers/parquet.py ADDED

@@ -0,0 +1,640 @@
import logging
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional, Union

import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq

from .base_handler import DataHandler, DataHandlerError

log = logging.getLogger(__name__)


class ParquetHandlerError(DataHandlerError):
    """Custom exception for Parquet Handler errors."""
    pass


class ParquetHandler(DataHandler):
    """
    Handles Parquet file interactions using PyArrow.

    Provides methods to access metadata, schema, data preview, and column statistics.
    Manages the Parquet file resource lifecycle.
    """

    def __init__(self, file_path: Path):
        """
        Initializes the ParquetHandler by validating the path and opening the Parquet file.

        Args:
            file_path: Path to the Parquet file.

        Raises:
            ParquetHandlerError: If the file is not found, not a file, or cannot be opened/read.
        """
        super().__init__(file_path)
        self.pq_file: Optional[pq.ParquetFile] = None
        self.schema: Optional[pa.Schema] = None
        self.metadata: Optional[pq.FileMetaData] = None

        try:
            # Validate file existence using the path stored by the base class
            if not self.file_path.is_file():
                raise FileNotFoundError(f"Parquet file not found or is not a file: {self.file_path}")

            # Open the Parquet file
            self.pq_file = pq.ParquetFile(self.file_path)
            self.schema = self.pq_file.schema_arrow
            self.metadata = self.pq_file.metadata
            log.info(f"Successfully initialized ParquetHandler for: {self.file_path.name}")

        except FileNotFoundError as fnf_e:
            log.error(f"File not found during ParquetHandler initialization: {fnf_e}")
            raise ParquetHandlerError(str(fnf_e)) from fnf_e
        except pa.lib.ArrowIOError as arrow_io_e:
            log.error(f"Arrow IO Error initializing ParquetHandler for {self.file_path.name}: {arrow_io_e}")
            raise ParquetHandlerError(
                f"Failed to open Parquet file '{self.file_path.name}': {arrow_io_e}") from arrow_io_e
        except Exception as e:
            log.exception(f"Unexpected error initializing ParquetHandler for {self.file_path.name}")
            self.close()
            raise ParquetHandlerError(f"Failed to initialize Parquet handler '{self.file_path.name}': {e}") from e

    # Resource Management
    def close(self) -> None:
        """Closes the Parquet file resource if it's open."""
        if self.pq_file is not None:
            try:
                # ParquetFile might not have a close method depending on source, check first
                if hasattr(self.pq_file, 'close'):
                    self.pq_file.close()
                    log.info(f"Closed Parquet file: {self.file_path.name}")
            except Exception as e:
                # Log error during close but don't raise, as we're cleaning up
                log.warning(f"Exception while closing Parquet file {self.file_path.name}: {e}")
            finally:
                self.pq_file = None
                self.schema = None
                self.metadata = None

    def __enter__(self):
        """Enter the runtime context related to this object."""
        if not self.pq_file:
            raise ParquetHandlerError("Parquet file is not open or handler was closed.")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit the runtime context related to this object, ensuring cleanup."""
        self.close()

    def __del__(self):
        """Attempt to close the file when the object is garbage collected (best effort)."""
        self.close()

    def get_metadata_summary(self) -> Dict[str, Any]:
        """
        Provides a summary dictionary of the Parquet file's metadata.

        Returns:
            A dictionary containing key metadata attributes, or an error dictionary.
        """
        if not self.metadata or not self.schema:
            log.warning(f"Metadata or schema not available for summary: {self.file_path.name}")
            return {"error": "Metadata or schema not available"}

        try:
            created_by = self._decode_metadata_bytes(self.metadata.created_by) or "N/A"
            file_size = self.file_path.stat().st_size
            summary = {
                "File Path": str(self.file_path.resolve()),
                "Format": "Parquet",
                "Size": self._format_size(file_size),
                "Total Rows": f"{self.metadata.num_rows:,}",
                "Row Groups": self.metadata.num_row_groups,
                "Columns": self.metadata.num_columns,
                "Format Version": self.metadata.format_version,
                "Creator": created_by,
                "Serialization Library": self._decode_metadata_bytes(
                    self.metadata.serialized_size > 0 and self.metadata.created_by) or "N/A",
            }
            kv_meta = self._decode_key_value_metadata(self.metadata.metadata)
            if kv_meta:
                summary["Key Value Metadata"] = kv_meta

            return summary
        except Exception as e:
            log.exception(f"Error generating metadata summary for {self.file_path.name}")
            return {"error": f"Error getting metadata summary: {e}"}

    def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
        """
        Returns a simplified list representation of the Arrow schema.

        Returns:
            A list of dictionaries, each describing a column (name, type string, nullable bool),
            or None if the schema is unavailable.
        """
        if not self.schema:
            log.warning(f"Schema is not available for get_schema_data: {self.file_path.name}")
            return None

        schema_list = []
        for field in self.schema:
            try:
                type_str = self._format_pyarrow_type(field.type)
                schema_list.append({
                    "name": field.name,
                    "type": type_str,
                    "nullable": field.nullable
                })
            except Exception as e:
                log.error(f"Error processing field '{field.name}' for schema data: {e}", exc_info=True)
                schema_list.append({
                    "name": field.name,
                    "type": f"[Error: {e}]",
                    "nullable": None
                })
        return schema_list

    def get_data_preview(self, num_rows: int = 50) -> pd.DataFrame:
        """
        Fetches a preview of the data from the Parquet file using efficient batch iteration.

        Args:
            num_rows: The maximum number of rows to fetch.

        Returns:
            A pandas DataFrame with the preview data, potentially using ArrowDTypes.
            Returns an empty DataFrame if the file is empty or no data is read.
            Returns a DataFrame with an 'error' column on failure.
        """
        if not self.pq_file:
            log.warning(f"ParquetFile handler not available for data preview: {self.file_path.name}")
            return pd.DataFrame({"error": ["Parquet handler not initialized or closed."]})

        if self.metadata and self.metadata.num_rows == 0:
            log.info(f"Parquet file is empty based on metadata: {self.file_path.name}")
            if self.schema:
                return pd.DataFrame(columns=self.schema.names)
            else:
                return pd.DataFrame()

        try:
            # Determine rows to fetch, capped by file total
            num_rows_to_fetch = num_rows
            if self.metadata:
                num_rows_to_fetch = min(num_rows, self.metadata.num_rows)

            # Use iter_batches for memory efficiency
            batches = []
            rows_read = 0
            internal_batch_size = min(max(num_rows_to_fetch // 2, 1024), 65536)

            for batch in self.pq_file.iter_batches(batch_size=internal_batch_size):
                if rows_read >= num_rows_to_fetch:
                    break
                rows_needed_in_batch = num_rows_to_fetch - rows_read
                slice_len = min(len(batch), rows_needed_in_batch)
                batches.append(batch.slice(0, slice_len))
                rows_read += slice_len
                if rows_read >= num_rows_to_fetch:
                    break

            if not batches:
                # Check if file might have rows but reading yielded nothing
                if self.metadata and self.metadata.num_rows > 0:
                    log.warning(
                        f"No batches read for preview, though metadata indicates {self.metadata.num_rows} rows: {self.file_path.name}")
                else:
                    log.info(f"No data read for preview (file likely empty): {self.file_path.name}")
                # Return empty DF with columns if schema available
                if self.schema:
                    return pd.DataFrame(columns=self.schema.names)
                else:
                    return pd.DataFrame()

            # Combine batches and convert to Pandas
            preview_table = pa.Table.from_batches(batches)
            df = preview_table.to_pandas(
                split_blocks=True,
                self_destruct=True,
                types_mapper=pd.ArrowDtype
            )
            log.info(f"Generated preview of {len(df)} rows for {self.file_path.name}")
            return df

        except Exception as e:
            log.exception(f"Error generating data preview from Parquet file: {self.file_path.name}")
            return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]})

    def get_column_stats(self, column_name: str) -> Dict[str, Any]:
        """
        Calculates statistics for a specific column by reading its data.

        Args:
            column_name: The name of the column to analyze.

        Returns:
            A dictionary containing calculated statistics, metadata statistics,
            and potential error or message keys.
        """
        if not self.pq_file or not self.schema:
            log.warning(f"Parquet file/schema unavailable for column stats: {self.file_path.name}")
            return self._create_stats_result(column_name, None, error="File or schema not available")

        try:
            field = self.schema.field(column_name)
        except KeyError:
            log.warning(f"Column '{column_name}' not found in schema: {self.file_path.name}")
            return self._create_stats_result(column_name, None, error=f"Column '{column_name}' not found in schema")

        calculated_stats: Dict[str, Any] = {}
        error_msg: Optional[str] = None
        message: Optional[str] = None
        metadata_stats: Optional[Dict] = None
        metadata_stats_error: Optional[str] = None

        try:
            # Data Reading
            table = self.pq_file.read(columns=[column_name])
            column_data = table.column(0)
            log.debug(
                f"Finished reading column '{column_name}'. Rows: {len(column_data)}, Nulls: {column_data.null_count}")

            # Basic Counts
            total_count = len(column_data)
            if total_count > 0:
                null_count = column_data.null_count
                valid_count = total_count - null_count
                calculated_stats["Total Count"] = f"{total_count:,}"
                calculated_stats["Valid Count"] = f"{valid_count:,}"
                calculated_stats["Null Count"] = f"{null_count:,}"
                calculated_stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%"
            else:
                log.info(f"Column '{column_name}' read resulted in 0 rows.")
                message = "Column is empty (0 rows)."
                valid_count = 0  # Ensure valid_count is 0 for later checks

            # Type-Specific Calculations
            if valid_count > 0:
                col_type = field.type
                log.debug(f"Calculating stats for type: {self._format_pyarrow_type(col_type)}")
                try:
                    if pa.types.is_floating(col_type) or pa.types.is_integer(col_type):
                        calculated_stats.update(self._calculate_numeric_stats(column_data))
                    elif pa.types.is_temporal(col_type):
                        calculated_stats.update(self._calculate_temporal_stats(column_data))
                    elif pa.types.is_string(col_type) or pa.types.is_large_string(col_type) \
                            or pa.types.is_binary(col_type) or pa.types.is_large_binary(col_type):
                        calculated_stats.update(self._calculate_string_binary_stats(column_data))
                    elif pa.types.is_boolean(col_type):
                        calculated_stats.update(self._calculate_boolean_stats(column_data))
                    elif pa.types.is_dictionary(col_type):
                        calculated_stats.update(self._calculate_dictionary_stats(column_data, col_type))
                        message = calculated_stats.pop("message", message)
                    elif pa.types.is_struct(col_type) or pa.types.is_list(col_type) or pa.types.is_map(col_type) \
                            or pa.types.is_fixed_size_list(col_type) or pa.types.is_union(col_type):
                        calculated_stats.update(self._calculate_complex_type_stats(column_data, col_type))
                        message = f"Basic aggregate stats (min/max/mean) not applicable for complex type '{self._format_pyarrow_type(col_type)}'."
                    else:
                        log.warning(f"Statistics calculation not fully implemented for type: {col_type}")
                        message = f"Statistics calculation not implemented for type '{self._format_pyarrow_type(col_type)}'."

                except Exception as calc_err:
                    log.exception(f"Error during type-specific calculation for column '{column_name}': {calc_err}")
                    error_msg = f"Calculation error for type {field.type}: {calc_err}"
                    calculated_stats["Calculation Error"] = str(calc_err)  # Add specific error key

            elif total_count > 0:
                message = "Column contains only NULL values."

            # Metadata Statistics ---
            metadata_stats, metadata_stats_error = self._get_stats_from_metadata(column_name)

        except pa.lib.ArrowException as arrow_e:
            log.exception(f"Arrow error during stats processing for column '{column_name}': {arrow_e}")
            error_msg = f"Arrow processing error: {arrow_e}"
        except Exception as e:
            log.exception(f"Unexpected error during stats calculation for column '{column_name}'")
            error_msg = f"Calculation failed unexpectedly: {e}"

        return self._create_stats_result(
            column_name, field, calculated_stats, metadata_stats, metadata_stats_error, error_msg, message
        )

    def _decode_metadata_bytes(self, value: Optional[Union[bytes, str]]) -> Optional[str]:
        """Safely decodes bytes metadata values to UTF-8 strings, replacing errors."""
        if isinstance(value, bytes):
            try:
                return value.decode('utf-8', errors='replace')
            except Exception as e:
                log.warning(f"Could not decode metadata bytes: {e}. Value: {value!r}")
                return f"[Decode Error: {value!r}]"
        return str(value) if value is not None else None

    def _decode_key_value_metadata(self, kv_meta: Optional[Dict[Union[str, bytes], Union[str, bytes]]]) -> Optional[
        Dict[str, str]]:
        """Decodes keys and values of the key-value metadata dictionary."""
        if not kv_meta:
            return None
        decoded_kv = {}
        try:
            for k, v in kv_meta.items():
                key_str = self._decode_metadata_bytes(k) or "[Invalid Key]"
                val_str = self._decode_metadata_bytes(v) or "[Invalid Value]"
                decoded_kv[key_str] = val_str
            return decoded_kv
        except Exception as e:
            log.warning(f"Could not decode key-value metadata: {e}")
            return {"error": f"Error decoding key-value metadata: {e}"}

    def _format_pyarrow_type(self, field_type: pa.DataType) -> str:
        """Formats a PyArrow DataType into a readable string, including details."""
        if pa.types.is_timestamp(field_type):
            tz_str = f", tz='{field_type.tz}'" if field_type.tz else ""
            return f"TIMESTAMP(unit='{field_type.unit}'{tz_str})"
        if pa.types.is_time32(field_type) or pa.types.is_time64(field_type):
            return f"TIME(unit='{field_type.unit}')"
        if pa.types.is_duration(field_type):
            return f"DURATION(unit='{field_type.unit}')"
        if pa.types.is_decimal128(field_type) or pa.types.is_decimal256(field_type):
            return f"DECIMAL({field_type.precision}, {field_type.scale})"
        if pa.types.is_fixed_size_binary(field_type):
            return f"FIXED_SIZE_BINARY({field_type.byte_width})"
        if pa.types.is_list(field_type) or pa.types.is_large_list(field_type) or pa.types.is_fixed_size_list(
                field_type):
            prefix = "LIST"
            if pa.types.is_large_list(field_type): prefix = "LARGE_LIST"
            if pa.types.is_fixed_size_list(field_type): prefix = f"FIXED_SIZE_LIST({field_type.list_size})"
            value_type_str = self._format_pyarrow_type(field_type.value_type)
            return f"{prefix}<item: {value_type_str}>"
        if pa.types.is_struct(field_type):
            num_fields_to_show = 3
            field_details = ", ".join(
                f"{f.name}: {self._format_pyarrow_type(f.type)}" for f in field_type[:num_fields_to_show])
            suffix = "..." if field_type.num_fields > num_fields_to_show else ""
            return f"STRUCT<{field_details}{suffix}>"
        if pa.types.is_map(field_type):
            keys_sorted = getattr(field_type, 'keys_sorted', False)
            sorted_str = ", keys_sorted" if keys_sorted else ""
            key_type_str = self._format_pyarrow_type(field_type.key_type)
            item_type_str = self._format_pyarrow_type(field_type.item_type)
            return f"MAP<key: {key_type_str}, value: {item_type_str}{sorted_str}>"
        if pa.types.is_dictionary(field_type):
            index_type_str = self._format_pyarrow_type(field_type.index_type)
            value_type_str = self._format_pyarrow_type(field_type.value_type)
            ordered = getattr(field_type, 'ordered', False)
            return f"DICTIONARY<indices: {index_type_str}, values: {value_type_str}{', ordered' if ordered else ''}>"
        if pa.types.is_union(field_type):
            type_codes = getattr(field_type, 'type_codes', [])
            mode = getattr(field_type, 'mode', 'sparse')
            field_details = ", ".join(
                f"{f.name}: {self._format_pyarrow_type(f.type)}" for f in field_type[:3])  # Show first few fields
            suffix = "..." if field_type.num_fields > 3 else ""
            return f"UNION<{field_details}{suffix}> (mode='{mode}', codes={type_codes[:5]}{'...' if len(type_codes) > 5 else ''})"

        return str(field_type).upper()

    def _safe_compute(self, func, data, *args, **kwargs) -> Tuple[Optional[Any], Optional[str]]:
        """Helper to safely execute a pyarrow.compute function and handle errors."""
        if data.null_count == len(data):
            return None, "Input data is all NULL"
        try:
            result_scalar = func(data, *args, **kwargs)
            return result_scalar.as_py() if result_scalar.is_valid else None, None
        except pa.lib.ArrowNotImplementedError as nie:
            return None, "Not Implemented"
        except Exception as e:
            return None, f"Compute Error: {e}"

    def _calculate_numeric_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
        """Calculates min, max, mean, stddev for numeric columns using _safe_compute."""
        stats: Dict[str, Any] = {}
        min_val, err = self._safe_compute(pc.min, column_data)
        stats["Min"] = min_val if err is None else err
        max_val, err = self._safe_compute(pc.max, column_data)
        stats["Max"] = max_val if err is None else err
        mean_val, err = self._safe_compute(pc.mean, column_data)
        stats["Mean"] = f"{mean_val:.4f}" if mean_val is not None and err is None else (err or "N/A")
        stddev_val, err = self._safe_compute(pc.stddev, column_data, ddof=1)
        stats["StdDev"] = f"{stddev_val:.4f}" if stddev_val is not None and err is None else (err or "N/A")
        if stats["StdDev"] == "Not Implemented":
            variance_val, err_var = self._safe_compute(pc.variance, column_data, ddof=1)
            stats["Variance"] = f"{variance_val:.4f}" if variance_val is not None and err_var is None else (
                err_var or "N/A")

        return stats

    def _calculate_temporal_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
        """Calculates min and max for temporal columns using _safe_compute."""
        stats: Dict[str, Any] = {}
        min_val, err = self._safe_compute(pc.min, column_data)
        stats["Min"] = min_val if err is None else err  # .as_py() handles conversion
        max_val, err = self._safe_compute(pc.max, column_data)
        stats["Max"] = max_val if err is None else err
        return stats

    def _calculate_string_binary_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
        """Calculates distinct count and optionally length stats for string/binary."""
        stats: Dict[str, Any] = {}
        distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
        stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")

        if pa.types.is_string(column_data.type) or pa.types.is_large_string(column_data.type):
            lengths, err_len = self._safe_compute(pc.binary_length, column_data)
            if err_len is None and lengths is not None:
                min_len, err_min = self._safe_compute(pc.min, lengths)
                stats["Min Length"] = min_len if err_min is None else err_min
                max_len, err_max = self._safe_compute(pc.max, lengths)
                stats["Max Length"] = max_len if err_max is None else err_max
                avg_len, err_avg = self._safe_compute(pc.mean, lengths)
                stats["Avg Length"] = f"{avg_len:.2f}" if avg_len is not None and err_avg is None else (
                    err_avg or "N/A")
            else:
                stats.update({"Min Length": "Error", "Max Length": "Error", "Avg Length": "Error"})
        return stats

    def _calculate_boolean_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
        """Calculates value counts (True/False) for boolean columns."""
        stats: Dict[str, Any] = {}
        try:
            if column_data.null_count == len(column_data):
                stats["Value Counts"] = "All NULL"
                return stats

            # value_counts returns a StructArray [{values: bool, counts: int64}, ...]
            value_counts_struct = pc.value_counts(column_data)
            counts_dict = {}
            if len(value_counts_struct) > 0:
                for i in range(len(value_counts_struct)):
                    value = value_counts_struct.field("values")[i].as_py()
                    count = value_counts_struct.field("counts")[i].as_py()
                    counts_dict[value] = count  # Keys are True/False

            stats["Value Counts"] = {str(k): f"{v:,}" for k, v in counts_dict.items()}
            # Ensure both True and False are present, even if count is 0
            if 'True' not in stats["Value Counts"]: stats["Value Counts"]['True'] = "0"
            if 'False' not in stats["Value Counts"]: stats["Value Counts"]['False'] = "0"

        except Exception as vc_e:
            log.warning(f"Boolean value count calculation error: {vc_e}", exc_info=True)
            stats["Value Counts"] = "Error calculating"
        return stats

    def _calculate_dictionary_stats(self, column_data: pa.ChunkedArray, col_type: pa.DictionaryType) -> Dict[str, Any]:
        """Calculates stats for dictionary type based on its value type."""
        stats: Dict[str, Any] = {"message": "Stats calculated on dictionary values."}  # Start with message
        try:
            unwrapped_data = column_data.dictionary_decode()
            value_type = col_type.value_type
            log.debug(f"Calculating dictionary stats based on value type: {value_type}")

            # Delegate calculation based on the *value* type
            if pa.types.is_floating(value_type) or pa.types.is_integer(value_type):
                stats.update(self._calculate_numeric_stats(unwrapped_data))
            elif pa.types.is_temporal(value_type):
                stats.update(self._calculate_temporal_stats(unwrapped_data))
            elif pa.types.is_string(value_type) or pa.types.is_large_string(value_type) \
                    or pa.types.is_binary(value_type) or pa.types.is_large_binary(value_type):
                stats.update(self._calculate_string_binary_stats(unwrapped_data))
            # Add other dictionary value types if necessary (boolean, etc.)
            else:
                stats[
                    "message"] += f" (Stats for value type '{self._format_pyarrow_type(value_type)}' not fully implemented)."
            # Calculate distinct count on the original dictionary array (can be faster)
            distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
            stats[
                "Distinct Values (Approx)"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (
                err or "N/A")

        except pa.lib.ArrowException as arrow_decode_err:
            log.warning(f"Arrow error decoding dictionary type for stats: {arrow_decode_err}")
            stats["Dictionary Error"] = f"Decode Error: {arrow_decode_err}"
        except Exception as dict_e:
            log.warning(f"Could not process dictionary type for stats: {dict_e}")
            stats["Dictionary Error"] = f"Processing Error: {dict_e}"
        return stats

    def _calculate_complex_type_stats(self, column_data: pa.ChunkedArray, col_type: pa.DataType) -> Dict[str, Any]:
        """Calculates basic stats (like distinct count) for complex types."""
        stats: Dict[str, Any] = {}
        # Distinct count is often the most feasible stat for complex types
        distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
        # Note: Distinct count on complex types can be approximate or may error depending on type
        stats["Distinct Count (Approx)"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (
            err or "N/A")
        return stats

    def _get_stats_from_metadata(self, column_name: str) -> Tuple[Dict[str, Any], Optional[str]]:
        """Retrieves statistics stored within the Parquet file metadata per row group."""
        metadata_stats: Dict[str, Any] = {}
        error_str: Optional[str] = None

        if not self.metadata or not self.schema:
            return {}, "Metadata or Schema not available"

        try:
            col_index = self.schema.get_field_index(column_name)

            for i in range(self.metadata.num_row_groups):
                group_key = f"RG {i}"
                try:
                    rg_meta = self.metadata.row_group(i)
                    metadata_stats[group_key] = self._extract_stats_for_single_group(rg_meta, col_index)
                except IndexError:
                    log.warning(f"Column index {col_index} out of bounds for row group {i}.")
                    metadata_stats[group_key] = "Index Error"
                except Exception as e:
                    log.warning(f"Error processing metadata stats for RG {i}, column '{column_name}': {e}")
                    metadata_stats[group_key] = f"Read Error: {e}"

        except KeyError:
            log.warning(f"Column '{column_name}' not found in schema for metadata stats.")
            error_str = f"Column '{column_name}' not found in schema"
        except Exception as e:
            log.exception(f"Failed to get metadata statistics structure for column '{column_name}'.")
            error_str = f"Error accessing metadata structure: {e}"

        return metadata_stats, error_str

    def _extract_stats_for_single_group(self, rg_meta: pq.RowGroupMetaData, col_index: int) -> Union[
        str, Dict[str, Any]]:
        """Extracts stats from a column chunk's metadata within a row group."""
        try:
            col_chunk_meta = rg_meta.column(col_index)
            stats = col_chunk_meta.statistics
            if not stats: return "No stats in metadata"

            def _format_stat(value, is_present, is_numeric=True):
                if not is_present: return "N/A"
                try:
                    # Attempt to format nicely, fallback to repr for safety
                    return f"{value:,}" if is_numeric else str(value)
                except Exception:
                    return repr(value)

            return {
                "min": _format_stat(stats.min, stats.has_min_max, is_numeric=False),
                "max": _format_stat(stats.max, stats.has_min_max, is_numeric=False),
                "nulls": _format_stat(stats.null_count, stats.has_null_count),
                "distinct": _format_stat(stats.distinct_count, stats.has_distinct_count),
                "size_comp": _format_stat(col_chunk_meta.total_compressed_size,
                                          col_chunk_meta.total_compressed_size is not None),
                "size_uncomp": _format_stat(col_chunk_meta.total_uncompressed_size,
                                            col_chunk_meta.total_uncompressed_size is not None),
            }
        except IndexError:
            log.warning(f"Column index {col_index} out of bounds for row group {rg_meta.num_columns} columns.")
            return "Index Error"
        except Exception as e:
            log.error(f"Error reading column chunk metadata stats for index {col_index}: {e}", exc_info=True)
            return f"Metadata Read Error: {e}"

    def _create_stats_result(
            self,
            column_name: str,
            field: Optional[pa.Field],
            calculated_stats: Optional[Dict] = None,
            metadata_stats: Optional[Dict] = None,
            metadata_stats_error: Optional[str] = None,
            calculation_error: Optional[str] = None,
            message: Optional[str] = None
    ) -> Dict[str, Any]:
        """Consistently packages the results of column statistics calculation."""
        calculated_stats_dict = calculated_stats if calculated_stats is not None else {}

        col_type_str = "Unknown"
        col_nullable = None
        if field:
            try:
                col_type_str = self._format_pyarrow_type(field.type)
                col_nullable = field.nullable
            except Exception as e:
                log.error(f"Error formatting type for column {column_name}: {e}")
                col_type_str = f"[Error formatting: {field.type}]"
                col_nullable = None

        return {
            "column": column_name,
            "type": col_type_str,
            "nullable": col_nullable,
            "calculated": calculated_stats_dict,
            "basic_metadata_stats": metadata_stats,
            "metadata_stats_error": metadata_stats_error,
            "error": calculation_error,
            "message": message
        }

    def _format_size(self, num_bytes: int) -> str:
        """Formats bytes into a human-readable string (KB, MB, GB)."""
        if num_bytes < 1024:
            return f"{num_bytes} Bytes"
        elif num_bytes < 1024 ** 2:
            return f"{num_bytes / 1024:.2f} KB"
        elif num_bytes < 1024 ** 3:
            return f"{num_bytes / 1024 ** 2:.2f} MB"
        else:
            return f"{num_bytes / 1024 ** 3:.2f} GB"
parqv/views/metadata_view.py CHANGED
@@ -1,6 +1,8 @@
+import logging
 from textual.containers import VerticalScroll
 from textual.widgets import Static, Pretty

+log = logging.getLogger(__name__)

 class MetadataView(VerticalScroll):

@@ -8,12 +10,17 @@ class MetadataView(VerticalScroll):
         self.load_metadata()

     def load_metadata(self):
+        self.query("*").remove()
         try:
             if self.app.handler:
                 meta_data = self.app.handler.get_metadata_summary()
-
-
+                if meta_data.get("error"):
+                    self.mount(Static(f"[red]Error getting metadata: {meta_data['error']}[/red]", classes="error-content"))
+                else:
+                    pretty_widget = Pretty(meta_data)
+                    self.mount(pretty_widget)
             else:
-                self.mount(Static("
+                self.mount(Static("[red]Data handler not available.[/red]", classes="error-content"))
         except Exception as e:
-
+            log.exception("Error loading metadata view")
+            self.mount(Static(f"[red]Error loading metadata: {e}[/red]", classes="error-content"))