parqv 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,640 @@
+ import logging
+ from pathlib import Path
+ from typing import Any, Dict, List, Tuple, Optional, Union
+
+ import pandas as pd
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ import pyarrow.parquet as pq
+
+ from .base_handler import DataHandler, DataHandlerError
+
+ log = logging.getLogger(__name__)
+
+
+ class ParquetHandlerError(DataHandlerError):
+     """Custom exception for Parquet Handler errors."""
+     pass
+
+
+ class ParquetHandler(DataHandler):
+     """
+     Handles Parquet file interactions using PyArrow.
+
+     Provides methods to access metadata, schema, data preview, and column statistics.
+     Manages the Parquet file resource lifecycle.
+     """
+
+     def __init__(self, file_path: Path):
+         """
+         Initializes the ParquetHandler by validating the path and opening the Parquet file.
+
+         Args:
+             file_path: Path to the Parquet file.
+
+         Raises:
+             ParquetHandlerError: If the file is not found, not a file, or cannot be opened/read.
+         """
+         super().__init__(file_path)
+         self.pq_file: Optional[pq.ParquetFile] = None
+         self.schema: Optional[pa.Schema] = None
+         self.metadata: Optional[pq.FileMetaData] = None
+
+         try:
+             # Validate file existence using the path stored by the base class
+             if not self.file_path.is_file():
+                 raise FileNotFoundError(f"Parquet file not found or is not a file: {self.file_path}")
+
+             # Open the Parquet file
+             self.pq_file = pq.ParquetFile(self.file_path)
+             self.schema = self.pq_file.schema_arrow
+             self.metadata = self.pq_file.metadata
+             log.info(f"Successfully initialized ParquetHandler for: {self.file_path.name}")
+
+         except FileNotFoundError as fnf_e:
+             log.error(f"File not found during ParquetHandler initialization: {fnf_e}")
+             raise ParquetHandlerError(str(fnf_e)) from fnf_e
+         except pa.lib.ArrowIOError as arrow_io_e:
+             log.error(f"Arrow IO Error initializing ParquetHandler for {self.file_path.name}: {arrow_io_e}")
+             raise ParquetHandlerError(
+                 f"Failed to open Parquet file '{self.file_path.name}': {arrow_io_e}") from arrow_io_e
+         except Exception as e:
+             log.exception(f"Unexpected error initializing ParquetHandler for {self.file_path.name}")
+             self.close()
+             raise ParquetHandlerError(f"Failed to initialize Parquet handler '{self.file_path.name}': {e}") from e
+
+     # Resource Management
+     def close(self) -> None:
+         """Closes the Parquet file resource if it's open."""
+         if self.pq_file is not None:
+             try:
+                 # ParquetFile might not have a close method depending on source, check first
+                 if hasattr(self.pq_file, 'close'):
+                     self.pq_file.close()
+                     log.info(f"Closed Parquet file: {self.file_path.name}")
+             except Exception as e:
+                 # Log error during close but don't raise, as we're cleaning up
+                 log.warning(f"Exception while closing Parquet file {self.file_path.name}: {e}")
+             finally:
+                 self.pq_file = None
+                 self.schema = None
+                 self.metadata = None
+
+     def __enter__(self):
+         """Enter the runtime context related to this object."""
+         if not self.pq_file:
+             raise ParquetHandlerError("Parquet file is not open or handler was closed.")
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Exit the runtime context related to this object, ensuring cleanup."""
+         self.close()
+
+     def __del__(self):
+         """Attempt to close the file when the object is garbage collected (best effort)."""
+         self.close()
+
+     def get_metadata_summary(self) -> Dict[str, Any]:
+         """
+         Provides a summary dictionary of the Parquet file's metadata.
+
+         Returns:
+             A dictionary containing key metadata attributes, or an error dictionary.
+         """
+         if not self.metadata or not self.schema:
+             log.warning(f"Metadata or schema not available for summary: {self.file_path.name}")
+             return {"error": "Metadata or schema not available"}
+
+         try:
+             created_by = self._decode_metadata_bytes(self.metadata.created_by) or "N/A"
+             file_size = self.file_path.stat().st_size
+             summary = {
+                 "File Path": str(self.file_path.resolve()),
+                 "Format": "Parquet",
+                 "Size": self._format_size(file_size),
+                 "Total Rows": f"{self.metadata.num_rows:,}",
+                 "Row Groups": self.metadata.num_row_groups,
+                 "Columns": self.metadata.num_columns,
+                 "Format Version": self.metadata.format_version,
+                 "Creator": created_by,
+                 # Parquet metadata has no separate serialization-library field; reuse created_by.
+                 "Serialization Library": created_by if self.metadata.serialized_size > 0 else "N/A",
+             }
+             kv_meta = self._decode_key_value_metadata(self.metadata.metadata)
+             if kv_meta:
+                 summary["Key Value Metadata"] = kv_meta
+
+             return summary
+         except Exception as e:
+             log.exception(f"Error generating metadata summary for {self.file_path.name}")
+             return {"error": f"Error getting metadata summary: {e}"}
+
+     def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
+         """
+         Returns a simplified list representation of the Arrow schema.
+
+         Returns:
+             A list of dictionaries, each describing a column (name, type string, nullable bool),
+             or None if the schema is unavailable.
+         """
+         if not self.schema:
+             log.warning(f"Schema is not available for get_schema_data: {self.file_path.name}")
+             return None
+
+         schema_list = []
+         for field in self.schema:
+             try:
+                 type_str = self._format_pyarrow_type(field.type)
+                 schema_list.append({
+                     "name": field.name,
+                     "type": type_str,
+                     "nullable": field.nullable
+                 })
+             except Exception as e:
+                 log.error(f"Error processing field '{field.name}' for schema data: {e}", exc_info=True)
+                 schema_list.append({
+                     "name": field.name,
+                     "type": f"[Error: {e}]",
+                     "nullable": None
+                 })
+         return schema_list
+
+     def get_data_preview(self, num_rows: int = 50) -> pd.DataFrame:
+         """
+         Fetches a preview of the data from the Parquet file using efficient batch iteration.
+
+         Args:
+             num_rows: The maximum number of rows to fetch.
+
+         Returns:
+             A pandas DataFrame with the preview data, potentially using ArrowDTypes.
+             Returns an empty DataFrame if the file is empty or no data is read.
+             Returns a DataFrame with an 'error' column on failure.
+         """
+         if not self.pq_file:
+             log.warning(f"ParquetFile handler not available for data preview: {self.file_path.name}")
+             return pd.DataFrame({"error": ["Parquet handler not initialized or closed."]})
+
+         if self.metadata and self.metadata.num_rows == 0:
+             log.info(f"Parquet file is empty based on metadata: {self.file_path.name}")
+             if self.schema:
+                 return pd.DataFrame(columns=self.schema.names)
+             else:
+                 return pd.DataFrame()
+
+         try:
+             # Determine rows to fetch, capped by file total
+             num_rows_to_fetch = num_rows
+             if self.metadata:
+                 num_rows_to_fetch = min(num_rows, self.metadata.num_rows)
+
+             # Use iter_batches for memory efficiency
+             batches = []
+             rows_read = 0
+             internal_batch_size = min(max(num_rows_to_fetch // 2, 1024), 65536)
+
+             for batch in self.pq_file.iter_batches(batch_size=internal_batch_size):
+                 if rows_read >= num_rows_to_fetch:
+                     break
+                 rows_needed_in_batch = num_rows_to_fetch - rows_read
+                 slice_len = min(len(batch), rows_needed_in_batch)
+                 batches.append(batch.slice(0, slice_len))
+                 rows_read += slice_len
+                 if rows_read >= num_rows_to_fetch:
+                     break
+
+             if not batches:
+                 # Check if file might have rows but reading yielded nothing
+                 if self.metadata and self.metadata.num_rows > 0:
+                     log.warning(
+                         f"No batches read for preview, though metadata indicates {self.metadata.num_rows} rows: {self.file_path.name}")
+                 else:
+                     log.info(f"No data read for preview (file likely empty): {self.file_path.name}")
+                 # Return empty DF with columns if schema available
+                 if self.schema:
+                     return pd.DataFrame(columns=self.schema.names)
+                 else:
+                     return pd.DataFrame()
+
+             # Combine batches and convert to Pandas
+             preview_table = pa.Table.from_batches(batches)
+             df = preview_table.to_pandas(
+                 split_blocks=True,
+                 self_destruct=True,
+                 types_mapper=pd.ArrowDtype
+             )
+             log.info(f"Generated preview of {len(df)} rows for {self.file_path.name}")
+             return df
+
+         except Exception as e:
+             log.exception(f"Error generating data preview from Parquet file: {self.file_path.name}")
+             return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]})
+
+     def get_column_stats(self, column_name: str) -> Dict[str, Any]:
+         """
+         Calculates statistics for a specific column by reading its data.
+
+         Args:
+             column_name: The name of the column to analyze.
+
+         Returns:
+             A dictionary containing calculated statistics, metadata statistics,
+             and potential error or message keys.
+         """
+         if not self.pq_file or not self.schema:
+             log.warning(f"Parquet file/schema unavailable for column stats: {self.file_path.name}")
+             return self._create_stats_result(column_name, None, calculation_error="File or schema not available")
+
+         try:
+             field = self.schema.field(column_name)
+         except KeyError:
+             log.warning(f"Column '{column_name}' not found in schema: {self.file_path.name}")
+             return self._create_stats_result(column_name, None, calculation_error=f"Column '{column_name}' not found in schema")
+
+         calculated_stats: Dict[str, Any] = {}
+         error_msg: Optional[str] = None
+         message: Optional[str] = None
+         metadata_stats: Optional[Dict] = None
+         metadata_stats_error: Optional[str] = None
+
+         try:
+             # Data Reading
+             table = self.pq_file.read(columns=[column_name])
+             column_data = table.column(0)
+             log.debug(
+                 f"Finished reading column '{column_name}'. Rows: {len(column_data)}, Nulls: {column_data.null_count}")
+
+             # Basic Counts
+             total_count = len(column_data)
+             if total_count > 0:
+                 null_count = column_data.null_count
+                 valid_count = total_count - null_count
+                 calculated_stats["Total Count"] = f"{total_count:,}"
+                 calculated_stats["Valid Count"] = f"{valid_count:,}"
+                 calculated_stats["Null Count"] = f"{null_count:,}"
+                 calculated_stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%"
+             else:
+                 log.info(f"Column '{column_name}' read resulted in 0 rows.")
+                 message = "Column is empty (0 rows)."
+                 valid_count = 0  # Ensure valid_count is 0 for later checks
+
+             # Type-Specific Calculations
+             if valid_count > 0:
+                 col_type = field.type
+                 log.debug(f"Calculating stats for type: {self._format_pyarrow_type(col_type)}")
+                 try:
+                     if pa.types.is_floating(col_type) or pa.types.is_integer(col_type):
+                         calculated_stats.update(self._calculate_numeric_stats(column_data))
+                     elif pa.types.is_temporal(col_type):
+                         calculated_stats.update(self._calculate_temporal_stats(column_data))
+                     elif pa.types.is_string(col_type) or pa.types.is_large_string(col_type) \
+                             or pa.types.is_binary(col_type) or pa.types.is_large_binary(col_type):
+                         calculated_stats.update(self._calculate_string_binary_stats(column_data))
+                     elif pa.types.is_boolean(col_type):
+                         calculated_stats.update(self._calculate_boolean_stats(column_data))
+                     elif pa.types.is_dictionary(col_type):
+                         calculated_stats.update(self._calculate_dictionary_stats(column_data, col_type))
+                         message = calculated_stats.pop("message", message)
+                     elif pa.types.is_struct(col_type) or pa.types.is_list(col_type) or pa.types.is_map(col_type) \
+                             or pa.types.is_fixed_size_list(col_type) or pa.types.is_union(col_type):
+                         calculated_stats.update(self._calculate_complex_type_stats(column_data, col_type))
+                         message = f"Basic aggregate stats (min/max/mean) not applicable for complex type '{self._format_pyarrow_type(col_type)}'."
+                     else:
+                         log.warning(f"Statistics calculation not fully implemented for type: {col_type}")
+                         message = f"Statistics calculation not implemented for type '{self._format_pyarrow_type(col_type)}'."
+
+                 except Exception as calc_err:
+                     log.exception(f"Error during type-specific calculation for column '{column_name}': {calc_err}")
+                     error_msg = f"Calculation error for type {field.type}: {calc_err}"
+                     calculated_stats["Calculation Error"] = str(calc_err)  # Add specific error key
+
+             elif total_count > 0:
+                 message = "Column contains only NULL values."
+
+             # Metadata Statistics
+             metadata_stats, metadata_stats_error = self._get_stats_from_metadata(column_name)
+
+         except pa.lib.ArrowException as arrow_e:
+             log.exception(f"Arrow error during stats processing for column '{column_name}': {arrow_e}")
+             error_msg = f"Arrow processing error: {arrow_e}"
+         except Exception as e:
+             log.exception(f"Unexpected error during stats calculation for column '{column_name}'")
+             error_msg = f"Calculation failed unexpectedly: {e}"
+
+         return self._create_stats_result(
+             column_name, field, calculated_stats, metadata_stats, metadata_stats_error, error_msg, message
+         )
+
+     def _decode_metadata_bytes(self, value: Optional[Union[bytes, str]]) -> Optional[str]:
+         """Safely decodes bytes metadata values to UTF-8 strings, replacing errors."""
+         if isinstance(value, bytes):
+             try:
+                 return value.decode('utf-8', errors='replace')
+             except Exception as e:
+                 log.warning(f"Could not decode metadata bytes: {e}. Value: {value!r}")
+                 return f"[Decode Error: {value!r}]"
+         return str(value) if value is not None else None
+
+     def _decode_key_value_metadata(self, kv_meta: Optional[Dict[Union[str, bytes], Union[str, bytes]]]) -> Optional[
+             Dict[str, str]]:
+         """Decodes keys and values of the key-value metadata dictionary."""
+         if not kv_meta:
+             return None
+         decoded_kv = {}
+         try:
+             for k, v in kv_meta.items():
+                 key_str = self._decode_metadata_bytes(k) or "[Invalid Key]"
+                 val_str = self._decode_metadata_bytes(v) or "[Invalid Value]"
+                 decoded_kv[key_str] = val_str
+             return decoded_kv
+         except Exception as e:
+             log.warning(f"Could not decode key-value metadata: {e}")
+             return {"error": f"Error decoding key-value metadata: {e}"}
+
+     def _format_pyarrow_type(self, field_type: pa.DataType) -> str:
+         """Formats a PyArrow DataType into a readable string, including details."""
+         if pa.types.is_timestamp(field_type):
+             tz_str = f", tz='{field_type.tz}'" if field_type.tz else ""
+             return f"TIMESTAMP(unit='{field_type.unit}'{tz_str})"
+         if pa.types.is_time32(field_type) or pa.types.is_time64(field_type):
+             return f"TIME(unit='{field_type.unit}')"
+         if pa.types.is_duration(field_type):
+             return f"DURATION(unit='{field_type.unit}')"
+         if pa.types.is_decimal128(field_type) or pa.types.is_decimal256(field_type):
+             return f"DECIMAL({field_type.precision}, {field_type.scale})"
+         if pa.types.is_fixed_size_binary(field_type):
+             return f"FIXED_SIZE_BINARY({field_type.byte_width})"
+         if pa.types.is_list(field_type) or pa.types.is_large_list(field_type) or pa.types.is_fixed_size_list(
+                 field_type):
+             prefix = "LIST"
+             if pa.types.is_large_list(field_type): prefix = "LARGE_LIST"
+             if pa.types.is_fixed_size_list(field_type): prefix = f"FIXED_SIZE_LIST({field_type.list_size})"
+             value_type_str = self._format_pyarrow_type(field_type.value_type)
+             return f"{prefix}<item: {value_type_str}>"
+         if pa.types.is_struct(field_type):
+             num_fields_to_show = 3
+             shown = (field_type.field(i) for i in range(min(field_type.num_fields, num_fields_to_show)))
+             field_details = ", ".join(f"{f.name}: {self._format_pyarrow_type(f.type)}" for f in shown)
+             suffix = "..." if field_type.num_fields > num_fields_to_show else ""
+             return f"STRUCT<{field_details}{suffix}>"
+         if pa.types.is_map(field_type):
+             keys_sorted = getattr(field_type, 'keys_sorted', False)
+             sorted_str = ", keys_sorted" if keys_sorted else ""
+             key_type_str = self._format_pyarrow_type(field_type.key_type)
+             item_type_str = self._format_pyarrow_type(field_type.item_type)
+             return f"MAP<key: {key_type_str}, value: {item_type_str}{sorted_str}>"
+         if pa.types.is_dictionary(field_type):
+             index_type_str = self._format_pyarrow_type(field_type.index_type)
+             value_type_str = self._format_pyarrow_type(field_type.value_type)
+             ordered = getattr(field_type, 'ordered', False)
+             return f"DICTIONARY<indices: {index_type_str}, values: {value_type_str}{', ordered' if ordered else ''}>"
+         if pa.types.is_union(field_type):
+             type_codes = getattr(field_type, 'type_codes', [])
+             mode = getattr(field_type, 'mode', 'sparse')
+             shown = (field_type.field(i) for i in range(min(field_type.num_fields, 3)))  # Show first few fields
+             field_details = ", ".join(f"{f.name}: {self._format_pyarrow_type(f.type)}" for f in shown)
+             suffix = "..." if field_type.num_fields > 3 else ""
+             return f"UNION<{field_details}{suffix}> (mode='{mode}', codes={type_codes[:5]}{'...' if len(type_codes) > 5 else ''})"
+
+         return str(field_type).upper()
+
+     def _safe_compute(self, func, data, *args, **kwargs) -> Tuple[Optional[Any], Optional[str]]:
+         """Helper to safely execute a pyarrow.compute function and handle errors."""
+         if data.null_count == len(data):
+             return None, "Input data is all NULL"
+         try:
+             result_scalar = func(data, *args, **kwargs)
+             return result_scalar.as_py() if result_scalar.is_valid else None, None
+         except pa.lib.ArrowNotImplementedError:
+             return None, "Not Implemented"
+         except Exception as e:
+             return None, f"Compute Error: {e}"
+
+     def _calculate_numeric_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
+         """Calculates min, max, mean, stddev for numeric columns using _safe_compute."""
+         stats: Dict[str, Any] = {}
+         min_val, err = self._safe_compute(pc.min, column_data)
+         stats["Min"] = min_val if err is None else err
+         max_val, err = self._safe_compute(pc.max, column_data)
+         stats["Max"] = max_val if err is None else err
+         mean_val, err = self._safe_compute(pc.mean, column_data)
+         stats["Mean"] = f"{mean_val:.4f}" if mean_val is not None and err is None else (err or "N/A")
+         stddev_val, err = self._safe_compute(pc.stddev, column_data, ddof=1)
+         stats["StdDev"] = f"{stddev_val:.4f}" if stddev_val is not None and err is None else (err or "N/A")
+         if stats["StdDev"] == "Not Implemented":
+             variance_val, err_var = self._safe_compute(pc.variance, column_data, ddof=1)
+             stats["Variance"] = f"{variance_val:.4f}" if variance_val is not None and err_var is None else (
+                 err_var or "N/A")
+
+         return stats
+
+     def _calculate_temporal_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
+         """Calculates min and max for temporal columns using _safe_compute."""
+         stats: Dict[str, Any] = {}
+         min_val, err = self._safe_compute(pc.min, column_data)
+         stats["Min"] = min_val if err is None else err  # .as_py() handles conversion
+         max_val, err = self._safe_compute(pc.max, column_data)
+         stats["Max"] = max_val if err is None else err
+         return stats
+
+     def _calculate_string_binary_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
+         """Calculates distinct count and optionally length stats for string/binary."""
+         stats: Dict[str, Any] = {}
+         distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
+         stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")
+
+         if pa.types.is_string(column_data.type) or pa.types.is_large_string(column_data.type):
+             try:
+                 # pc.binary_length returns an array (not a scalar), so call it directly
+                 lengths = pc.binary_length(column_data)
+                 min_len, err_min = self._safe_compute(pc.min, lengths)
+                 stats["Min Length"] = min_len if err_min is None else err_min
+                 max_len, err_max = self._safe_compute(pc.max, lengths)
+                 stats["Max Length"] = max_len if err_max is None else err_max
+                 avg_len, err_avg = self._safe_compute(pc.mean, lengths)
+                 stats["Avg Length"] = f"{avg_len:.2f}" if avg_len is not None and err_avg is None else (err_avg or "N/A")
+             except Exception:
+                 stats.update({"Min Length": "Error", "Max Length": "Error", "Avg Length": "Error"})
+         return stats
+
+     def _calculate_boolean_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
+         """Calculates value counts (True/False) for boolean columns."""
+         stats: Dict[str, Any] = {}
+         try:
+             if column_data.null_count == len(column_data):
+                 stats["Value Counts"] = "All NULL"
+                 return stats
+
+             # value_counts returns a StructArray [{values: bool, counts: int64}, ...]
+             value_counts_struct = pc.value_counts(column_data)
+             counts_dict = {}
+             if len(value_counts_struct) > 0:
+                 for i in range(len(value_counts_struct)):
+                     value = value_counts_struct.field("values")[i].as_py()
+                     count = value_counts_struct.field("counts")[i].as_py()
+                     counts_dict[value] = count  # Keys are True/False
+
+             stats["Value Counts"] = {str(k): f"{v:,}" for k, v in counts_dict.items()}
+             # Ensure both True and False are present, even if count is 0
+             if 'True' not in stats["Value Counts"]: stats["Value Counts"]['True'] = "0"
+             if 'False' not in stats["Value Counts"]: stats["Value Counts"]['False'] = "0"
+
+         except Exception as vc_e:
+             log.warning(f"Boolean value count calculation error: {vc_e}", exc_info=True)
+             stats["Value Counts"] = "Error calculating"
+         return stats
+
+     def _calculate_dictionary_stats(self, column_data: pa.ChunkedArray, col_type: pa.DictionaryType) -> Dict[str, Any]:
+         """Calculates stats for dictionary type based on its value type."""
+         stats: Dict[str, Any] = {"message": "Stats calculated on dictionary values."}  # Start with message
+         try:
+             unwrapped_data = column_data.dictionary_decode()
+             value_type = col_type.value_type
+             log.debug(f"Calculating dictionary stats based on value type: {value_type}")
+
+             # Delegate calculation based on the *value* type
+             if pa.types.is_floating(value_type) or pa.types.is_integer(value_type):
+                 stats.update(self._calculate_numeric_stats(unwrapped_data))
+             elif pa.types.is_temporal(value_type):
+                 stats.update(self._calculate_temporal_stats(unwrapped_data))
+             elif pa.types.is_string(value_type) or pa.types.is_large_string(value_type) \
+                     or pa.types.is_binary(value_type) or pa.types.is_large_binary(value_type):
+                 stats.update(self._calculate_string_binary_stats(unwrapped_data))
+             # Add other dictionary value types if necessary (boolean, etc.)
+             else:
+                 stats["message"] += (
+                     f" (Stats for value type '{self._format_pyarrow_type(value_type)}' not fully implemented).")
+             # Calculate distinct count on the original dictionary array (can be faster)
+             distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
+             stats["Distinct Values (Approx)"] = (
+                 f"{distinct_val:,}" if distinct_val is not None and err is None
+                 else (err or "N/A"))
+
+         except pa.lib.ArrowException as arrow_decode_err:
+             log.warning(f"Arrow error decoding dictionary type for stats: {arrow_decode_err}")
+             stats["Dictionary Error"] = f"Decode Error: {arrow_decode_err}"
+         except Exception as dict_e:
+             log.warning(f"Could not process dictionary type for stats: {dict_e}")
+             stats["Dictionary Error"] = f"Processing Error: {dict_e}"
+         return stats
+
+     def _calculate_complex_type_stats(self, column_data: pa.ChunkedArray, col_type: pa.DataType) -> Dict[str, Any]:
+         """Calculates basic stats (like distinct count) for complex types."""
+         stats: Dict[str, Any] = {}
+         # Distinct count is often the most feasible stat for complex types
+         distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
+         # Note: Distinct count on complex types can be approximate or may error depending on type
+         stats["Distinct Count (Approx)"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (
+             err or "N/A")
+         return stats
+
+     def _get_stats_from_metadata(self, column_name: str) -> Tuple[Dict[str, Any], Optional[str]]:
+         """Retrieves statistics stored within the Parquet file metadata per row group."""
+         metadata_stats: Dict[str, Any] = {}
+         error_str: Optional[str] = None
+
+         if not self.metadata or not self.schema:
+             return {}, "Metadata or Schema not available"
+
+         try:
+             col_index = self.schema.get_field_index(column_name)
+             if col_index < 0: raise KeyError(column_name)  # get_field_index returns -1 instead of raising
+             for i in range(self.metadata.num_row_groups):
+                 group_key = f"RG {i}"
+                 try:
+                     rg_meta = self.metadata.row_group(i)
+                     metadata_stats[group_key] = self._extract_stats_for_single_group(rg_meta, col_index)
+                 except IndexError:
+                     log.warning(f"Column index {col_index} out of bounds for row group {i}.")
+                     metadata_stats[group_key] = "Index Error"
+                 except Exception as e:
+                     log.warning(f"Error processing metadata stats for RG {i}, column '{column_name}': {e}")
+                     metadata_stats[group_key] = f"Read Error: {e}"
+
+         except KeyError:
+             log.warning(f"Column '{column_name}' not found in schema for metadata stats.")
+             error_str = f"Column '{column_name}' not found in schema"
+         except Exception as e:
+             log.exception(f"Failed to get metadata statistics structure for column '{column_name}'.")
+             error_str = f"Error accessing metadata structure: {e}"
+
+         return metadata_stats, error_str
+
+     def _extract_stats_for_single_group(self, rg_meta: pq.RowGroupMetaData, col_index: int) -> Union[
+             str, Dict[str, Any]]:
+         """Extracts stats from a column chunk's metadata within a row group."""
+         try:
+             col_chunk_meta = rg_meta.column(col_index)
+             stats = col_chunk_meta.statistics
+             if not stats: return "No stats in metadata"
+
+             def _format_stat(value, is_present, is_numeric=True):
+                 if not is_present: return "N/A"
+                 try:
+                     # Attempt to format nicely, fallback to repr for safety
+                     return f"{value:,}" if is_numeric else str(value)
+                 except Exception:
+                     return repr(value)
+
+             return {
+                 "min": _format_stat(stats.min, stats.has_min_max, is_numeric=False),
+                 "max": _format_stat(stats.max, stats.has_min_max, is_numeric=False),
+                 "nulls": _format_stat(stats.null_count, stats.has_null_count),
+                 "distinct": _format_stat(stats.distinct_count, stats.has_distinct_count),
+                 "size_comp": _format_stat(col_chunk_meta.total_compressed_size,
+                                           col_chunk_meta.total_compressed_size is not None),
+                 "size_uncomp": _format_stat(col_chunk_meta.total_uncompressed_size,
+                                             col_chunk_meta.total_uncompressed_size is not None),
+             }
+         except IndexError:
+             log.warning(f"Column index {col_index} out of bounds for row group with {rg_meta.num_columns} columns.")
+             return "Index Error"
+         except Exception as e:
+             log.error(f"Error reading column chunk metadata stats for index {col_index}: {e}", exc_info=True)
+             return f"Metadata Read Error: {e}"
+
+     def _create_stats_result(
+             self,
+             column_name: str,
+             field: Optional[pa.Field],
+             calculated_stats: Optional[Dict] = None,
+             metadata_stats: Optional[Dict] = None,
+             metadata_stats_error: Optional[str] = None,
+             calculation_error: Optional[str] = None,
+             message: Optional[str] = None
+     ) -> Dict[str, Any]:
+         """Consistently packages the results of column statistics calculation."""
+         calculated_stats_dict = calculated_stats if calculated_stats is not None else {}
+
+         col_type_str = "Unknown"
+         col_nullable = None
+         if field:
+             try:
+                 col_type_str = self._format_pyarrow_type(field.type)
+                 col_nullable = field.nullable
+             except Exception as e:
+                 log.error(f"Error formatting type for column {column_name}: {e}")
+                 col_type_str = f"[Error formatting: {field.type}]"
+                 col_nullable = None
+
+         return {
+             "column": column_name,
+             "type": col_type_str,
+             "nullable": col_nullable,
+             "calculated": calculated_stats_dict,
+             "basic_metadata_stats": metadata_stats,
+             "metadata_stats_error": metadata_stats_error,
+             "error": calculation_error,
+             "message": message
+         }
+
+     def _format_size(self, num_bytes: int) -> str:
+         """Formats bytes into a human-readable string (KB, MB, GB)."""
+         if num_bytes < 1024:
+             return f"{num_bytes} Bytes"
+         elif num_bytes < 1024 ** 2:
+             return f"{num_bytes / 1024:.2f} KB"
+         elif num_bytes < 1024 ** 3:
+             return f"{num_bytes / 1024 ** 2:.2f} MB"
+         else:
+             return f"{num_bytes / 1024 ** 3:.2f} GB"
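For context, the handler added above is designed to be used as a context manager (see __enter__/__exit__). A minimal usage sketch follows; the file path and column name are illustrative, not taken from the package:

    from pathlib import Path

    # Hypothetical usage of the ParquetHandler class added in this version.
    with ParquetHandler(Path("example.parquet")) as handler:
        summary = handler.get_metadata_summary()          # file-level metadata dict
        schema = handler.get_schema_data()                # [{name, type, nullable}, ...]
        preview = handler.get_data_preview(num_rows=20)   # pandas DataFrame preview
        stats = handler.get_column_stats("some_column")   # per-column statistics dict

On any failure these methods return error-carrying dicts or DataFrames rather than raising, so callers (such as the views below) can render the error inline.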
@@ -1,6 +1,8 @@
+ import logging
  from textual.containers import VerticalScroll
  from textual.widgets import Static, Pretty

+ log = logging.getLogger(__name__)

  class MetadataView(VerticalScroll):

@@ -8,12 +10,17 @@ class MetadataView(VerticalScroll):
          self.load_metadata()

      def load_metadata(self):
+         self.query("*").remove()
          try:
              if self.app.handler:
                  meta_data = self.app.handler.get_metadata_summary()
-                 pretty_widget = Pretty(meta_data)
-                 self.mount(pretty_widget)
+                 if meta_data.get("error"):
+                     self.mount(Static(f"[red]Error getting metadata: {meta_data['error']}[/red]", classes="error-content"))
+                 else:
+                     pretty_widget = Pretty(meta_data)
+                     self.mount(pretty_widget)
              else:
-                 self.mount(Static("Parquet handler not available.", classes="error-content"))
+                 self.mount(Static("[red]Data handler not available.[/red]", classes="error-content"))
          except Exception as e:
-             self.mount(Static(f"Error loading metadata: {e}", classes="error-content"))
+             log.exception("Error loading metadata view")
+             self.mount(Static(f"[red]Error loading metadata: {e}[/red]", classes="error-content"))
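MetadataView reads self.app.handler, so it assumes the hosting Textual App exposes a handler attribute. A minimal sketch of such a host app (hypothetical; parqv's real App class is not part of this hunk, and MetadataView is assumed constructible without required arguments):

    from textual.app import App, ComposeResult

    class HostApp(App):
        def __init__(self, handler):
            super().__init__()
            self.handler = handler  # e.g. a ParquetHandler instance, as MetadataView expects

        def compose(self) -> ComposeResult:
            yield MetadataView()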