parqv 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
- parqv/__init__.py +31 -0
- parqv/app.py +84 -102
- parqv/cli.py +112 -0
- parqv/core/__init__.py +31 -0
- parqv/core/config.py +26 -0
- parqv/core/file_utils.py +88 -0
- parqv/core/handler_factory.py +90 -0
- parqv/core/logging.py +46 -0
- parqv/data_sources/__init__.py +48 -0
- parqv/data_sources/base/__init__.py +28 -0
- parqv/data_sources/base/exceptions.py +38 -0
- parqv/{handlers/base_handler.py → data_sources/base/handler.py} +54 -25
- parqv/{handlers → data_sources/formats}/__init__.py +13 -5
- parqv/data_sources/formats/csv.py +460 -0
- parqv/{handlers → data_sources/formats}/json.py +68 -32
- parqv/{handlers → data_sources/formats}/parquet.py +67 -56
- parqv/views/__init__.py +38 -0
- parqv/views/base.py +98 -0
- parqv/views/components/__init__.py +13 -0
- parqv/views/components/enhanced_data_table.py +152 -0
- parqv/views/components/error_display.py +72 -0
- parqv/views/components/loading_display.py +44 -0
- parqv/views/data_view.py +119 -46
- parqv/views/metadata_view.py +57 -20
- parqv/views/schema_view.py +190 -200
- parqv/views/utils/__init__.py +19 -0
- parqv/views/utils/data_formatters.py +184 -0
- parqv/views/utils/stats_formatters.py +220 -0
- parqv/views/utils/visualization.py +204 -0
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/METADATA +5 -6
- parqv-0.3.0.dist-info/RECORD +36 -0
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/WHEEL +1 -1
- parqv-0.2.0.dist-info/RECORD +0 -17
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/entry_points.txt +0 -0
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/top_level.txt +0 -0
parqv/{handlers → data_sources/formats}/json.py
CHANGED
(Note: the diff viewer this listing was extracted from did not capture the old text of replaced lines; such removals appear below as bare "-" markers.)

```diff
@@ -1,13 +1,10 @@
-import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 import duckdb
 import pandas as pd
 
-from
-
-log = logging.getLogger(__name__)
+from ..base import DataHandler, DataHandlerError
 
 
 class JsonHandlerError(DataHandlerError):
@@ -38,6 +35,8 @@ class JsonHandler(DataHandler):
             JsonHandlerError: If the file doesn't exist, isn't a file, or if
                 initialization fails (e.g., DuckDB connection, view creation).
         """
+        super().__init__(file_path)
+
         self.file_path = self._validate_file_path(file_path)
         self._db_conn: Optional[duckdb.DuckDBPyConnection] = None
         self._view_name: str = self.DEFAULT_VIEW_NAME
@@ -48,9 +47,9 @@ class JsonHandler(DataHandler):
             self._connect_db()
             self._create_duckdb_view()
             self._load_metadata()
-
+            self.logger.info(f"JsonHandler initialized successfully for: {self.file_path}")
         except Exception as e:
-
+            self.logger.exception(f"Error during JsonHandler initialization for {self.file_path}")
             self.close()
             if isinstance(e, JsonHandlerError):
                 raise
@@ -67,9 +66,9 @@ class JsonHandler(DataHandler):
         """Establishes a connection to an in-memory DuckDB database."""
         try:
             self._db_conn = duckdb.connect(database=':memory:', read_only=False)
-
+            self.logger.debug("DuckDB in-memory connection established.")
         except Exception as e:
-
+            self.logger.exception("Failed to initialize DuckDB connection.")
             raise JsonHandlerError(f"DuckDB connection failed: {e}") from e
 
     def _create_duckdb_view(self):
@@ -83,9 +82,9 @@ class JsonHandler(DataHandler):
 
         try:
             self._db_conn.sql(load_query)
-
+            self.logger.debug(f"DuckDB view '{self._view_name}' created for file '{file_path_str}'.")
         except duckdb.Error as db_err:
-
+            self.logger.exception(f"DuckDB Error creating view '{self._view_name}' from '{file_path_str}': {db_err}")
             if "Could not open file" in str(db_err):
                 raise JsonHandlerError(
                     f"DuckDB could not open file: {file_path_str}. Check permissions or path. Error: {db_err}") from db_err
@@ -95,7 +94,7 @@ class JsonHandler(DataHandler):
             else:
                 raise JsonHandlerError(f"DuckDB failed create view for JSON file: {db_err}") from db_err
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error creating DuckDB view '{self._view_name}'.")
             raise JsonHandlerError(f"Failed to create DuckDB view: {e}") from e
 
     def _load_metadata(self):
@@ -108,27 +107,27 @@ class JsonHandler(DataHandler):
             describe_query = f"DESCRIBE \"{self._view_name}\";"
             schema_result = self._db_conn.sql(describe_query).fetchall()
             self._schema = self._parse_schema(schema_result)
-
+            self.logger.debug(f"Schema fetched for view '{self._view_name}': {len(self._schema)} columns.")
 
             # Fetch Row Count
             count_query = f"SELECT COUNT(*) FROM \"{self._view_name}\";"
             count_result = self._db_conn.sql(count_query).fetchone()
             self._row_count = count_result[0] if count_result else 0
-
+            self.logger.debug(f"Row count fetched for view '{self._view_name}': {self._row_count}")
 
         except duckdb.Error as db_err:
-
+            self.logger.exception(f"DuckDB Error fetching metadata for view '{self._view_name}': {db_err}")
             self._schema = None
             self._row_count = None
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error fetching metadata for view '{self._view_name}'")
             self._schema = None
             self._row_count = None
 
     def _parse_schema(self, describe_output: List[Tuple]) -> List[Dict[str, Any]]:
         """Parses the output of DuckDB's DESCRIBE query."""
         if not describe_output:
-
+            self.logger.warning(f"DESCRIBE query for view '{self._view_name}' returned no schema info.")
             return []
 
         parsed_schema = []
```
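The whole JSON pipeline above rests on a small DuckDB pattern: register the file behind a named view, then run `DESCRIBE` and `COUNT(*)` against that view. A standalone sketch of that pattern (the file name, view name, and the use of `read_json_auto` are illustrative; the diff does not show how the handler builds its `load_query`):

```python
import duckdb

# In-memory database, matching the handler's duckdb.connect(database=':memory:').
con = duckdb.connect(database=":memory:", read_only=False)

# read_json_auto infers a schema from the JSON file; wrapping it in a view
# lets later schema, count, preview, and stats queries reuse one definition.
con.sql("CREATE VIEW json_data AS SELECT * FROM read_json_auto('example.json')")

# DESCRIBE yields one row per column:
# (column_name, column_type, null, key, default, extra).
schema_rows = con.sql('DESCRIBE "json_data";').fetchall()

# Row count feeds the metadata summary.
row_count = con.sql('SELECT COUNT(*) FROM "json_data";').fetchone()[0]
```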
```diff
@@ -141,7 +140,7 @@ class JsonHandler(DataHandler):
                 is_nullable = null_str.upper() == 'YES'
                 parsed_schema.append({"name": name, "type": type_str, "nullable": is_nullable})
             else:
-
+                self.logger.warning(f"Unexpected format in DESCRIBE output row: {row}")
         return parsed_schema
 
     def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
@@ -153,7 +152,7 @@ class JsonHandler(DataHandler):
             or None if schema couldn't be fetched.
         """
         if self._schema is None:
-
+            self.logger.warning("Schema is unavailable. It might not have been fetched successfully.")
         return self._schema
 
     def get_metadata_summary(self) -> Dict[str, Any]:
@@ -184,7 +183,7 @@ class JsonHandler(DataHandler):
         try:
             summary["Size"] = f"{self.file_path.stat().st_size:,} bytes"
         except Exception as e:
-
+            self.logger.warning(f"Could not get file size for {self.file_path}: {e}")
             summary["Size"] = "N/A"
 
         return summary
@@ -202,13 +201,13 @@ class JsonHandler(DataHandler):
             error message if fetching fails.
         """
         if not self._db_conn:
-
+            self.logger.warning("Data preview unavailable: DuckDB connection is closed or uninitialized.")
             return pd.DataFrame({"error": ["DuckDB connection not available."]})
         if self._schema is None:
-
+            self.logger.warning("Data preview unavailable: Schema couldn't be determined.")
             return pd.DataFrame({"error": ["Schema not available, cannot fetch preview."]})
         if self._row_count == 0:
-
+            self.logger.info("Data preview: Source JSON view is empty.")
             # Return empty DataFrame with correct columns if possible
             if self._schema:
                 return pd.DataFrame(columns=[col['name'] for col in self._schema])
@@ -221,10 +220,10 @@ class JsonHandler(DataHandler):
             df = self._db_conn.sql(preview_query).df()
             return df
         except duckdb.Error as db_err:
-
+            self.logger.exception(f"DuckDB error getting data preview from '{self._view_name}': {db_err}")
             return pd.DataFrame({"error": [f"DuckDB error fetching preview: {db_err}"]})
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error getting data preview from '{self._view_name}'")
             return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]})
 
     def _get_column_info(self, column_name: str) -> Optional[Dict[str, Any]]:
@@ -274,7 +273,7 @@ class JsonHandler(DataHandler):
 
         if is_complex:
             # Use basic counts for complex types as SUMMARIZE is less informative
-
+            self.logger.debug(f"Calculating basic counts for complex type column: {column_name}")
             stats = self._get_basic_column_counts(safe_column_name)
             message = f"Only basic counts calculated for complex type '{col_type}'."
             # Attempt distinct count for complex types (can be slow/error-prone)
@@ -286,13 +285,13 @@ class JsonHandler(DataHandler):
                 else:
                     stats["Distinct Count"] = "N/A"  # Or 0 if appropriate
             except duckdb.Error as distinct_err:
-
+                self.logger.warning(
                     f"Could not calculate distinct count for complex column '{column_name}': {distinct_err}")
                 stats["Distinct Count"] = "Error"
 
         else:
             # Use SUMMARIZE for non-complex types
-
+            self.logger.debug(f"Using SUMMARIZE for simple type column: {column_name}")
             summarize_query = f"SUMMARIZE SELECT {safe_column_name} FROM \"{self._view_name}\";"
             summarize_df = self._db_conn.sql(summarize_query).df()
 
```
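`SUMMARIZE` is doing the heavy lifting for simple column types: one statement returns min/max, average, approximate distinct count, quantiles, and null percentage per column. A quick illustration (the column and file names are invented; the exact output fields follow whatever the installed DuckDB version emits):

```python
import duckdb

con = duckdb.connect()
con.sql("CREATE VIEW json_data AS SELECT * FROM read_json_auto('example.json')")

# One row of aggregates for the selected column; _format_summarize_stats
# presumably reshapes this row into the stats dict shown to the user.
summary_df = con.sql('SUMMARIZE SELECT price FROM "json_data";').df()
print(summary_df.iloc[0])
```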
```diff
@@ -304,11 +303,17 @@ class JsonHandler(DataHandler):
             # SUMMARIZE puts results in the first row
             stats = self._format_summarize_stats(summarize_df.iloc[0])
 
+            # Add histogram data for numeric columns
+            try:
+                self._add_histogram_data_if_numeric(stats, safe_column_name)
+            except Exception as hist_e:
+                self.logger.warning(f"Failed to add histogram data for {column_name}: {hist_e}")
+
         except duckdb.Error as db_err:
-
+            self.logger.exception(f"DuckDB Error calculating statistics for column '{column_name}': {db_err}")
             error_msg = f"DuckDB calculation failed: {db_err}"
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error calculating statistics for column '{column_name}'")
             error_msg = f"Calculation failed unexpectedly: {e}"
 
         return self._create_stats_result(
@@ -351,7 +356,7 @@ class JsonHandler(DataHandler):
             stats["Null Percentage"] = "Error"
 
         except duckdb.Error as db_err:
-
+            self.logger.warning(f"Failed to get basic counts for {safe_column_name}: {db_err}")
             stats["Counts Error"] = str(db_err)
         return stats
 
@@ -404,6 +409,37 @@ class JsonHandler(DataHandler):
 
         return stats
 
+    def _add_histogram_data_if_numeric(self, stats: Dict[str, Any], safe_column_name: str) -> None:
+        """Add histogram data for numeric columns by sampling from DuckDB."""
+        # Check if this looks like numeric data (has Mean, Min, Max)
+        if not all(key in stats for key in ["Mean", "Min", "Max"]):
+            return
+
+        try:
+            # Sample data for histogram (limit to 10k samples for performance)
+            sample_query = f"""
+                SELECT {safe_column_name}
+                FROM "{self._view_name}"
+                WHERE {safe_column_name} IS NOT NULL
+                USING SAMPLE 10000
+            """
+
+            sample_df = self._db_conn.sql(sample_query).df()
+
+            if not sample_df.empty and len(sample_df) > 10:
+                # Extract the column data
+                column_data = sample_df.iloc[:, 0].tolist()
+
+                # Filter out any remaining nulls
+                clean_data = [val for val in column_data if val is not None]
+
+                if len(clean_data) > 10:
+                    stats["_histogram_data"] = clean_data
+                    stats["_data_type"] = "numeric"
+
+        except Exception as e:
+            self.logger.warning(f"Failed to sample data for histogram: {e}")
+
     def _create_stats_result(
             self,
             column_name: str,
```
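Two details of `_add_histogram_data_if_numeric` above are easy to miss. `USING SAMPLE 10000` is DuckDB's built-in sampling clause, so the 10k cap is enforced inside the database rather than by pulling the full column into pandas first. And the underscore-prefixed `_histogram_data` / `_data_type` keys look like a private channel to the rendering layer (presumably the new `views/utils/visualization.py` listed above): raw sampled values travel alongside the displayable stats without being intended as rows in the stats table.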
```diff
@@ -430,11 +466,11 @@ class JsonHandler(DataHandler):
         if self._db_conn:
             try:
                 self._db_conn.close()
-
+                self.logger.info(f"DuckDB connection closed for {self.file_path}.")
                 self._db_conn = None
             except Exception as e:
                 # Log error but don't raise during close typically
-
+                self.logger.error(f"Error closing DuckDB connection for {self.file_path}: {e}")
                 self._db_conn = None  # Assume closed even if error occurred
 
     def __enter__(self):
```
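Both handlers now lean on the shared base in `parqv/data_sources/base/handler.py` (renamed from `handlers/base_handler.py` per the file list) for `self.logger` and the `super().__init__(file_path)` call. The base class itself is not shown in this diff; a hypothetical sketch of the relevant fragment, for orientation only (everything below except the two imported names is a guess):

```python
import logging
from pathlib import Path


class DataHandlerError(Exception):
    """Shared error type that JsonHandlerError/ParquetHandlerError extend."""


class DataHandler:
    """Hypothetical fragment: what a base __init__ would need to provide
    for the subclasses' self.logger.* calls to work."""

    def __init__(self, file_path: Path):
        self.file_path = file_path
        # A per-class logger replaces the old module-level
        # `log = logging.getLogger(__name__)` that this release removes.
        self.logger = logging.getLogger(self.__class__.__name__)
```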
parqv/{handlers → data_sources/formats}/parquet.py
CHANGED
```diff
@@ -1,4 +1,3 @@
-import logging
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Optional, Union
 
@@ -7,9 +6,7 @@ import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
 
-from
-
-log = logging.getLogger(__name__)
+from ..base import DataHandler, DataHandlerError
 
 
 class ParquetHandlerError(DataHandlerError):
```
```diff
@@ -49,17 +46,17 @@ class ParquetHandler(DataHandler):
             self.pq_file = pq.ParquetFile(self.file_path)
             self.schema = self.pq_file.schema_arrow
             self.metadata = self.pq_file.metadata
-
+            self.logger.info(f"Successfully initialized ParquetHandler for: {self.file_path.name}")
 
         except FileNotFoundError as fnf_e:
-
+            self.logger.error(f"File not found during ParquetHandler initialization: {fnf_e}")
             raise ParquetHandlerError(str(fnf_e)) from fnf_e
         except pa.lib.ArrowIOError as arrow_io_e:
-
+            self.logger.error(f"Arrow IO Error initializing ParquetHandler for {self.file_path.name}: {arrow_io_e}")
             raise ParquetHandlerError(
                 f"Failed to open Parquet file '{self.file_path.name}': {arrow_io_e}") from arrow_io_e
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error initializing ParquetHandler for {self.file_path.name}")
             self.close()
             raise ParquetHandlerError(f"Failed to initialize Parquet handler '{self.file_path.name}': {e}") from e
 
@@ -71,10 +68,10 @@ class ParquetHandler(DataHandler):
             # ParquetFile might not have a close method depending on source, check first
             if hasattr(self.pq_file, 'close'):
                 self.pq_file.close()
-
+                self.logger.info(f"Closed Parquet file: {self.file_path.name}")
         except Exception as e:
             # Log error during close but don't raise, as we're cleaning up
-
+            self.logger.warning(f"Exception while closing Parquet file {self.file_path.name}: {e}")
         finally:
             self.pq_file = None
             self.schema = None
@@ -102,7 +99,7 @@ class ParquetHandler(DataHandler):
             A dictionary containing key metadata attributes, or an error dictionary.
         """
         if not self.metadata or not self.schema:
-
+            self.logger.warning(f"Metadata or schema not available for summary: {self.file_path.name}")
             return {"error": "Metadata or schema not available"}
 
         try:
@@ -126,7 +123,7 @@ class ParquetHandler(DataHandler):
 
             return summary
         except Exception as e:
-
+            self.logger.exception(f"Error generating metadata summary for {self.file_path.name}")
             return {"error": f"Error getting metadata summary: {e}"}
 
     def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
@@ -138,7 +135,7 @@ class ParquetHandler(DataHandler):
             or None if the schema is unavailable.
         """
         if not self.schema:
-
+            self.logger.warning(f"Schema is not available for get_schema_data: {self.file_path.name}")
             return None
 
         schema_list = []
@@ -151,7 +148,7 @@ class ParquetHandler(DataHandler):
                     "nullable": field.nullable
                 })
             except Exception as e:
-
+                self.logger.error(f"Error processing field '{field.name}' for schema data: {e}", exc_info=True)
                 schema_list.append({
                     "name": field.name,
                     "type": f"[Error: {e}]",
@@ -172,11 +169,11 @@ class ParquetHandler(DataHandler):
         Returns a DataFrame with an 'error' column on failure.
         """
         if not self.pq_file:
-
+            self.logger.warning(f"ParquetFile handler not available for data preview: {self.file_path.name}")
             return pd.DataFrame({"error": ["Parquet handler not initialized or closed."]})
 
         if self.metadata and self.metadata.num_rows == 0:
-
+            self.logger.info(f"Parquet file is empty based on metadata: {self.file_path.name}")
             if self.schema:
                 return pd.DataFrame(columns=self.schema.names)
             else:
@@ -206,10 +203,10 @@ class ParquetHandler(DataHandler):
             if not batches:
                 # Check if file might have rows but reading yielded nothing
                 if self.metadata and self.metadata.num_rows > 0:
-
+                    self.logger.warning(
                         f"No batches read for preview, though metadata indicates {self.metadata.num_rows} rows: {self.file_path.name}")
                 else:
-
+                    self.logger.info(f"No data read for preview (file likely empty): {self.file_path.name}")
                 # Return empty DF with columns if schema available
                 if self.schema:
                     return pd.DataFrame(columns=self.schema.names)
@@ -223,11 +220,11 @@ class ParquetHandler(DataHandler):
                 self_destruct=True,
                 types_mapper=pd.ArrowDtype
             )
-
+            self.logger.info(f"Generated preview of {len(df)} rows for {self.file_path.name}")
             return df
 
         except Exception as e:
-
+            self.logger.exception(f"Error generating data preview from Parquet file: {self.file_path.name}")
             return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]})
 
     def get_column_stats(self, column_name: str) -> Dict[str, Any]:
@@ -242,13 +239,13 @@ class ParquetHandler(DataHandler):
             and potential error or message keys.
         """
         if not self.pq_file or not self.schema:
-
+            self.logger.warning(f"Parquet file/schema unavailable for column stats: {self.file_path.name}")
             return self._create_stats_result(column_name, None, error="File or schema not available")
 
         try:
             field = self.schema.field(column_name)
         except KeyError:
-
+            self.logger.warning(f"Column '{column_name}' not found in schema: {self.file_path.name}")
             return self._create_stats_result(column_name, None, error=f"Column '{column_name}' not found in schema")
 
         calculated_stats: Dict[str, Any] = {}
@@ -261,7 +258,7 @@ class ParquetHandler(DataHandler):
             # Data Reading
             table = self.pq_file.read(columns=[column_name])
             column_data = table.column(0)
-
+            self.logger.debug(
                 f"Finished reading column '{column_name}'. Rows: {len(column_data)}, Nulls: {column_data.null_count}")
 
             # Basic Counts
```
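The read-then-count flow above is plain PyArrow, and it is easy to reproduce outside the handler. A minimal sketch (the path and column name are illustrative):

```python
import pyarrow.compute as pc
import pyarrow.parquet as pq

pf = pq.ParquetFile("example.parquet")  # illustrative path
table = pf.read(columns=["price"])      # read a single column
column_data = table.column(0)           # a pyarrow ChunkedArray

total_count = len(column_data)
null_count = column_data.null_count
valid_count = total_count - null_count

# The same aggregation kernels the handler routes through _safe_compute:
minimum = pc.min(column_data).as_py()
maximum = pc.max(column_data).as_py()
mean = pc.mean(column_data).as_py()
```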
```diff
@@ -274,14 +271,14 @@ class ParquetHandler(DataHandler):
                 calculated_stats["Null Count"] = f"{null_count:,}"
                 calculated_stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%"
             else:
-
+                self.logger.info(f"Column '{column_name}' read resulted in 0 rows.")
                 message = "Column is empty (0 rows)."
                 valid_count = 0  # Ensure valid_count is 0 for later checks
 
             # Type-Specific Calculations
             if valid_count > 0:
                 col_type = field.type
-
+                self.logger.debug(f"Calculating stats for type: {self._format_pyarrow_type(col_type)}")
                 try:
                     if pa.types.is_floating(col_type) or pa.types.is_integer(col_type):
                         calculated_stats.update(self._calculate_numeric_stats(column_data))
@@ -300,11 +297,12 @@ class ParquetHandler(DataHandler):
                         calculated_stats.update(self._calculate_complex_type_stats(column_data, col_type))
                         message = f"Basic aggregate stats (min/max/mean) not applicable for complex type '{self._format_pyarrow_type(col_type)}'."
                     else:
-
+                        self.logger.warning(f"Statistics calculation not fully implemented for type: {col_type}")
                         message = f"Statistics calculation not implemented for type '{self._format_pyarrow_type(col_type)}'."
 
                 except Exception as calc_err:
-
+                    self.logger.exception(
+                        f"Error during type-specific calculation for column '{column_name}': {calc_err}")
                     error_msg = f"Calculation error for type {field.type}: {calc_err}"
                     calculated_stats["Calculation Error"] = str(calc_err)  # Add specific error key
 
@@ -315,10 +313,10 @@ class ParquetHandler(DataHandler):
             metadata_stats, metadata_stats_error = self._get_stats_from_metadata(column_name)
 
         except pa.lib.ArrowException as arrow_e:
-
+            self.logger.exception(f"Arrow error during stats processing for column '{column_name}': {arrow_e}")
             error_msg = f"Arrow processing error: {arrow_e}"
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error during stats calculation for column '{column_name}'")
             error_msg = f"Calculation failed unexpectedly: {e}"
 
         return self._create_stats_result(
@@ -331,7 +329,7 @@ class ParquetHandler(DataHandler):
             try:
                 return value.decode('utf-8', errors='replace')
             except Exception as e:
-
+                self.logger.warning(f"Could not decode metadata bytes: {e}. Value: {value!r}")
                 return f"[Decode Error: {value!r}]"
         return str(value) if value is not None else None
 
@@ -348,7 +346,7 @@ class ParquetHandler(DataHandler):
                 decoded_kv[key_str] = val_str
             return decoded_kv
         except Exception as e:
-
+            self.logger.warning(f"Could not decode key-value metadata: {e}")
             return {"error": f"Error decoding key-value metadata: {e}"}
 
     def _format_pyarrow_type(self, field_type: pa.DataType) -> str:
@@ -425,6 +423,32 @@ class ParquetHandler(DataHandler):
         variance_val, err_var = self._safe_compute(pc.variance, column_data, ddof=1)
         stats["Variance"] = f"{variance_val:.4f}" if variance_val is not None and err_var is None else (
             err_var or "N/A")
+        distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
+        stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")
+
+        # Add histogram data for visualization
+        try:
+            # Convert to Python list for histogram calculation (sample if too large)
+            data_length = len(column_data)
+            sample_size = min(10000, data_length)  # Limit to 10k samples for performance
+
+            if data_length > sample_size:
+                # Sample the data
+                import random
+                indices = sorted(random.sample(range(data_length), sample_size))
+                sampled_data = [column_data[i].as_py() for i in indices]
+            else:
+                sampled_data = column_data.to_pylist()
+
+            # Filter out None values
+            clean_data = [val for val in sampled_data if val is not None]
+
+            if len(clean_data) > 10:  # Only create histogram if we have enough data
+                stats["_histogram_data"] = clean_data
+                stats["_data_type"] = "numeric"
+
+        except Exception as e:
+            self.logger.warning(f"Failed to prepare histogram data: {e}")
 
         return stats
 
```
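As in the JSON handler, `_histogram_data` carries raw sampled values; turning them into bar heights is left to the view layer (presumably the new `views/utils/visualization.py`, which this diff lists but does not show). A sketch of the kind of fixed-width binning a renderer could apply (the 20-bin default is an assumption):

```python
from typing import List, Tuple


def bin_values(values: List[float], bins: int = 20) -> List[Tuple[float, int]]:
    """Fixed-width histogram: (bin_start, count) pairs over min..max."""
    lo, hi = min(values), max(values)
    if lo == hi:
        return [(lo, len(values))]  # all values identical: a single bar
    width = (hi - lo) / bins
    counts = [0] * bins
    for v in values:
        idx = min(int((v - lo) / width), bins - 1)  # clamp max into last bin
        counts[idx] += 1
    return [(lo + i * width, c) for i, c in enumerate(counts)]
```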
```diff
@@ -438,23 +462,10 @@ class ParquetHandler(DataHandler):
         return stats
 
     def _calculate_string_binary_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
-        """Calculates distinct count
+        """Calculates distinct count for string/binary columns."""
         stats: Dict[str, Any] = {}
         distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
         stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")
-
-        if pa.types.is_string(column_data.type) or pa.types.is_large_string(column_data.type):
-            lengths, err_len = self._safe_compute(pc.binary_length, column_data)
-            if err_len is None and lengths is not None:
-                min_len, err_min = self._safe_compute(pc.min, lengths)
-                stats["Min Length"] = min_len if err_min is None else err_min
-                max_len, err_max = self._safe_compute(pc.max, lengths)
-                stats["Max Length"] = max_len if err_max is None else err_max
-                avg_len, err_avg = self._safe_compute(pc.mean, lengths)
-                stats["Avg Length"] = f"{avg_len:.2f}" if avg_len is not None and err_avg is None else (
-                    err_avg or "N/A")
-            else:
-                stats.update({"Min Length": "Error", "Max Length": "Error", "Avg Length": "Error"})
         return stats
 
     def _calculate_boolean_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
```
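`_safe_compute` predates this diff and is not shown, but its call sites pin down the contract: it runs a `pyarrow.compute` kernel and returns a `(value, error_message)` pair instead of raising, so a failed kernel degrades to an "Error" cell rather than a crashed stats panel. A hypothetical reconstruction consistent with those call sites (the real implementation may differ):

```python
from typing import Any, Callable, Optional, Tuple

import pyarrow as pa


def safe_compute(func: Callable, data, **kwargs) -> Tuple[Optional[Any], Optional[str]]:
    # Sketch of the method's likely shape; in parqv it is self._safe_compute.
    try:
        result = func(data, **kwargs)
        # Aggregates (pc.min, pc.mean, ...) return Scalars; element-wise
        # kernels (pc.binary_length) return arrays that callers reuse as-is.
        return (result.as_py() if isinstance(result, pa.Scalar) else result), None
    except Exception as e:
        return None, f"Error: {e}"
```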
```diff
@@ -480,7 +491,7 @@ class ParquetHandler(DataHandler):
             if 'False' not in stats["Value Counts"]: stats["Value Counts"]['False'] = "0"
 
         except Exception as vc_e:
-
+            self.logger.warning(f"Boolean value count calculation error: {vc_e}", exc_info=True)
             stats["Value Counts"] = "Error calculating"
         return stats
 
@@ -490,7 +501,7 @@ class ParquetHandler(DataHandler):
         try:
             unwrapped_data = column_data.dictionary_decode()
             value_type = col_type.value_type
-
+            self.logger.debug(f"Calculating dictionary stats based on value type: {value_type}")
 
             # Delegate calculation based on the *value* type
             if pa.types.is_floating(value_type) or pa.types.is_integer(value_type):
@@ -511,10 +522,10 @@ class ParquetHandler(DataHandler):
                 err or "N/A")
 
         except pa.lib.ArrowException as arrow_decode_err:
-
+            self.logger.warning(f"Arrow error decoding dictionary type for stats: {arrow_decode_err}")
             stats["Dictionary Error"] = f"Decode Error: {arrow_decode_err}"
         except Exception as dict_e:
-
+            self.logger.warning(f"Could not process dictionary type for stats: {dict_e}")
             stats["Dictionary Error"] = f"Processing Error: {dict_e}"
         return stats
 
@@ -545,17 +556,17 @@ class ParquetHandler(DataHandler):
                 rg_meta = self.metadata.row_group(i)
                 metadata_stats[group_key] = self._extract_stats_for_single_group(rg_meta, col_index)
             except IndexError:
-
+                self.logger.warning(f"Column index {col_index} out of bounds for row group {i}.")
                 metadata_stats[group_key] = "Index Error"
             except Exception as e:
-
+                self.logger.warning(f"Error processing metadata stats for RG {i}, column '{column_name}': {e}")
                 metadata_stats[group_key] = f"Read Error: {e}"
 
         except KeyError:
-
+            self.logger.warning(f"Column '{column_name}' not found in schema for metadata stats.")
             error_str = f"Column '{column_name}' not found in schema"
         except Exception as e:
-
+            self.logger.exception(f"Failed to get metadata statistics structure for column '{column_name}'.")
             error_str = f"Error accessing metadata structure: {e}"
 
         return metadata_stats, error_str
@@ -587,10 +598,10 @@ class ParquetHandler(DataHandler):
                     col_chunk_meta.total_uncompressed_size is not None),
             }
         except IndexError:
-
+            self.logger.warning(f"Column index {col_index} out of bounds for row group {rg_meta.num_columns} columns.")
             return "Index Error"
         except Exception as e:
-
+            self.logger.error(f"Error reading column chunk metadata stats for index {col_index}: {e}", exc_info=True)
             return f"Metadata Read Error: {e}"
 
     def _create_stats_result(
@@ -613,7 +624,7 @@ class ParquetHandler(DataHandler):
             col_type_str = self._format_pyarrow_type(field.type)
             col_nullable = field.nullable
         except Exception as e:
-
+            self.logger.error(f"Error formatting type for column {column_name}: {e}")
             col_type_str = f"[Error formatting: {field.type}]"
             col_nullable = None
 
```
parqv/views/__init__.py
CHANGED
```diff
@@ -0,0 +1,38 @@
+"""
+Views package for parqv application.
+
+This package contains all UI views and their supporting components and utilities.
+"""
+
+# Main views
+from .metadata_view import MetadataView
+from .data_view import DataView
+from .schema_view import SchemaView
+
+# Base classes
+from .base import BaseView
+
+# Components (optional, for advanced usage)
+from .components import ErrorDisplay, LoadingDisplay, EnhancedDataTable
+
+# Utilities (optional, for advanced usage)
+from .utils import format_metadata_for_display, format_stats_for_display
+
+__all__ = [
+    # Main views - these are the primary exports
+    "MetadataView",
+    "DataView",
+    "SchemaView",
+
+    # Base class - for extending functionality
+    "BaseView",
+
+    # Components - for custom view development
+    "ErrorDisplay",
+    "LoadingDisplay",
+    "EnhancedDataTable",
+
+    # Utilities - for data formatting
+    "format_metadata_for_display",
+    "format_stats_for_display",
+]
```
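With these re-exports in place, consumers can import everything views-related from the package root; e.g.:

```python
# Hypothetical consumer code; parqv's own wiring may import differently.
from parqv.views import BaseView, DataView, MetadataView, SchemaView
from parqv.views import format_stats_for_display
```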