parqv-0.2.0-py3-none-any.whl → parqv-0.3.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. parqv/__init__.py +31 -0
  2. parqv/app.py +84 -102
  3. parqv/cli.py +112 -0
  4. parqv/core/__init__.py +31 -0
  5. parqv/core/config.py +26 -0
  6. parqv/core/file_utils.py +88 -0
  7. parqv/core/handler_factory.py +90 -0
  8. parqv/core/logging.py +46 -0
  9. parqv/data_sources/__init__.py +48 -0
  10. parqv/data_sources/base/__init__.py +28 -0
  11. parqv/data_sources/base/exceptions.py +38 -0
  12. parqv/{handlers/base_handler.py → data_sources/base/handler.py} +54 -25
  13. parqv/{handlers → data_sources/formats}/__init__.py +13 -5
  14. parqv/data_sources/formats/csv.py +460 -0
  15. parqv/{handlers → data_sources/formats}/json.py +68 -32
  16. parqv/{handlers → data_sources/formats}/parquet.py +67 -56
  17. parqv/views/__init__.py +38 -0
  18. parqv/views/base.py +98 -0
  19. parqv/views/components/__init__.py +13 -0
  20. parqv/views/components/enhanced_data_table.py +152 -0
  21. parqv/views/components/error_display.py +72 -0
  22. parqv/views/components/loading_display.py +44 -0
  23. parqv/views/data_view.py +119 -46
  24. parqv/views/metadata_view.py +57 -20
  25. parqv/views/schema_view.py +190 -200
  26. parqv/views/utils/__init__.py +19 -0
  27. parqv/views/utils/data_formatters.py +184 -0
  28. parqv/views/utils/stats_formatters.py +220 -0
  29. parqv/views/utils/visualization.py +204 -0
  30. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/METADATA +5 -6
  31. parqv-0.3.0.dist-info/RECORD +36 -0
  32. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/WHEEL +1 -1
  33. parqv-0.2.0.dist-info/RECORD +0 -17
  34. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/entry_points.txt +0 -0
  35. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/licenses/LICENSE +0 -0
  36. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/top_level.txt +0 -0
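A note before the hunks: every handler change below replaces the module-level `log = logging.getLogger(__name__)` with a `self.logger` attribute, and the handlers now call `super().__init__(file_path)`, so the logger is evidently provided by the shared base class that moved to parqv/data_sources/base/handler.py. That file's body is not shown in this diff; the following is a minimal sketch of what it presumably provides, with the attribute names inferred from how the handlers use them:

import logging
from pathlib import Path


class DataHandler:
    """Hypothetical reconstruction of the shared base: only the pieces the
    hunks below depend on (file_path and a per-instance logger)."""

    def __init__(self, file_path: Path):
        self.file_path = file_path
        # A logger named after the concrete class would explain the uniform
        # log.* -> self.logger.* change across both handlers.
        self.logger = logging.getLogger(self.__class__.__name__)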
parqv/{handlers → data_sources/formats}/json.py RENAMED
@@ -1,13 +1,10 @@
-import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 import duckdb
 import pandas as pd
 
-from .base_handler import DataHandler, DataHandlerError
-
-log = logging.getLogger(__name__)
+from ..base import DataHandler, DataHandlerError
 
 
 class JsonHandlerError(DataHandlerError):
@@ -38,6 +35,8 @@ class JsonHandler(DataHandler):
             JsonHandlerError: If the file doesn't exist, isn't a file, or if
                 initialization fails (e.g., DuckDB connection, view creation).
         """
+        super().__init__(file_path)
+
         self.file_path = self._validate_file_path(file_path)
         self._db_conn: Optional[duckdb.DuckDBPyConnection] = None
         self._view_name: str = self.DEFAULT_VIEW_NAME
@@ -48,9 +47,9 @@ class JsonHandler(DataHandler):
             self._connect_db()
             self._create_duckdb_view()
             self._load_metadata()
-            log.info(f"JsonHandler initialized successfully for: {self.file_path}")
+            self.logger.info(f"JsonHandler initialized successfully for: {self.file_path}")
         except Exception as e:
-            log.exception(f"Error during JsonHandler initialization for {self.file_path}")
+            self.logger.exception(f"Error during JsonHandler initialization for {self.file_path}")
             self.close()
             if isinstance(e, JsonHandlerError):
                 raise
@@ -67,9 +66,9 @@ class JsonHandler(DataHandler):
         """Establishes a connection to an in-memory DuckDB database."""
         try:
             self._db_conn = duckdb.connect(database=':memory:', read_only=False)
-            log.debug("DuckDB in-memory connection established.")
+            self.logger.debug("DuckDB in-memory connection established.")
         except Exception as e:
-            log.exception("Failed to initialize DuckDB connection.")
+            self.logger.exception("Failed to initialize DuckDB connection.")
             raise JsonHandlerError(f"DuckDB connection failed: {e}") from e
 
     def _create_duckdb_view(self):
@@ -83,9 +82,9 @@ class JsonHandler(DataHandler):
 
         try:
             self._db_conn.sql(load_query)
-            log.debug(f"DuckDB view '{self._view_name}' created for file '{file_path_str}'.")
+            self.logger.debug(f"DuckDB view '{self._view_name}' created for file '{file_path_str}'.")
         except duckdb.Error as db_err:
-            log.exception(f"DuckDB Error creating view '{self._view_name}' from '{file_path_str}': {db_err}")
+            self.logger.exception(f"DuckDB Error creating view '{self._view_name}' from '{file_path_str}': {db_err}")
             if "Could not open file" in str(db_err):
                 raise JsonHandlerError(
                     f"DuckDB could not open file: {file_path_str}. Check permissions or path. Error: {db_err}") from db_err
@@ -95,7 +94,7 @@ class JsonHandler(DataHandler):
             else:
                 raise JsonHandlerError(f"DuckDB failed create view for JSON file: {db_err}") from db_err
         except Exception as e:
-            log.exception(f"Unexpected error creating DuckDB view '{self._view_name}'.")
+            self.logger.exception(f"Unexpected error creating DuckDB view '{self._view_name}'.")
             raise JsonHandlerError(f"Failed to create DuckDB view: {e}") from e
 
     def _load_metadata(self):
@@ -108,27 +107,27 @@ class JsonHandler(DataHandler):
             describe_query = f"DESCRIBE \"{self._view_name}\";"
             schema_result = self._db_conn.sql(describe_query).fetchall()
             self._schema = self._parse_schema(schema_result)
-            log.debug(f"Schema fetched for view '{self._view_name}': {len(self._schema)} columns.")
+            self.logger.debug(f"Schema fetched for view '{self._view_name}': {len(self._schema)} columns.")
 
             # Fetch Row Count
             count_query = f"SELECT COUNT(*) FROM \"{self._view_name}\";"
             count_result = self._db_conn.sql(count_query).fetchone()
             self._row_count = count_result[0] if count_result else 0
-            log.debug(f"Row count fetched for view '{self._view_name}': {self._row_count}")
+            self.logger.debug(f"Row count fetched for view '{self._view_name}': {self._row_count}")
 
         except duckdb.Error as db_err:
-            log.exception(f"DuckDB Error fetching metadata for view '{self._view_name}': {db_err}")
+            self.logger.exception(f"DuckDB Error fetching metadata for view '{self._view_name}': {db_err}")
             self._schema = None
             self._row_count = None
         except Exception as e:
-            log.exception(f"Unexpected error fetching metadata for view '{self._view_name}'")
+            self.logger.exception(f"Unexpected error fetching metadata for view '{self._view_name}'")
             self._schema = None
             self._row_count = None
 
     def _parse_schema(self, describe_output: List[Tuple]) -> List[Dict[str, Any]]:
         """Parses the output of DuckDB's DESCRIBE query."""
         if not describe_output:
-            log.warning(f"DESCRIBE query for view '{self._view_name}' returned no schema info.")
+            self.logger.warning(f"DESCRIBE query for view '{self._view_name}' returned no schema info.")
             return []
 
         parsed_schema = []
@@ -141,7 +140,7 @@ class JsonHandler(DataHandler):
                 is_nullable = null_str.upper() == 'YES'
                 parsed_schema.append({"name": name, "type": type_str, "nullable": is_nullable})
             else:
-                log.warning(f"Unexpected format in DESCRIBE output row: {row}")
+                self.logger.warning(f"Unexpected format in DESCRIBE output row: {row}")
         return parsed_schema
 
     def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
@@ -153,7 +152,7 @@ class JsonHandler(DataHandler):
             or None if schema couldn't be fetched.
         """
         if self._schema is None:
-            log.warning("Schema is unavailable. It might not have been fetched successfully.")
+            self.logger.warning("Schema is unavailable. It might not have been fetched successfully.")
         return self._schema
 
     def get_metadata_summary(self) -> Dict[str, Any]:
@@ -184,7 +183,7 @@ class JsonHandler(DataHandler):
         try:
             summary["Size"] = f"{self.file_path.stat().st_size:,} bytes"
         except Exception as e:
-            log.warning(f"Could not get file size for {self.file_path}: {e}")
+            self.logger.warning(f"Could not get file size for {self.file_path}: {e}")
             summary["Size"] = "N/A"
 
         return summary
@@ -202,13 +201,13 @@ class JsonHandler(DataHandler):
             error message if fetching fails.
         """
         if not self._db_conn:
-            log.warning("Data preview unavailable: DuckDB connection is closed or uninitialized.")
+            self.logger.warning("Data preview unavailable: DuckDB connection is closed or uninitialized.")
             return pd.DataFrame({"error": ["DuckDB connection not available."]})
         if self._schema is None:
-            log.warning("Data preview unavailable: Schema couldn't be determined.")
+            self.logger.warning("Data preview unavailable: Schema couldn't be determined.")
             return pd.DataFrame({"error": ["Schema not available, cannot fetch preview."]})
         if self._row_count == 0:
-            log.info("Data preview: Source JSON view is empty.")
+            self.logger.info("Data preview: Source JSON view is empty.")
             # Return empty DataFrame with correct columns if possible
             if self._schema:
                 return pd.DataFrame(columns=[col['name'] for col in self._schema])
@@ -221,10 +220,10 @@ class JsonHandler(DataHandler):
             df = self._db_conn.sql(preview_query).df()
             return df
         except duckdb.Error as db_err:
-            log.exception(f"DuckDB error getting data preview from '{self._view_name}': {db_err}")
+            self.logger.exception(f"DuckDB error getting data preview from '{self._view_name}': {db_err}")
             return pd.DataFrame({"error": [f"DuckDB error fetching preview: {db_err}"]})
         except Exception as e:
-            log.exception(f"Unexpected error getting data preview from '{self._view_name}'")
+            self.logger.exception(f"Unexpected error getting data preview from '{self._view_name}'")
             return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]})
 
     def _get_column_info(self, column_name: str) -> Optional[Dict[str, Any]]:
@@ -274,7 +273,7 @@ class JsonHandler(DataHandler):
 
         if is_complex:
             # Use basic counts for complex types as SUMMARIZE is less informative
-            log.debug(f"Calculating basic counts for complex type column: {column_name}")
+            self.logger.debug(f"Calculating basic counts for complex type column: {column_name}")
             stats = self._get_basic_column_counts(safe_column_name)
             message = f"Only basic counts calculated for complex type '{col_type}'."
             # Attempt distinct count for complex types (can be slow/error-prone)
@@ -286,13 +285,13 @@ class JsonHandler(DataHandler):
                 else:
                     stats["Distinct Count"] = "N/A"  # Or 0 if appropriate
             except duckdb.Error as distinct_err:
-                log.warning(
+                self.logger.warning(
                     f"Could not calculate distinct count for complex column '{column_name}': {distinct_err}")
                 stats["Distinct Count"] = "Error"
 
         else:
             # Use SUMMARIZE for non-complex types
-            log.debug(f"Using SUMMARIZE for simple type column: {column_name}")
+            self.logger.debug(f"Using SUMMARIZE for simple type column: {column_name}")
             summarize_query = f"SUMMARIZE SELECT {safe_column_name} FROM \"{self._view_name}\";"
             summarize_df = self._db_conn.sql(summarize_query).df()
 
@@ -304,11 +303,17 @@ class JsonHandler(DataHandler):
             # SUMMARIZE puts results in the first row
             stats = self._format_summarize_stats(summarize_df.iloc[0])
 
+            # Add histogram data for numeric columns
+            try:
+                self._add_histogram_data_if_numeric(stats, safe_column_name)
+            except Exception as hist_e:
+                self.logger.warning(f"Failed to add histogram data for {column_name}: {hist_e}")
+
         except duckdb.Error as db_err:
-            log.exception(f"DuckDB Error calculating statistics for column '{column_name}': {db_err}")
+            self.logger.exception(f"DuckDB Error calculating statistics for column '{column_name}': {db_err}")
            error_msg = f"DuckDB calculation failed: {db_err}"
         except Exception as e:
-            log.exception(f"Unexpected error calculating statistics for column '{column_name}'")
+            self.logger.exception(f"Unexpected error calculating statistics for column '{column_name}'")
             error_msg = f"Calculation failed unexpectedly: {e}"
 
         return self._create_stats_result(
@@ -351,7 +356,7 @@ class JsonHandler(DataHandler):
                 stats["Null Percentage"] = "Error"
 
         except duckdb.Error as db_err:
-            log.warning(f"Failed to get basic counts for {safe_column_name}: {db_err}")
+            self.logger.warning(f"Failed to get basic counts for {safe_column_name}: {db_err}")
             stats["Counts Error"] = str(db_err)
         return stats
 
@@ -404,6 +409,37 @@ class JsonHandler(DataHandler):
 
         return stats
 
+    def _add_histogram_data_if_numeric(self, stats: Dict[str, Any], safe_column_name: str) -> None:
+        """Add histogram data for numeric columns by sampling from DuckDB."""
+        # Check if this looks like numeric data (has Mean, Min, Max)
+        if not all(key in stats for key in ["Mean", "Min", "Max"]):
+            return
+
+        try:
+            # Sample data for histogram (limit to 10k samples for performance)
+            sample_query = f"""
+                SELECT {safe_column_name}
+                FROM "{self._view_name}"
+                WHERE {safe_column_name} IS NOT NULL
+                USING SAMPLE 10000
+            """
+
+            sample_df = self._db_conn.sql(sample_query).df()
+
+            if not sample_df.empty and len(sample_df) > 10:
+                # Extract the column data
+                column_data = sample_df.iloc[:, 0].tolist()
+
+                # Filter out any remaining nulls
+                clean_data = [val for val in column_data if val is not None]
+
+                if len(clean_data) > 10:
+                    stats["_histogram_data"] = clean_data
+                    stats["_data_type"] = "numeric"
+
+        except Exception as e:
+            self.logger.warning(f"Failed to sample data for histogram: {e}")
+
     def _create_stats_result(
         self,
         column_name: str,
@@ -430,11 +466,11 @@ class JsonHandler(DataHandler):
         if self._db_conn:
             try:
                 self._db_conn.close()
-                log.info(f"DuckDB connection closed for {self.file_path}.")
+                self.logger.info(f"DuckDB connection closed for {self.file_path}.")
                 self._db_conn = None
             except Exception as e:
                 # Log error but don't raise during close typically
-                log.error(f"Error closing DuckDB connection for {self.file_path}: {e}")
+                self.logger.error(f"Error closing DuckDB connection for {self.file_path}: {e}")
                 self._db_conn = None  # Assume closed even if error occurred
 
     def __enter__(self):
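The new `_add_histogram_data_if_numeric` above leans on DuckDB's `USING SAMPLE` clause so that at most 10,000 non-null values ever cross into Python. A standalone sketch of that pattern follows; the file name, column name, and `read_json_auto` load query are illustrative, since the handler's actual `load_query` is outside the changed lines and not shown in this diff:

import duckdb

con = duckdb.connect(database=":memory:")
# read_json_auto infers the schema; 'data.json' and 'price' are placeholders.
con.sql("CREATE VIEW json_data AS SELECT * FROM read_json_auto('data.json');")

# USING SAMPLE bounds the result set inside DuckDB, so the full file is
# never materialized in pandas just to draw a histogram.
sample_df = con.sql("""
    SELECT price
    FROM json_data
    WHERE price IS NOT NULL
    USING SAMPLE 10000
""").df()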
parqv/{handlers → data_sources/formats}/parquet.py RENAMED
@@ -1,4 +1,3 @@
-import logging
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Optional, Union
 
@@ -7,9 +6,7 @@ import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
 
-from .base_handler import DataHandler, DataHandlerError
-
-log = logging.getLogger(__name__)
+from ..base import DataHandler, DataHandlerError
 
 
 class ParquetHandlerError(DataHandlerError):
@@ -49,17 +46,17 @@ class ParquetHandler(DataHandler):
             self.pq_file = pq.ParquetFile(self.file_path)
             self.schema = self.pq_file.schema_arrow
             self.metadata = self.pq_file.metadata
-            log.info(f"Successfully initialized ParquetHandler for: {self.file_path.name}")
+            self.logger.info(f"Successfully initialized ParquetHandler for: {self.file_path.name}")
 
         except FileNotFoundError as fnf_e:
-            log.error(f"File not found during ParquetHandler initialization: {fnf_e}")
+            self.logger.error(f"File not found during ParquetHandler initialization: {fnf_e}")
             raise ParquetHandlerError(str(fnf_e)) from fnf_e
         except pa.lib.ArrowIOError as arrow_io_e:
-            log.error(f"Arrow IO Error initializing ParquetHandler for {self.file_path.name}: {arrow_io_e}")
+            self.logger.error(f"Arrow IO Error initializing ParquetHandler for {self.file_path.name}: {arrow_io_e}")
             raise ParquetHandlerError(
                 f"Failed to open Parquet file '{self.file_path.name}': {arrow_io_e}") from arrow_io_e
         except Exception as e:
-            log.exception(f"Unexpected error initializing ParquetHandler for {self.file_path.name}")
+            self.logger.exception(f"Unexpected error initializing ParquetHandler for {self.file_path.name}")
             self.close()
             raise ParquetHandlerError(f"Failed to initialize Parquet handler '{self.file_path.name}': {e}") from e
 
@@ -71,10 +68,10 @@ class ParquetHandler(DataHandler):
             # ParquetFile might not have a close method depending on source, check first
             if hasattr(self.pq_file, 'close'):
                 self.pq_file.close()
-                log.info(f"Closed Parquet file: {self.file_path.name}")
+                self.logger.info(f"Closed Parquet file: {self.file_path.name}")
         except Exception as e:
             # Log error during close but don't raise, as we're cleaning up
-            log.warning(f"Exception while closing Parquet file {self.file_path.name}: {e}")
+            self.logger.warning(f"Exception while closing Parquet file {self.file_path.name}: {e}")
         finally:
             self.pq_file = None
             self.schema = None
@@ -102,7 +99,7 @@ class ParquetHandler(DataHandler):
             A dictionary containing key metadata attributes, or an error dictionary.
         """
         if not self.metadata or not self.schema:
-            log.warning(f"Metadata or schema not available for summary: {self.file_path.name}")
+            self.logger.warning(f"Metadata or schema not available for summary: {self.file_path.name}")
             return {"error": "Metadata or schema not available"}
 
         try:
@@ -126,7 +123,7 @@ class ParquetHandler(DataHandler):
 
             return summary
         except Exception as e:
-            log.exception(f"Error generating metadata summary for {self.file_path.name}")
+            self.logger.exception(f"Error generating metadata summary for {self.file_path.name}")
             return {"error": f"Error getting metadata summary: {e}"}
 
     def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
@@ -138,7 +135,7 @@ class ParquetHandler(DataHandler):
             or None if the schema is unavailable.
         """
         if not self.schema:
-            log.warning(f"Schema is not available for get_schema_data: {self.file_path.name}")
+            self.logger.warning(f"Schema is not available for get_schema_data: {self.file_path.name}")
             return None
 
         schema_list = []
@@ -151,7 +148,7 @@ class ParquetHandler(DataHandler):
                     "nullable": field.nullable
                 })
             except Exception as e:
-                log.error(f"Error processing field '{field.name}' for schema data: {e}", exc_info=True)
+                self.logger.error(f"Error processing field '{field.name}' for schema data: {e}", exc_info=True)
                 schema_list.append({
                     "name": field.name,
                     "type": f"[Error: {e}]",
@@ -172,11 +169,11 @@ class ParquetHandler(DataHandler):
         Returns a DataFrame with an 'error' column on failure.
         """
         if not self.pq_file:
-            log.warning(f"ParquetFile handler not available for data preview: {self.file_path.name}")
+            self.logger.warning(f"ParquetFile handler not available for data preview: {self.file_path.name}")
             return pd.DataFrame({"error": ["Parquet handler not initialized or closed."]})
 
         if self.metadata and self.metadata.num_rows == 0:
-            log.info(f"Parquet file is empty based on metadata: {self.file_path.name}")
+            self.logger.info(f"Parquet file is empty based on metadata: {self.file_path.name}")
             if self.schema:
                 return pd.DataFrame(columns=self.schema.names)
             else:
@@ -206,10 +203,10 @@ class ParquetHandler(DataHandler):
             if not batches:
                 # Check if file might have rows but reading yielded nothing
                 if self.metadata and self.metadata.num_rows > 0:
-                    log.warning(
+                    self.logger.warning(
                         f"No batches read for preview, though metadata indicates {self.metadata.num_rows} rows: {self.file_path.name}")
                 else:
-                    log.info(f"No data read for preview (file likely empty): {self.file_path.name}")
+                    self.logger.info(f"No data read for preview (file likely empty): {self.file_path.name}")
                 # Return empty DF with columns if schema available
                 if self.schema:
                     return pd.DataFrame(columns=self.schema.names)
@@ -223,11 +220,11 @@ class ParquetHandler(DataHandler):
                 self_destruct=True,
                 types_mapper=pd.ArrowDtype
             )
-            log.info(f"Generated preview of {len(df)} rows for {self.file_path.name}")
+            self.logger.info(f"Generated preview of {len(df)} rows for {self.file_path.name}")
             return df
 
         except Exception as e:
-            log.exception(f"Error generating data preview from Parquet file: {self.file_path.name}")
+            self.logger.exception(f"Error generating data preview from Parquet file: {self.file_path.name}")
             return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]})
 
     def get_column_stats(self, column_name: str) -> Dict[str, Any]:
@@ -242,13 +239,13 @@ class ParquetHandler(DataHandler):
             and potential error or message keys.
         """
         if not self.pq_file or not self.schema:
-            log.warning(f"Parquet file/schema unavailable for column stats: {self.file_path.name}")
+            self.logger.warning(f"Parquet file/schema unavailable for column stats: {self.file_path.name}")
             return self._create_stats_result(column_name, None, error="File or schema not available")
 
         try:
             field = self.schema.field(column_name)
         except KeyError:
-            log.warning(f"Column '{column_name}' not found in schema: {self.file_path.name}")
+            self.logger.warning(f"Column '{column_name}' not found in schema: {self.file_path.name}")
             return self._create_stats_result(column_name, None, error=f"Column '{column_name}' not found in schema")
 
         calculated_stats: Dict[str, Any] = {}
@@ -261,7 +258,7 @@ class ParquetHandler(DataHandler):
             # Data Reading
             table = self.pq_file.read(columns=[column_name])
             column_data = table.column(0)
-            log.debug(
+            self.logger.debug(
                 f"Finished reading column '{column_name}'. Rows: {len(column_data)}, Nulls: {column_data.null_count}")
 
             # Basic Counts
@@ -274,14 +271,14 @@ class ParquetHandler(DataHandler):
                 calculated_stats["Null Count"] = f"{null_count:,}"
                 calculated_stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%"
             else:
-                log.info(f"Column '{column_name}' read resulted in 0 rows.")
+                self.logger.info(f"Column '{column_name}' read resulted in 0 rows.")
                 message = "Column is empty (0 rows)."
                 valid_count = 0  # Ensure valid_count is 0 for later checks
 
             # Type-Specific Calculations
             if valid_count > 0:
                 col_type = field.type
-                log.debug(f"Calculating stats for type: {self._format_pyarrow_type(col_type)}")
+                self.logger.debug(f"Calculating stats for type: {self._format_pyarrow_type(col_type)}")
                 try:
                     if pa.types.is_floating(col_type) or pa.types.is_integer(col_type):
                         calculated_stats.update(self._calculate_numeric_stats(column_data))
@@ -300,11 +297,12 @@ class ParquetHandler(DataHandler):
                         calculated_stats.update(self._calculate_complex_type_stats(column_data, col_type))
                         message = f"Basic aggregate stats (min/max/mean) not applicable for complex type '{self._format_pyarrow_type(col_type)}'."
                     else:
-                        log.warning(f"Statistics calculation not fully implemented for type: {col_type}")
+                        self.logger.warning(f"Statistics calculation not fully implemented for type: {col_type}")
                         message = f"Statistics calculation not implemented for type '{self._format_pyarrow_type(col_type)}'."
 
                 except Exception as calc_err:
-                    log.exception(f"Error during type-specific calculation for column '{column_name}': {calc_err}")
+                    self.logger.exception(
+                        f"Error during type-specific calculation for column '{column_name}': {calc_err}")
                     error_msg = f"Calculation error for type {field.type}: {calc_err}"
                     calculated_stats["Calculation Error"] = str(calc_err)  # Add specific error key
 
@@ -315,10 +313,10 @@ class ParquetHandler(DataHandler):
             metadata_stats, metadata_stats_error = self._get_stats_from_metadata(column_name)
 
         except pa.lib.ArrowException as arrow_e:
-            log.exception(f"Arrow error during stats processing for column '{column_name}': {arrow_e}")
+            self.logger.exception(f"Arrow error during stats processing for column '{column_name}': {arrow_e}")
             error_msg = f"Arrow processing error: {arrow_e}"
         except Exception as e:
-            log.exception(f"Unexpected error during stats calculation for column '{column_name}'")
+            self.logger.exception(f"Unexpected error during stats calculation for column '{column_name}'")
             error_msg = f"Calculation failed unexpectedly: {e}"
 
         return self._create_stats_result(
@@ -331,7 +329,7 @@ class ParquetHandler(DataHandler):
             try:
                 return value.decode('utf-8', errors='replace')
             except Exception as e:
-                log.warning(f"Could not decode metadata bytes: {e}. Value: {value!r}")
+                self.logger.warning(f"Could not decode metadata bytes: {e}. Value: {value!r}")
                 return f"[Decode Error: {value!r}]"
         return str(value) if value is not None else None
 
@@ -348,7 +346,7 @@ class ParquetHandler(DataHandler):
                     decoded_kv[key_str] = val_str
             return decoded_kv
         except Exception as e:
-            log.warning(f"Could not decode key-value metadata: {e}")
+            self.logger.warning(f"Could not decode key-value metadata: {e}")
             return {"error": f"Error decoding key-value metadata: {e}"}
 
     def _format_pyarrow_type(self, field_type: pa.DataType) -> str:
@@ -425,6 +423,32 @@ class ParquetHandler(DataHandler):
         variance_val, err_var = self._safe_compute(pc.variance, column_data, ddof=1)
         stats["Variance"] = f"{variance_val:.4f}" if variance_val is not None and err_var is None else (
             err_var or "N/A")
+        distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
+        stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")
+
+        # Add histogram data for visualization
+        try:
+            # Convert to Python list for histogram calculation (sample if too large)
+            data_length = len(column_data)
+            sample_size = min(10000, data_length)  # Limit to 10k samples for performance
+
+            if data_length > sample_size:
+                # Sample the data
+                import random
+                indices = sorted(random.sample(range(data_length), sample_size))
+                sampled_data = [column_data[i].as_py() for i in indices]
+            else:
+                sampled_data = column_data.to_pylist()
+
+            # Filter out None values
+            clean_data = [val for val in sampled_data if val is not None]
+
+            if len(clean_data) > 10:  # Only create histogram if we have enough data
+                stats["_histogram_data"] = clean_data
+                stats["_data_type"] = "numeric"
+
+        except Exception as e:
+            self.logger.warning(f"Failed to prepare histogram data: {e}")
 
         return stats
 
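Both handlers stash the raw sample under the private `_histogram_data` key rather than pre-binned counts, which suggests the binning happens downstream in the new parqv/views/utils/visualization.py (added in this release but not shown in this diff). A rough, purely illustrative sketch of what that bucketing step could look like:

from typing import List, Tuple


def bin_values(values: List[float], bins: int = 20) -> List[Tuple[float, int]]:
    """Bucket a flat list of numeric samples into (bin_start, count) pairs.
    Hypothetical helper; the real logic lives in visualization.py."""
    lo, hi = min(values), max(values)
    width = (hi - lo) / bins or 1.0  # guard against a constant column
    counts = [0] * bins
    for v in values:
        idx = min(int((v - lo) / width), bins - 1)  # clamp the max value into the last bin
        counts[idx] += 1
    return [(lo + i * width, c) for i, c in enumerate(counts)]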
@@ -438,23 +462,10 @@ class ParquetHandler(DataHandler):
         return stats
 
     def _calculate_string_binary_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
-        """Calculates distinct count and optionally length stats for string/binary."""
+        """Calculates distinct count for string/binary columns."""
         stats: Dict[str, Any] = {}
         distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
         stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")
-
-        if pa.types.is_string(column_data.type) or pa.types.is_large_string(column_data.type):
-            lengths, err_len = self._safe_compute(pc.binary_length, column_data)
-            if err_len is None and lengths is not None:
-                min_len, err_min = self._safe_compute(pc.min, lengths)
-                stats["Min Length"] = min_len if err_min is None else err_min
-                max_len, err_max = self._safe_compute(pc.max, lengths)
-                stats["Max Length"] = max_len if err_max is None else err_max
-                avg_len, err_avg = self._safe_compute(pc.mean, lengths)
-                stats["Avg Length"] = f"{avg_len:.2f}" if avg_len is not None and err_avg is None else (
-                    err_avg or "N/A")
-            else:
-                stats.update({"Min Length": "Error", "Max Length": "Error", "Avg Length": "Error"})
         return stats
 
     def _calculate_boolean_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
@@ -480,7 +491,7 @@ class ParquetHandler(DataHandler):
             if 'False' not in stats["Value Counts"]: stats["Value Counts"]['False'] = "0"
 
         except Exception as vc_e:
-            log.warning(f"Boolean value count calculation error: {vc_e}", exc_info=True)
+            self.logger.warning(f"Boolean value count calculation error: {vc_e}", exc_info=True)
             stats["Value Counts"] = "Error calculating"
         return stats
 
@@ -490,7 +501,7 @@ class ParquetHandler(DataHandler):
         try:
             unwrapped_data = column_data.dictionary_decode()
             value_type = col_type.value_type
-            log.debug(f"Calculating dictionary stats based on value type: {value_type}")
+            self.logger.debug(f"Calculating dictionary stats based on value type: {value_type}")
 
             # Delegate calculation based on the *value* type
             if pa.types.is_floating(value_type) or pa.types.is_integer(value_type):
@@ -511,10 +522,10 @@ class ParquetHandler(DataHandler):
                 err or "N/A")
 
         except pa.lib.ArrowException as arrow_decode_err:
-            log.warning(f"Arrow error decoding dictionary type for stats: {arrow_decode_err}")
+            self.logger.warning(f"Arrow error decoding dictionary type for stats: {arrow_decode_err}")
             stats["Dictionary Error"] = f"Decode Error: {arrow_decode_err}"
         except Exception as dict_e:
-            log.warning(f"Could not process dictionary type for stats: {dict_e}")
+            self.logger.warning(f"Could not process dictionary type for stats: {dict_e}")
             stats["Dictionary Error"] = f"Processing Error: {dict_e}"
         return stats
 
@@ -545,17 +556,17 @@ class ParquetHandler(DataHandler):
                     rg_meta = self.metadata.row_group(i)
                     metadata_stats[group_key] = self._extract_stats_for_single_group(rg_meta, col_index)
                 except IndexError:
-                    log.warning(f"Column index {col_index} out of bounds for row group {i}.")
+                    self.logger.warning(f"Column index {col_index} out of bounds for row group {i}.")
                     metadata_stats[group_key] = "Index Error"
                 except Exception as e:
-                    log.warning(f"Error processing metadata stats for RG {i}, column '{column_name}': {e}")
+                    self.logger.warning(f"Error processing metadata stats for RG {i}, column '{column_name}': {e}")
                     metadata_stats[group_key] = f"Read Error: {e}"
 
         except KeyError:
-            log.warning(f"Column '{column_name}' not found in schema for metadata stats.")
+            self.logger.warning(f"Column '{column_name}' not found in schema for metadata stats.")
             error_str = f"Column '{column_name}' not found in schema"
         except Exception as e:
-            log.exception(f"Failed to get metadata statistics structure for column '{column_name}'.")
+            self.logger.exception(f"Failed to get metadata statistics structure for column '{column_name}'.")
             error_str = f"Error accessing metadata structure: {e}"
 
         return metadata_stats, error_str
@@ -587,10 +598,10 @@ class ParquetHandler(DataHandler):
                                            col_chunk_meta.total_uncompressed_size is not None),
             }
         except IndexError:
-            log.warning(f"Column index {col_index} out of bounds for row group {rg_meta.num_columns} columns.")
+            self.logger.warning(f"Column index {col_index} out of bounds for row group {rg_meta.num_columns} columns.")
             return "Index Error"
         except Exception as e:
-            log.error(f"Error reading column chunk metadata stats for index {col_index}: {e}", exc_info=True)
+            self.logger.error(f"Error reading column chunk metadata stats for index {col_index}: {e}", exc_info=True)
             return f"Metadata Read Error: {e}"
 
     def _create_stats_result(
@@ -613,7 +624,7 @@ class ParquetHandler(DataHandler):
             col_type_str = self._format_pyarrow_type(field.type)
             col_nullable = field.nullable
         except Exception as e:
-            log.error(f"Error formatting type for column {column_name}: {e}")
+            self.logger.error(f"Error formatting type for column {column_name}: {e}")
             col_type_str = f"[Error formatting: {field.type}]"
             col_nullable = None
 
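A recurring pattern in these hunks is `value, err = self._safe_compute(pc.<kernel>, column_data)`. The `_safe_compute` helper itself sits outside the changed lines, so its definition never appears in this diff; judging from how its two-tuple result is unpacked, its shape is plausibly:

from typing import Any, Callable, Optional, Tuple

import pyarrow as pa


def _safe_compute(func: Callable, data: pa.ChunkedArray, **kwargs) -> Tuple[Optional[Any], Optional[str]]:
    """Inferred sketch: run a pyarrow.compute kernel, return (value, error_string)."""
    try:
        # Kernels like pc.count_distinct or pc.variance return a pa.Scalar here.
        result = func(data, **kwargs)
        return result.as_py(), None
    except Exception as e:
        return None, f"Error: {e}"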
parqv/views/__init__.py CHANGED
@@ -0,0 +1,38 @@
+"""
+Views package for parqv application.
+
+This package contains all UI views and their supporting components and utilities.
+"""
+
+# Main views
+from .metadata_view import MetadataView
+from .data_view import DataView
+from .schema_view import SchemaView
+
+# Base classes
+from .base import BaseView
+
+# Components (optional, for advanced usage)
+from .components import ErrorDisplay, LoadingDisplay, EnhancedDataTable
+
+# Utilities (optional, for advanced usage)
+from .utils import format_metadata_for_display, format_stats_for_display
+
+__all__ = [
+    # Main views - these are the primary exports
+    "MetadataView",
+    "DataView",
+    "SchemaView",
+
+    # Base class - for extending functionality
+    "BaseView",
+
+    # Components - for custom view development
+    "ErrorDisplay",
+    "LoadingDisplay",
+    "EnhancedDataTable",
+
+    # Utilities - for data formatting
+    "format_metadata_for_display",
+    "format_stats_for_display",
+]
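With these re-exports in place, consumers can import the views from the package root rather than reaching into submodules; for example (assuming parqv 0.3.0 is installed):

from parqv.views import DataView, MetadataView, SchemaView

# Components and formatters are re-exported for custom view development:
from parqv.views import EnhancedDataTable, format_stats_for_display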