parqv 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. parqv/__init__.py +31 -0
  2. parqv/app.py +84 -102
  3. parqv/cli.py +112 -0
  4. parqv/core/__init__.py +31 -0
  5. parqv/core/config.py +26 -0
  6. parqv/core/file_utils.py +88 -0
  7. parqv/core/handler_factory.py +90 -0
  8. parqv/core/logging.py +46 -0
  9. parqv/data_sources/__init__.py +48 -0
  10. parqv/data_sources/base/__init__.py +28 -0
  11. parqv/data_sources/base/exceptions.py +38 -0
  12. parqv/{handlers/base_handler.py → data_sources/base/handler.py} +54 -25
  13. parqv/{handlers → data_sources/formats}/__init__.py +13 -5
  14. parqv/data_sources/formats/csv.py +460 -0
  15. parqv/{handlers → data_sources/formats}/json.py +68 -32
  16. parqv/{handlers → data_sources/formats}/parquet.py +67 -56
  17. parqv/views/__init__.py +38 -0
  18. parqv/views/base.py +98 -0
  19. parqv/views/components/__init__.py +13 -0
  20. parqv/views/components/enhanced_data_table.py +152 -0
  21. parqv/views/components/error_display.py +72 -0
  22. parqv/views/components/loading_display.py +44 -0
  23. parqv/views/data_view.py +119 -46
  24. parqv/views/metadata_view.py +57 -20
  25. parqv/views/schema_view.py +190 -200
  26. parqv/views/utils/__init__.py +19 -0
  27. parqv/views/utils/data_formatters.py +184 -0
  28. parqv/views/utils/stats_formatters.py +220 -0
  29. parqv/views/utils/visualization.py +204 -0
  30. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/METADATA +5 -6
  31. parqv-0.3.0.dist-info/RECORD +36 -0
  32. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/WHEEL +1 -1
  33. parqv-0.2.0.dist-info/RECORD +0 -17
  34. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/entry_points.txt +0 -0
  35. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/licenses/LICENSE +0 -0
  36. {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/top_level.txt +0 -0
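The largest single addition is the new CSV data-source handler, parqv/data_sources/formats/csv.py (entry 14 above). Its full diff follows, with a brief usage sketch after the listing.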
@@ -0,0 +1,460 @@
+ """
+ CSV file handler for parqv data sources.
+ """
+
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import pandas as pd
+
+ from ..base import DataHandler, DataHandlerError
+
+
+ class CsvHandlerError(DataHandlerError):
+     """Custom exception for CSV handling errors."""
+     pass
+
+
+ class CsvHandler(DataHandler):
+     """
+     Handles CSV file interactions using pandas.
+
+     Provides methods to access metadata, schema, data preview, and column statistics
+     for CSV files using pandas DataFrame operations.
+     """
+
+     def __init__(self, file_path: Path):
+         """
+         Initialize the CsvHandler by validating the path and reading the CSV file.
+
+         Args:
+             file_path: Path to the CSV file.
+
+         Raises:
+             CsvHandlerError: If the file is not found, not a file, or cannot be read.
+         """
+         super().__init__(file_path)
+         self.df: Optional[pd.DataFrame] = None
+         self._original_dtypes: Optional[Dict[str, str]] = None
+
+         try:
+             # Validate file existence
+             if not self.file_path.is_file():
+                 raise FileNotFoundError(f"CSV file not found or is not a regular file: {self.file_path}")
+
+             # Read the CSV file with pandas
+             self._read_csv_file()
+
+             self.logger.info(f"Successfully initialized CsvHandler for: {self.file_path.name}")
+
+         except FileNotFoundError as fnf_e:
+             self.logger.error(f"File not found during CsvHandler initialization: {fnf_e}")
+             raise CsvHandlerError(str(fnf_e)) from fnf_e
+         except pd.errors.EmptyDataError as empty_e:
+             self.logger.error(f"CSV file is empty: {empty_e}")
+             raise CsvHandlerError(f"CSV file '{self.file_path.name}' is empty") from empty_e
+         except pd.errors.ParserError as parse_e:
+             self.logger.error(f"CSV parsing error: {parse_e}")
+             raise CsvHandlerError(f"Failed to parse CSV file '{self.file_path.name}': {parse_e}") from parse_e
+         except Exception as e:
+             self.logger.exception(f"Unexpected error initializing CsvHandler for {self.file_path.name}")
+             raise CsvHandlerError(f"Failed to initialize CSV handler '{self.file_path.name}': {e}") from e
+
+     def _read_csv_file(self) -> None:
+         """Read the CSV file using pandas with appropriate settings."""
+         try:
+             # Read CSV with automatic type inference
+             self.df = pd.read_csv(
+                 self.file_path,
+                 # Basic settings
+                 encoding='utf-8',
+                 # Handle various separators automatically
+                 sep=None,  # Let pandas auto-detect
+                 engine='python',  # More flexible parsing
+                 # Preserve original string representation for better type info
+                 dtype=str,  # Read everything as string first
+                 na_values=['', 'NULL', 'null', 'None', 'N/A', 'n/a', 'NaN', 'nan'],
+                 keep_default_na=True,
+             )
+
+             # Store original dtypes before conversion
+             self._original_dtypes = {col: 'string' for col in self.df.columns}
+
+             # Try to infer better types
+             self._infer_types()
+
+             self.logger.debug(f"Successfully read CSV with shape: {self.df.shape}")
+
+         except UnicodeDecodeError:
+             # Try with different encodings
+             for encoding in ['latin1', 'cp1252', 'iso-8859-1']:
+                 try:
+                     self.logger.warning(f"Trying encoding: {encoding}")
+                     self.df = pd.read_csv(
+                         self.file_path,
+                         encoding=encoding,
+                         sep=None,
+                         engine='python',
+                         dtype=str,
+                         na_values=['', 'NULL', 'null', 'None', 'N/A', 'n/a', 'NaN', 'nan'],
+                         keep_default_na=True,
+                     )
+                     self._original_dtypes = {col: 'string' for col in self.df.columns}
+                     self._infer_types()
+                     self.logger.info(f"Successfully read CSV with encoding: {encoding}")
+                     break
+                 except UnicodeDecodeError:
+                     continue
+             else:
+                 raise CsvHandlerError("Could not decode CSV file with any common encoding")
+
+     def _infer_types(self) -> None:
+         """Infer appropriate data types for columns."""
+         if self.df is None:
+             return
+
+         for col in self.df.columns:
+             # Try to convert to numeric
+             numeric_converted = pd.to_numeric(self.df[col], errors='coerce')
+             if not numeric_converted.isna().all():
+                 # If most values can be converted to numeric, use numeric type
+                 non_na_original = self.df[col].notna().sum()
+                 non_na_converted = numeric_converted.notna().sum()
+
+                 if non_na_converted / max(non_na_original, 1) > 0.8:  # 80% conversion success
+                     self.df[col] = numeric_converted
+                     if (numeric_converted.dropna() % 1 == 0).all():
+                         self._original_dtypes[col] = 'integer'
+                     else:
+                         self._original_dtypes[col] = 'float'
+                     continue
+
+             # Try to convert to datetime
+             try:
+                 datetime_converted = pd.to_datetime(self.df[col], errors='coerce')
+                 if not datetime_converted.isna().all():
+                     non_na_original = self.df[col].notna().sum()
+                     non_na_converted = datetime_converted.notna().sum()
+
+                     if non_na_converted / max(non_na_original, 1) > 0.8:  # 80% conversion success
+                         self.df[col] = datetime_converted
+                         self._original_dtypes[col] = 'datetime'
+                         continue
+             except (ValueError, TypeError):
+                 pass
+
+             # Try to convert to boolean
+             bool_values = self.df[col].str.lower().isin(['true', 'false', 't', 'f', '1', '0', 'yes', 'no', 'y', 'n'])
+             if bool_values.sum() / max(self.df[col].notna().sum(), 1) > 0.8:
+                 bool_mapping = {
+                     'true': True, 'false': False, 't': True, 'f': False,
+                     '1': True, '0': False, 'yes': True, 'no': False,
+                     'y': True, 'n': False
+                 }
+                 self.df[col] = self.df[col].str.lower().map(bool_mapping)
+                 self._original_dtypes[col] = 'boolean'
+                 continue
+
+             # Keep as string
+             self._original_dtypes[col] = 'string'
+
+     def close(self) -> None:
+         """Close and cleanup resources (CSV data is held in memory)."""
+         if self.df is not None:
+             self.logger.info(f"Closed CSV handler for: {self.file_path.name}")
+         self.df = None
+         self._original_dtypes = None
+
+     def get_metadata_summary(self) -> Dict[str, Any]:
+         """
+         Get a summary dictionary of the CSV file's metadata.
+
+         Returns:
+             A dictionary containing metadata like file path, format, row count, columns, size.
+         """
+         if self.df is None:
+             return {"error": "CSV data not loaded or handler closed."}
+
+         try:
+             file_size = self.file_path.stat().st_size
+             size_str = self.format_size(file_size)
+         except Exception as e:
+             self.logger.warning(f"Could not get file size for {self.file_path}: {e}")
+             size_str = "N/A"
+
+         # Create a well-structured metadata summary
+         summary = {
+             "File Information": {
+                 "Path": str(self.file_path),
+                 "Format": "CSV",
+                 "Size": size_str
+             },
+             "Data Structure": {
+                 "Total Rows": f"{len(self.df):,}",
+                 "Total Columns": f"{len(self.df.columns):,}",
+                 "Memory Usage": f"{self.df.memory_usage(deep=True).sum():,} bytes"
+             },
+             "Column Types Summary": self._get_column_types_summary()
+         }
+
+         return summary
+
+     def _get_column_types_summary(self) -> Dict[str, str]:
+         """Get a summary of column types in the CSV data."""
+         if self.df is None or self._original_dtypes is None:
+             return {}
+
+         type_counts = {}
+         for col_type in self._original_dtypes.values():
+             type_counts[col_type] = type_counts.get(col_type, 0) + 1
+
+         # Format for better display
+         formatted_summary = {}
+         type_labels = {
+             'string': 'Text Columns',
+             'integer': 'Integer Columns',
+             'float': 'Numeric Columns',
+             'datetime': 'Date/Time Columns',
+             'boolean': 'Boolean Columns'
+         }
+
+         for type_key, count in type_counts.items():
+             label = type_labels.get(type_key, f'{type_key.title()} Columns')
+             formatted_summary[label] = f"{count:,}"
+
+         return formatted_summary
+
+     def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
+         """
+         Get the schema of the CSV data.
+
+         Returns:
+             A list of dictionaries describing columns (name, type, nullable),
+             or None if schema couldn't be determined.
+         """
+         if self.df is None:
+             self.logger.warning("DataFrame is not available for schema data")
+             return None
+
+         schema_list = []
+
+         for col in self.df.columns:
+             try:
+                 # Get the inferred type
+                 col_type = self._original_dtypes.get(col, 'string')
+
+                 # Check for null values
+                 has_nulls = self.df[col].isna().any()
+
+                 schema_list.append({
+                     "name": str(col),
+                     "type": col_type,
+                     "nullable": bool(has_nulls)
+                 })
+
+             except Exception as e:
+                 self.logger.error(f"Error processing column '{col}' for schema data: {e}")
+                 schema_list.append({
+                     "name": str(col),
+                     "type": f"[Error: {e}]",
+                     "nullable": None
+                 })
+
+         return schema_list
+
+     def get_data_preview(self, num_rows: int = 50) -> Optional[pd.DataFrame]:
+         """
+         Fetch a preview of the data.
+
+         Args:
+             num_rows: The maximum number of rows to fetch.
+
+         Returns:
+             A pandas DataFrame with preview data, an empty DataFrame if no data,
+             or a DataFrame with an 'error' column on failure.
+         """
+         if self.df is None:
+             self.logger.warning("CSV data not available for preview")
+             return pd.DataFrame({"error": ["CSV data not loaded or handler closed."]})
+
+         try:
+             if self.df.empty:
+                 self.logger.info("CSV file has no data rows")
+                 return pd.DataFrame(columns=self.df.columns)
+
+             # Return first num_rows
+             preview_df = self.df.head(num_rows).copy()
+             self.logger.info(f"Generated preview of {len(preview_df)} rows for {self.file_path.name}")
+             return preview_df
+
+         except Exception as e:
+             self.logger.exception(f"Error generating data preview from CSV file: {self.file_path.name}")
+             return pd.DataFrame({"error": [f"Failed to generate preview: {e}"]})
+
+     def get_column_stats(self, column_name: str) -> Dict[str, Any]:
+         """
+         Calculate and return statistics for a specific column.
+
+         Args:
+             column_name: The name of the column.
+
+         Returns:
+             A dictionary containing column statistics or error information.
+         """
+         if self.df is None:
+             return self._create_stats_result(
+                 column_name, "Unknown", {}, error="CSV data not loaded or handler closed."
+             )
+
+         if column_name not in self.df.columns:
+             return self._create_stats_result(
+                 column_name, "Unknown", {}, error=f"Column '{column_name}' not found in CSV data."
+             )
+
+         try:
+             col_series = self.df[column_name]
+             col_type = self._original_dtypes.get(column_name, 'string')
+
+             # Basic counts
+             total_count = len(col_series)
+             null_count = col_series.isna().sum()
+             valid_count = total_count - null_count
+             null_percentage = (null_count / total_count * 100) if total_count > 0 else 0
+
+             stats = {
+                 "Total Count": f"{total_count:,}",
+                 "Valid Count": f"{valid_count:,}",
+                 "Null Count": f"{null_count:,}",
+                 "Null Percentage": f"{null_percentage:.2f}%"
+             }
+
+             # Type-specific statistics
+             if valid_count > 0:
+                 valid_series = col_series.dropna()
+
+                 # Distinct count (always applicable)
+                 distinct_count = valid_series.nunique()
+                 stats["Distinct Count"] = f"{distinct_count:,}"
+
+                 if col_type in ['integer', 'float']:
+                     # Numeric statistics
+                     stats.update(self._calculate_numeric_stats_pandas(valid_series))
+                 elif col_type == 'datetime':
+                     # Datetime statistics
+                     stats.update(self._calculate_datetime_stats_pandas(valid_series))
+                 elif col_type == 'boolean':
+                     # Boolean statistics
+                     stats.update(self._calculate_boolean_stats_pandas(valid_series))
+                 elif col_type == 'string':
+                     # String statistics (min/max by alphabetical order)
+                     stats.update(self._calculate_string_stats_pandas(valid_series))
+
+             return self._create_stats_result(column_name, col_type, stats, nullable=null_count > 0)
+
+         except Exception as e:
+             self.logger.exception(f"Error calculating stats for column '{column_name}'")
+             return self._create_stats_result(
+                 column_name, "Unknown", {}, error=f"Failed to calculate statistics: {e}"
+             )
+
+     def _calculate_numeric_stats_pandas(self, series: pd.Series) -> Dict[str, Any]:
+         """Calculate statistics for numeric columns using pandas."""
+         stats = {}
+         try:
+             stats["Min"] = series.min()
+             stats["Max"] = series.max()
+             stats["Mean"] = f"{series.mean():.4f}"
+             stats["Median (50%)"] = series.median()
+             stats["StdDev"] = f"{series.std():.4f}"
+
+             # Add histogram data for visualization
+             try:
+                 # Sample data if too large for performance
+                 sample_size = min(10000, len(series))
+                 if len(series) > sample_size:
+                     sampled_series = series.sample(n=sample_size, random_state=42)
+                 else:
+                     sampled_series = series
+
+                 # Convert to list for histogram
+                 clean_data = sampled_series.tolist()
+
+                 if len(clean_data) > 10:  # Only create histogram if we have enough data
+                     stats["_histogram_data"] = clean_data
+                     stats["_data_type"] = "numeric"
+
+             except Exception as e:
+                 self.logger.warning(f"Failed to prepare histogram data: {e}")
+
+         except Exception as e:
+             self.logger.warning(f"Error calculating numeric stats: {e}")
+             stats["Calculation Error"] = str(e)
+         return stats
+
+     def _calculate_datetime_stats_pandas(self, series: pd.Series) -> Dict[str, Any]:
+         """Calculate statistics for datetime columns using pandas."""
+         stats = {}
+         try:
+             stats["Min"] = series.min()
+             stats["Max"] = series.max()
+             # Calculate time range
+             time_range = series.max() - series.min()
+             stats["Range"] = str(time_range)
+         except Exception as e:
+             self.logger.warning(f"Error calculating datetime stats: {e}")
+             stats["Calculation Error"] = str(e)
+         return stats
+
+     def _calculate_boolean_stats_pandas(self, series: pd.Series) -> Dict[str, Any]:
+         """Calculate statistics for boolean columns using pandas."""
+         stats = {}
+         try:
+             value_counts = series.value_counts()
+             stats["True Count"] = f"{value_counts.get(True, 0):,}"
+             stats["False Count"] = f"{value_counts.get(False, 0):,}"
+             if len(value_counts) > 0:
+                 true_pct = (value_counts.get(True, 0) / len(series) * 100)
+                 stats["True Percentage"] = f"{true_pct:.2f}%"
+         except Exception as e:
+             self.logger.warning(f"Error calculating boolean stats: {e}")
+             stats["Calculation Error"] = str(e)
+         return stats
+
+     def _calculate_string_stats_pandas(self, series: pd.Series) -> Dict[str, Any]:
+         """Calculate statistics for string columns using pandas."""
+         stats = {}
+         try:
+             # Only min/max for strings (alphabetical order)
+             stats["Min"] = str(series.min())
+             stats["Max"] = str(series.max())
+
+             # Most common values
+             value_counts = series.value_counts().head(5)
+             if len(value_counts) > 0:
+                 top_values = {}
+                 for value, count in value_counts.items():
+                     top_values[str(value)] = f"{count:,}"
+                 stats["Top Values"] = top_values
+         except Exception as e:
+             self.logger.warning(f"Error calculating string stats: {e}")
+             stats["Calculation Error"] = str(e)
+         return stats
+
+     def _create_stats_result(
+         self,
+         column_name: str,
+         col_type: str,
+         calculated_stats: Dict[str, Any],
+         nullable: Optional[bool] = None,
+         error: Optional[str] = None,
+         message: Optional[str] = None
+     ) -> Dict[str, Any]:
+         """Package the stats results consistently."""
+         return {
+             "column": column_name,
+             "type": col_type,
+             "nullable": nullable if nullable is not None else "Unknown",
+             "calculated": calculated_stats or {},
+             "error": error,
+             "message": message,
+         }
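
For orientation, here is a minimal usage sketch of the new handler. This is not part of the diff: the import path, the file name, and the "price" column are assumptions; the constructor and the get_metadata_summary, get_schema_data, get_data_preview, get_column_stats, and close methods are exactly those shown in the diff above.

from pathlib import Path

# Assumed import path; parqv/data_sources/__init__.py (also new in 0.3.0)
# may re-export CsvHandler under a shorter name.
from parqv.data_sources.formats.csv import CsvHandler, CsvHandlerError

handler = None
try:
    handler = CsvHandler(Path("data/example.csv"))  # hypothetical input file
    print(handler.get_metadata_summary())           # file info, row/column counts, type summary
    for column in handler.get_schema_data() or []:  # inferred name/type/nullable per column
        print(column["name"], column["type"], column["nullable"])
    print(handler.get_data_preview(num_rows=10))    # pandas DataFrame of the first rows
    print(handler.get_column_stats("price"))        # "price" is a hypothetical column name
except CsvHandlerError as exc:
    print(f"Could not load CSV: {exc}")
finally:
    if handler is not None:
        handler.close()                             # drops the in-memory DataFrame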