parqv 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
parqv/handlers/json.py ADDED
@@ -0,0 +1,450 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import duckdb
6
+ import pandas as pd
7
+
8
+ from .base_handler import DataHandler, DataHandlerError
9
+
10
# Module-level logger, named after this module via __name__; every handler
# message in this file is emitted through it.
log = logging.getLogger(__name__)
11
+
12
+
13
class JsonHandlerError(DataHandlerError):
    """Error raised by the JSON handler; a specialisation of DataHandlerError."""
16
+
17
+
18
class JsonHandler(DataHandler):
    """
    Handles JSON file interactions using DuckDB.

    The file is exposed through a DuckDB view built on `read_json_auto`
    (standard JSON and JSON Lines / ndjson). Schema and row count are read
    once at construction; per-column statistics come from `SUMMARIZE` for
    simple types and from plain counts for nested types.

    Attributes:
        file_path (Path): Resolved path to the JSON file.
    """
    DEFAULT_VIEW_NAME = "json_data_view"

    def __init__(self, file_path: Path):
        """
        Initializes the JsonHandler.

        Args:
            file_path: Path to the JSON file.

        Raises:
            JsonHandlerError: If the file doesn't exist, isn't a file, or if
                the DuckDB connection or view creation fails. A failure while
                reading the schema / row count is only logged and leaves both
                as None.
        """
        # Assign all state before validating the path: if validation raises,
        # the half-built object is still finalized via __del__ -> close(),
        # which must be able to find `_db_conn`.
        self._db_conn: Optional[duckdb.DuckDBPyConnection] = None
        self._view_name: str = self.DEFAULT_VIEW_NAME
        self._schema: Optional[List[Dict[str, Any]]] = None
        self._row_count: Optional[int] = None
        self.file_path = self._validate_file_path(file_path)

        try:
            self._connect_db()
            self._create_duckdb_view()
            self._load_metadata()
            log.info(f"JsonHandler initialized successfully for: {self.file_path}")
        except Exception as e:
            log.exception(f"Error during JsonHandler initialization for {self.file_path}")
            # Release the connection before propagating so it does not leak.
            self.close()
            if isinstance(e, JsonHandlerError):
                raise
            raise JsonHandlerError(f"Failed to initialize JSON handler: {e}") from e

    @staticmethod
    def _quote_identifier(identifier: str) -> str:
        """Returns `identifier` as a double-quoted SQL identifier, doubling embedded double quotes."""
        return '"' + str(identifier).replace('"', '""') + '"'

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Returns `value` as a single-quoted SQL string literal, doubling embedded single quotes."""
        return "'" + str(value).replace("'", "''") + "'"

    @staticmethod
    def _to_float(value: Any) -> Optional[float]:
        """Converts a number, Decimal or string such as '12.5%' to float; returns None if not parseable."""
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            pass
        try:
            return float(str(value).strip().rstrip('%'))
        except ValueError:
            return None

    @staticmethod
    def _first_present(row: pd.Series, *keys: str) -> Any:
        """Returns the first non-missing value stored in `row` under any of `keys`, else None."""
        for key in keys:
            if key in row and pd.notna(row[key]):
                return row[key]
        return None

    def _validate_file_path(self, file_path: Path) -> Path:
        """
        Resolves `file_path` and checks that it points to an existing file.

        Raises:
            JsonHandlerError: If the resolved path is not a regular file.
        """
        # Path(...) lets callers pass a plain string as well as a Path.
        resolved_path = Path(file_path).resolve()
        if not resolved_path.is_file():
            raise JsonHandlerError(f"JSON file not found or is not a file: {resolved_path}")
        return resolved_path

    def _connect_db(self):
        """
        Establishes a connection to an in-memory DuckDB database.

        Raises:
            JsonHandlerError: If the connection cannot be created.
        """
        try:
            self._db_conn = duckdb.connect(database=':memory:', read_only=False)
            log.debug("DuckDB in-memory connection established.")
        except Exception as e:
            log.exception("Failed to initialize DuckDB connection.")
            raise JsonHandlerError(f"DuckDB connection failed: {e}") from e

    def _create_duckdb_view(self):
        """
        Creates a DuckDB view pointing to the JSON file.

        Raises:
            JsonHandlerError: If no connection is open or DuckDB cannot open
                or parse the file.
        """
        if not self._db_conn:
            raise JsonHandlerError("DuckDB connection not available for view creation.")

        file_path_str = str(self.file_path)
        # The path is embedded in the statement as a string literal, so any
        # quote character in it must be escaped to keep the SQL well-formed.
        load_query = (
            f"CREATE OR REPLACE VIEW {self._quote_identifier(self._view_name)} AS "
            f"SELECT * FROM read_json_auto({self._quote_literal(file_path_str)});"
        )

        try:
            self._db_conn.sql(load_query)
            log.debug(f"DuckDB view '{self._view_name}' created for file '{file_path_str}'.")
        except duckdb.Error as db_err:
            log.exception(f"DuckDB Error creating view '{self._view_name}' from '{file_path_str}': {db_err}")
            err_text = str(db_err)
            # Map the common DuckDB failure texts to more helpful messages.
            if "Could not open file" in err_text:
                raise JsonHandlerError(
                    f"DuckDB could not open file: {file_path_str}. Check permissions or path. Error: {db_err}") from db_err
            if "JSON Error" in err_text or "Parse Error" in err_text:
                raise JsonHandlerError(
                    f"DuckDB failed to parse JSON file: {file_path_str}. Check format. Error: {db_err}") from db_err
            raise JsonHandlerError(f"DuckDB failed create view for JSON file: {db_err}") from db_err
        except Exception as e:
            log.exception(f"Unexpected error creating DuckDB view '{self._view_name}'.")
            raise JsonHandlerError(f"Failed to create DuckDB view: {e}") from e

    def _load_metadata(self):
        """
        Fetches schema and row count from the DuckDB view.

        Query failures are logged and reset both values to None rather than
        propagating, so the handler stays usable with metadata marked missing.

        Raises:
            JsonHandlerError: If no connection is open.
        """
        if not self._db_conn:
            raise JsonHandlerError("Cannot fetch metadata, DuckDB connection not available.")

        quoted_view = self._quote_identifier(self._view_name)
        try:
            # Fetch Schema
            schema_result = self._db_conn.sql(f"DESCRIBE {quoted_view};").fetchall()
            self._schema = self._parse_schema(schema_result)
            log.debug(f"Schema fetched for view '{self._view_name}': {len(self._schema)} columns.")

            # Fetch Row Count
            count_result = self._db_conn.sql(f"SELECT COUNT(*) FROM {quoted_view};").fetchone()
            self._row_count = count_result[0] if count_result else 0
            log.debug(f"Row count fetched for view '{self._view_name}': {self._row_count}")

        except duckdb.Error as db_err:
            log.exception(f"DuckDB Error fetching metadata for view '{self._view_name}': {db_err}")
            self._schema = None
            self._row_count = None
        except Exception:
            log.exception(f"Unexpected error fetching metadata for view '{self._view_name}'")
            self._schema = None
            self._row_count = None

    def _parse_schema(self, describe_output: List[Tuple]) -> List[Dict[str, Any]]:
        """
        Parses the output of DuckDB's DESCRIBE query.

        Args:
            describe_output: Rows whose first three fields are column name,
                type and nullability.

        Returns:
            One dict per usable row with keys "name", "type" and "nullable"
            (True/False, or None when the nullability field is not a string).
            Rows with fewer than three fields are logged and skipped.
        """
        if not describe_output:
            log.warning(f"DESCRIBE query for view '{self._view_name}' returned no schema info.")
            return []

        parsed_schema = []
        for row in describe_output:
            # Handle potential variations in DESCRIBE output columns
            if len(row) < 3:
                log.warning(f"Unexpected format in DESCRIBE output row: {row}")
                continue
            name, type_str, null_str = row[0], row[1], row[2]
            is_nullable = null_str.upper() == 'YES' if isinstance(null_str, str) else None
            parsed_schema.append({"name": name, "type": type_str, "nullable": is_nullable})
        return parsed_schema

    def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
        """
        Returns the schema of the JSON data.

        Returns:
            A list of dictionaries describing columns (name, type, nullable),
            or None if schema couldn't be fetched.
        """
        if self._schema is None:
            log.warning("Schema is unavailable. It might not have been fetched successfully.")
        return self._schema

    def get_metadata_summary(self) -> Dict[str, Any]:
        """
        Provides a summary dictionary of the JSON file's metadata.

        Returns:
            A dictionary with file path, format, view name, row count, column
            count and size, or a single "error" entry when the connection is
            closed or was never opened.
        """
        if not self._db_conn:
            return {"error": "DuckDB connection not initialized or closed."}

        row_count_str = f"{self._row_count:,}" if self._row_count is not None else "N/A (Error fetching)"
        columns_str = f"{len(self._schema):,}" if self._schema is not None else "N/A (Error fetching)"

        summary = {
            "File Path": str(self.file_path),
            "Format": "JSON/JSONL",
            "DuckDB View": self._view_name,
            "Total Rows": row_count_str,
            "Columns": columns_str,
        }
        try:
            summary["Size"] = f"{self.file_path.stat().st_size:,} bytes"
        except OSError as e:
            # The file may have been removed or become unreadable since init.
            log.warning(f"Could not get file size for {self.file_path}: {e}")
            summary["Size"] = "N/A"

        return summary

    def get_data_preview(self, num_rows: int = 50) -> pd.DataFrame:
        """
        Fetches a preview of the data.

        Args:
            num_rows: The maximum number of rows to preview; values below 1
                are raised to 1.

        Returns:
            A pandas DataFrame containing the first `num_rows` of data,
            an empty DataFrame if the file is empty, or a DataFrame with an
            "error" column if fetching fails.
        """
        if not self._db_conn:
            log.warning("Data preview unavailable: DuckDB connection is closed or uninitialized.")
            return pd.DataFrame({"error": ["DuckDB connection not available."]})
        if self._schema is None:
            log.warning("Data preview unavailable: Schema couldn't be determined.")
            return pd.DataFrame({"error": ["Schema not available, cannot fetch preview."]})
        if self._row_count == 0:
            log.info("Data preview: Source JSON view is empty.")
            # Return empty DataFrame with correct columns if possible
            if self._schema:
                return pd.DataFrame(columns=[col['name'] for col in self._schema])
            return pd.DataFrame()  # Fallback

        try:
            # int() keeps anything non-integral out of the LIMIT clause.
            limit = max(1, int(num_rows))
            preview_query = f"SELECT * FROM {self._quote_identifier(self._view_name)} LIMIT {limit};"
            return self._db_conn.sql(preview_query).df()
        except duckdb.Error as db_err:
            log.exception(f"DuckDB error getting data preview from '{self._view_name}': {db_err}")
            return pd.DataFrame({"error": [f"DuckDB error fetching preview: {db_err}"]})
        except Exception as e:
            log.exception(f"Unexpected error getting data preview from '{self._view_name}'")
            return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]})

    def _get_column_info(self, column_name: str) -> Optional[Dict[str, Any]]:
        """Returns the schema entry whose "name" equals `column_name`, or None if absent or no schema is loaded."""
        if self._schema is None:
            return None
        return next((col for col in self._schema if col["name"] == column_name), None)

    def _is_complex_type(self, dtype_str: str) -> bool:
        """
        Checks if a DuckDB data type string represents a complex (nested) type.

        Returns:
            True for STRUCT / LIST / MAP / UNION types and for types written
            with a bracket suffix; False otherwise, including for non-string
            input.
        """
        if not isinstance(dtype_str, str):
            return False
        dtype_upper = dtype_str.strip().upper()
        # DuckDB spells list and array types with a bracket suffix
        # ("BIGINT[]", "INTEGER[3]"), which never contains the word LIST.
        if dtype_upper.endswith(']'):
            return True
        return any(t in dtype_upper for t in ('STRUCT', 'LIST', 'MAP', 'UNION'))

    def get_column_stats(self, column_name: str) -> Dict[str, Any]:
        """
        Calculates statistics for a given column using DuckDB's SUMMARIZE or basic counts.

        Args:
            column_name: The name of the column to analyze.

        Returns:
            A dictionary containing calculated statistics, type information, and
            any errors or messages. Failures are reported through the "error"
            entry instead of being raised.
        """
        if not self._db_conn:
            return self._create_stats_result(column_name, "Unknown", {}, error="DuckDB connection not available.")

        col_info = self._get_column_info(column_name)
        if not col_info:
            return self._create_stats_result(column_name, "Unknown", {},
                                             error=f"Column '{column_name}' not found in schema.")

        col_type = col_info["type"]
        col_nullable = col_info["nullable"]  # Already boolean or None

        if self._row_count == 0:
            return self._create_stats_result(
                column_name, col_type, {}, nullable=col_nullable,
                message="Table is empty. No statistics calculated.")

        # JSON keys may contain double quotes, so identifiers are escaped
        # rather than merely wrapped in quotes.
        quoted_column = self._quote_identifier(column_name)
        quoted_view = self._quote_identifier(self._view_name)
        stats: Dict[str, Any] = {}
        error_msg: Optional[str] = None
        message: Optional[str] = None

        try:
            if self._is_complex_type(col_type):
                # Use basic counts for complex types as SUMMARIZE is less informative
                log.debug(f"Calculating basic counts for complex type column: {column_name}")
                stats = self._get_basic_column_counts(quoted_column)
                message = f"Only basic counts calculated for complex type '{col_type}'."
                # Attempt distinct count for complex types (can be slow/error-prone)
                try:
                    distinct_q = (
                        f"SELECT COUNT(DISTINCT {quoted_column}) FROM {quoted_view} "
                        f"WHERE {quoted_column} IS NOT NULL;"
                    )
                    distinct_res = self._db_conn.sql(distinct_q).fetchone()
                    if distinct_res and distinct_res[0] is not None:
                        stats["Distinct Count"] = f"{distinct_res[0]:,}"
                    else:
                        stats["Distinct Count"] = "N/A"
                except duckdb.Error as distinct_err:
                    log.warning(
                        f"Could not calculate distinct count for complex column '{column_name}': {distinct_err}")
                    stats["Distinct Count"] = "Error"
            else:
                # Use SUMMARIZE for non-complex types
                log.debug(f"Using SUMMARIZE for simple type column: {column_name}")
                summarize_query = f"SUMMARIZE SELECT {quoted_column} FROM {quoted_view};"
                summarize_df = self._db_conn.sql(summarize_query).df()

                if summarize_df.empty:
                    message = "SUMMARIZE returned no results (column might be all NULLs or empty)."
                    # Get basic counts as fallback if summarize is empty
                    stats = self._get_basic_column_counts(quoted_column)
                else:
                    # A single column was selected, so its summary is row 0.
                    stats = self._format_summarize_stats(summarize_df.iloc[0])

        except duckdb.Error as db_err:
            log.exception(f"DuckDB Error calculating statistics for column '{column_name}': {db_err}")
            error_msg = f"DuckDB calculation failed: {db_err}"
        except Exception as e:
            log.exception(f"Unexpected error calculating statistics for column '{column_name}'")
            error_msg = f"Calculation failed unexpectedly: {e}"

        return self._create_stats_result(
            column_name, col_type, stats, nullable=col_nullable, error=error_msg, message=message
        )

    def _get_basic_column_counts(self, safe_column_name: str) -> Dict[str, Any]:
        """
        Helper to get total, null, and valid counts for a column.

        Args:
            safe_column_name: The column name, already quoted for use in SQL.

        Returns:
            A dict with "Total Count", "Valid Count", "Null Count" and
            "Null Percentage"; an "error" entry when the connection or row
            count is unavailable; or a "Counts Error" entry when the query fails.
        """
        stats: Dict[str, Any] = {}
        if not self._db_conn or self._row_count is None:
            return {"error": "Connection or row count unavailable for basic counts"}

        if self._row_count == 0:
            stats["Total Count"] = "0"
            stats["Valid Count"] = "0"
            stats["Null Count"] = "0"
            stats["Null Percentage"] = "N/A"
            return stats

        try:
            q_counts = f"""
            SELECT
                SUM(CASE WHEN {safe_column_name} IS NULL THEN 1 ELSE 0 END) AS null_count
            FROM {self._quote_identifier(self._view_name)};
            """
            counts_res = self._db_conn.sql(q_counts).fetchone()

            if counts_res:
                null_count = counts_res[0] if counts_res[0] is not None else 0
                total_count = self._row_count
                valid_count = total_count - null_count
                stats["Total Count"] = f"{total_count:,}"
                stats["Valid Count"] = f"{valid_count:,}"
                stats["Null Count"] = f"{null_count:,}"
                stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%" if total_count > 0 else "N/A"
            else:
                stats["Total Count"] = f"{self._row_count:,}"
                stats["Valid Count"] = "Error"
                stats["Null Count"] = "Error"
                stats["Null Percentage"] = "Error"

        except duckdb.Error as db_err:
            log.warning(f"Failed to get basic counts for {safe_column_name}: {db_err}")
            stats["Counts Error"] = str(db_err)
        return stats

    def _format_summarize_stats(self, summarize_row: pd.Series) -> Dict[str, Any]:
        """
        Formats the output of DuckDB's SUMMARIZE into a stats dictionary.

        Args:
            summarize_row: One row of the SUMMARIZE result.

        Returns:
            A dict of display-ready statistics. Entries whose source value is
            missing are omitted, except the four count entries which fall back
            to "N/A".
        """
        stats: Dict[str, Any] = {}

        raw_count = self._first_present(summarize_row, 'count')
        if raw_count is None:
            stats["Total Count"] = "N/A"
            stats["Valid Count"] = "N/A"
            stats["Null Count"] = "N/A"
            stats["Null Percentage"] = "N/A"
        else:
            total_count = int(raw_count)
            stats["Total Count"] = f"{total_count:,}"
            raw_null = self._first_present(summarize_row, 'null_percentage')
            # The percentage may arrive as a float, a Decimal or a string such
            # as "12.5%", so it is normalised before any arithmetic.
            null_perc = self._to_float(raw_null)
            if raw_null is None:
                # Assume 0 if missing
                stats["Null Percentage"] = "0.00%"
                stats["Null Count"] = "0"
                stats["Valid Count"] = f"{total_count:,}"
            elif null_perc is None:
                # Present but unparseable: report unknown rather than a wrong 0.
                stats["Null Percentage"] = "N/A"
                stats["Null Count"] = "N/A"
                stats["Valid Count"] = "N/A"
            else:
                null_count = int(round(total_count * (null_perc / 100.0)))
                stats["Null Percentage"] = f"{null_perc:.2f}%"
                stats["Null Count"] = f"{null_count:,}"
                stats["Valid Count"] = f"{total_count - null_count:,}"

        # Distinct Count. SUMMARIZE names this column `approx_unique` (an
        # approximate figure); `distinct` is kept as a fallback spelling.
        distinct = self._first_present(summarize_row, 'approx_unique', 'distinct')
        if distinct is not None:
            stats["Distinct Count"] = f"{int(distinct):,}"

        # Numeric Stats
        min_value = self._first_present(summarize_row, 'min')
        if min_value is not None:
            stats["Min"] = min_value
        max_value = self._first_present(summarize_row, 'max')
        if max_value is not None:
            stats["Max"] = max_value

        # SUMMARIZE names the mean `avg`; `mean` is kept as a fallback spelling.
        mean_value = self._first_present(summarize_row, 'avg', 'mean')
        if mean_value is not None:
            try:
                stats["Mean"] = f"{float(mean_value):.4f}"
            except (ValueError, TypeError):
                stats["Mean"] = str(mean_value)

        std_value = self._first_present(summarize_row, 'std')
        if std_value is not None:
            try:
                stats["StdDev"] = f"{float(std_value):.4f}"
            except (ValueError, TypeError):
                stats["StdDev"] = str(std_value)

        # Median. SUMMARIZE names the quantile columns q25 / q50 / q75; `50%`
        # is kept as a fallback spelling.
        median = self._first_present(summarize_row, 'q50', '50%')
        if median is not None:
            stats["Median (50%)"] = median

        return stats

    def _create_stats_result(
            self,
            column_name: str,
            col_type: str,
            calculated_stats: Dict[str, Any],
            nullable: Optional[bool] = None,
            error: Optional[str] = None,
            message: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Packages the stats results consistently.

        Returns:
            A dict with the keys "column", "type", "nullable" ("Unknown" when
            None), "calculated", "basic_metadata_stats" and
            "metadata_stats_error" (both always None here), "error" and
            "message".
        """
        return {
            "column": column_name,
            "type": col_type,
            "nullable": nullable if nullable is not None else "Unknown",
            "calculated": calculated_stats or {},
            "basic_metadata_stats": None,
            "metadata_stats_error": None,
            "error": error,
            "message": message,
        }

    def close(self):
        """Closes the DuckDB connection if it's open; safe to call repeatedly and on a partially built instance."""
        # getattr: __del__ can reach this even when __init__ failed early.
        conn = getattr(self, "_db_conn", None)
        if not conn:
            return
        file_path = getattr(self, "file_path", None)
        try:
            conn.close()
            log.info(f"DuckDB connection closed for {file_path}.")
        except Exception as e:
            # Log error but don't raise during close typically
            log.error(f"Error closing DuckDB connection for {file_path}: {e}")
        finally:
            # Assume closed even if an error occurred.
            self._db_conn = None

    def __enter__(self):
        """Enter context management."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit context management, ensuring connection closure."""
        self.close()

    def __del__(self):
        """Ensures connection is closed when object is garbage collected (best effort)."""
        try:
            self.close()
        except Exception:
            # Finalizers may run during interpreter shutdown, when logging or
            # module globals are already torn down; nothing useful can be done.
            pass