parqv 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
- parqv/__init__.py +31 -0
- parqv/app.py +84 -102
- parqv/cli.py +112 -0
- parqv/core/__init__.py +31 -0
- parqv/core/config.py +26 -0
- parqv/core/file_utils.py +88 -0
- parqv/core/handler_factory.py +90 -0
- parqv/core/logging.py +46 -0
- parqv/data_sources/__init__.py +48 -0
- parqv/data_sources/base/__init__.py +28 -0
- parqv/data_sources/base/exceptions.py +38 -0
- parqv/{handlers/base_handler.py → data_sources/base/handler.py} +54 -25
- parqv/{handlers → data_sources/formats}/__init__.py +13 -5
- parqv/data_sources/formats/csv.py +460 -0
- parqv/{handlers → data_sources/formats}/json.py +68 -32
- parqv/{handlers → data_sources/formats}/parquet.py +67 -56
- parqv/views/__init__.py +38 -0
- parqv/views/base.py +98 -0
- parqv/views/components/__init__.py +13 -0
- parqv/views/components/enhanced_data_table.py +152 -0
- parqv/views/components/error_display.py +72 -0
- parqv/views/components/loading_display.py +44 -0
- parqv/views/data_view.py +119 -46
- parqv/views/metadata_view.py +57 -20
- parqv/views/schema_view.py +190 -200
- parqv/views/utils/__init__.py +19 -0
- parqv/views/utils/data_formatters.py +184 -0
- parqv/views/utils/stats_formatters.py +220 -0
- parqv/views/utils/visualization.py +204 -0
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/METADATA +5 -6
- parqv-0.3.0.dist-info/RECORD +36 -0
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/WHEEL +1 -1
- parqv-0.2.0.dist-info/RECORD +0 -17
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/entry_points.txt +0 -0
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {parqv-0.2.0.dist-info → parqv-0.3.0.dist-info}/top_level.txt +0 -0
parqv/{handlers → data_sources/formats}/json.py
CHANGED
(Note: the diff viewer this listing was extracted from did not capture the old text of replaced lines; such removals appear below as bare "-" markers.)

```diff
@@ -1,13 +1,10 @@
-import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 import duckdb
 import pandas as pd
 
-from
-
-log = logging.getLogger(__name__)
+from ..base import DataHandler, DataHandlerError
 
 
 class JsonHandlerError(DataHandlerError):
@@ -38,6 +35,8 @@ class JsonHandler(DataHandler):
             JsonHandlerError: If the file doesn't exist, isn't a file, or if
                 initialization fails (e.g., DuckDB connection, view creation).
         """
+        super().__init__(file_path)
+
         self.file_path = self._validate_file_path(file_path)
         self._db_conn: Optional[duckdb.DuckDBPyConnection] = None
         self._view_name: str = self.DEFAULT_VIEW_NAME
@@ -48,9 +47,9 @@ class JsonHandler(DataHandler):
             self._connect_db()
             self._create_duckdb_view()
             self._load_metadata()
-
+            self.logger.info(f"JsonHandler initialized successfully for: {self.file_path}")
         except Exception as e:
-
+            self.logger.exception(f"Error during JsonHandler initialization for {self.file_path}")
             self.close()
             if isinstance(e, JsonHandlerError):
                 raise
@@ -67,9 +66,9 @@ class JsonHandler(DataHandler):
         """Establishes a connection to an in-memory DuckDB database."""
         try:
             self._db_conn = duckdb.connect(database=':memory:', read_only=False)
-
+            self.logger.debug("DuckDB in-memory connection established.")
         except Exception as e:
-
+            self.logger.exception("Failed to initialize DuckDB connection.")
             raise JsonHandlerError(f"DuckDB connection failed: {e}") from e
 
     def _create_duckdb_view(self):
@@ -83,9 +82,9 @@ class JsonHandler(DataHandler):
 
         try:
             self._db_conn.sql(load_query)
-
+            self.logger.debug(f"DuckDB view '{self._view_name}' created for file '{file_path_str}'.")
         except duckdb.Error as db_err:
-
+            self.logger.exception(f"DuckDB Error creating view '{self._view_name}' from '{file_path_str}': {db_err}")
             if "Could not open file" in str(db_err):
                 raise JsonHandlerError(
                     f"DuckDB could not open file: {file_path_str}. Check permissions or path. Error: {db_err}") from db_err
@@ -95,7 +94,7 @@ class JsonHandler(DataHandler):
             else:
                 raise JsonHandlerError(f"DuckDB failed create view for JSON file: {db_err}") from db_err
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error creating DuckDB view '{self._view_name}'.")
             raise JsonHandlerError(f"Failed to create DuckDB view: {e}") from e
 
     def _load_metadata(self):
@@ -108,27 +107,27 @@ class JsonHandler(DataHandler):
             describe_query = f"DESCRIBE \"{self._view_name}\";"
             schema_result = self._db_conn.sql(describe_query).fetchall()
             self._schema = self._parse_schema(schema_result)
-
+            self.logger.debug(f"Schema fetched for view '{self._view_name}': {len(self._schema)} columns.")
 
             # Fetch Row Count
             count_query = f"SELECT COUNT(*) FROM \"{self._view_name}\";"
             count_result = self._db_conn.sql(count_query).fetchone()
             self._row_count = count_result[0] if count_result else 0
-
+            self.logger.debug(f"Row count fetched for view '{self._view_name}': {self._row_count}")
 
         except duckdb.Error as db_err:
-
+            self.logger.exception(f"DuckDB Error fetching metadata for view '{self._view_name}': {db_err}")
             self._schema = None
             self._row_count = None
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error fetching metadata for view '{self._view_name}'")
             self._schema = None
             self._row_count = None
 
     def _parse_schema(self, describe_output: List[Tuple]) -> List[Dict[str, Any]]:
         """Parses the output of DuckDB's DESCRIBE query."""
         if not describe_output:
-
+            self.logger.warning(f"DESCRIBE query for view '{self._view_name}' returned no schema info.")
             return []
 
         parsed_schema = []
```
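The whole JSON pipeline above rests on a small DuckDB pattern: register the file behind a named view, then run `DESCRIBE` and `COUNT(*)` against that view. A standalone sketch of that pattern (the file name, view name, and the use of `read_json_auto` are illustrative; the diff does not show how the handler builds its `load_query`):

```python
import duckdb

# In-memory database, matching the handler's duckdb.connect(database=':memory:').
con = duckdb.connect(database=":memory:", read_only=False)

# read_json_auto infers a schema from the JSON file; wrapping it in a view
# lets later schema, count, preview, and stats queries reuse one definition.
con.sql("CREATE VIEW json_data AS SELECT * FROM read_json_auto('example.json')")

# DESCRIBE yields one row per column:
# (column_name, column_type, null, key, default, extra).
schema_rows = con.sql('DESCRIBE "json_data";').fetchall()

# Row count feeds the metadata summary.
row_count = con.sql('SELECT COUNT(*) FROM "json_data";').fetchone()[0]
```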
```diff
@@ -141,7 +140,7 @@ class JsonHandler(DataHandler):
                 is_nullable = null_str.upper() == 'YES'
                 parsed_schema.append({"name": name, "type": type_str, "nullable": is_nullable})
             else:
-
+                self.logger.warning(f"Unexpected format in DESCRIBE output row: {row}")
         return parsed_schema
 
     def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
@@ -153,7 +152,7 @@ class JsonHandler(DataHandler):
             or None if schema couldn't be fetched.
         """
         if self._schema is None:
-
+            self.logger.warning("Schema is unavailable. It might not have been fetched successfully.")
         return self._schema
 
     def get_metadata_summary(self) -> Dict[str, Any]:
@@ -184,7 +183,7 @@ class JsonHandler(DataHandler):
         try:
             summary["Size"] = f"{self.file_path.stat().st_size:,} bytes"
         except Exception as e:
-
+            self.logger.warning(f"Could not get file size for {self.file_path}: {e}")
             summary["Size"] = "N/A"
 
         return summary
@@ -202,13 +201,13 @@ class JsonHandler(DataHandler):
             error message if fetching fails.
         """
         if not self._db_conn:
-
+            self.logger.warning("Data preview unavailable: DuckDB connection is closed or uninitialized.")
             return pd.DataFrame({"error": ["DuckDB connection not available."]})
         if self._schema is None:
-
+            self.logger.warning("Data preview unavailable: Schema couldn't be determined.")
             return pd.DataFrame({"error": ["Schema not available, cannot fetch preview."]})
         if self._row_count == 0:
-
+            self.logger.info("Data preview: Source JSON view is empty.")
             # Return empty DataFrame with correct columns if possible
             if self._schema:
                 return pd.DataFrame(columns=[col['name'] for col in self._schema])
@@ -221,10 +220,10 @@ class JsonHandler(DataHandler):
             df = self._db_conn.sql(preview_query).df()
             return df
         except duckdb.Error as db_err:
-
+            self.logger.exception(f"DuckDB error getting data preview from '{self._view_name}': {db_err}")
             return pd.DataFrame({"error": [f"DuckDB error fetching preview: {db_err}"]})
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error getting data preview from '{self._view_name}'")
             return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]})
 
     def _get_column_info(self, column_name: str) -> Optional[Dict[str, Any]]:
@@ -274,7 +273,7 @@ class JsonHandler(DataHandler):
 
         if is_complex:
             # Use basic counts for complex types as SUMMARIZE is less informative
-
+            self.logger.debug(f"Calculating basic counts for complex type column: {column_name}")
             stats = self._get_basic_column_counts(safe_column_name)
             message = f"Only basic counts calculated for complex type '{col_type}'."
             # Attempt distinct count for complex types (can be slow/error-prone)
@@ -286,13 +285,13 @@ class JsonHandler(DataHandler):
                 else:
                     stats["Distinct Count"] = "N/A"  # Or 0 if appropriate
             except duckdb.Error as distinct_err:
-
+                self.logger.warning(
                     f"Could not calculate distinct count for complex column '{column_name}': {distinct_err}")
                 stats["Distinct Count"] = "Error"
 
         else:
             # Use SUMMARIZE for non-complex types
-
+            self.logger.debug(f"Using SUMMARIZE for simple type column: {column_name}")
             summarize_query = f"SUMMARIZE SELECT {safe_column_name} FROM \"{self._view_name}\";"
             summarize_df = self._db_conn.sql(summarize_query).df()
 
```
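`SUMMARIZE` is doing the heavy lifting for simple column types: one statement returns min/max, average, approximate distinct count, quantiles, and null percentage per column. A quick illustration (the column and file names are invented; the exact output fields follow whatever the installed DuckDB version emits):

```python
import duckdb

con = duckdb.connect()
con.sql("CREATE VIEW json_data AS SELECT * FROM read_json_auto('example.json')")

# One row of aggregates for the selected column; _format_summarize_stats
# presumably reshapes this row into the stats dict shown to the user.
summary_df = con.sql('SUMMARIZE SELECT price FROM "json_data";').df()
print(summary_df.iloc[0])
```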
```diff
@@ -304,11 +303,17 @@ class JsonHandler(DataHandler):
             # SUMMARIZE puts results in the first row
             stats = self._format_summarize_stats(summarize_df.iloc[0])
 
+            # Add histogram data for numeric columns
+            try:
+                self._add_histogram_data_if_numeric(stats, safe_column_name)
+            except Exception as hist_e:
+                self.logger.warning(f"Failed to add histogram data for {column_name}: {hist_e}")
+
         except duckdb.Error as db_err:
-
+            self.logger.exception(f"DuckDB Error calculating statistics for column '{column_name}': {db_err}")
             error_msg = f"DuckDB calculation failed: {db_err}"
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error calculating statistics for column '{column_name}'")
             error_msg = f"Calculation failed unexpectedly: {e}"
 
         return self._create_stats_result(
@@ -351,7 +356,7 @@ class JsonHandler(DataHandler):
             stats["Null Percentage"] = "Error"
 
         except duckdb.Error as db_err:
-
+            self.logger.warning(f"Failed to get basic counts for {safe_column_name}: {db_err}")
             stats["Counts Error"] = str(db_err)
         return stats
 
@@ -404,6 +409,37 @@ class JsonHandler(DataHandler):
 
         return stats
 
+    def _add_histogram_data_if_numeric(self, stats: Dict[str, Any], safe_column_name: str) -> None:
+        """Add histogram data for numeric columns by sampling from DuckDB."""
+        # Check if this looks like numeric data (has Mean, Min, Max)
+        if not all(key in stats for key in ["Mean", "Min", "Max"]):
+            return
+
+        try:
+            # Sample data for histogram (limit to 10k samples for performance)
+            sample_query = f"""
+                SELECT {safe_column_name}
+                FROM "{self._view_name}"
+                WHERE {safe_column_name} IS NOT NULL
+                USING SAMPLE 10000
+            """
+
+            sample_df = self._db_conn.sql(sample_query).df()
+
+            if not sample_df.empty and len(sample_df) > 10:
+                # Extract the column data
+                column_data = sample_df.iloc[:, 0].tolist()
+
+                # Filter out any remaining nulls
+                clean_data = [val for val in column_data if val is not None]
+
+                if len(clean_data) > 10:
+                    stats["_histogram_data"] = clean_data
+                    stats["_data_type"] = "numeric"
+
+        except Exception as e:
+            self.logger.warning(f"Failed to sample data for histogram: {e}")
+
     def _create_stats_result(
             self,
             column_name: str,
```
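Two details of `_add_histogram_data_if_numeric` above are easy to miss. `USING SAMPLE 10000` is DuckDB's built-in sampling clause, so the 10k cap is enforced inside the database rather than by pulling the full column into pandas first. And the underscore-prefixed `_histogram_data` / `_data_type` keys look like a private channel to the rendering layer (presumably the new `views/utils/visualization.py` listed above): raw sampled values travel alongside the displayable stats without being intended as rows in the stats table.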
```diff
@@ -430,11 +466,11 @@ class JsonHandler(DataHandler):
         if self._db_conn:
             try:
                 self._db_conn.close()
-
+                self.logger.info(f"DuckDB connection closed for {self.file_path}.")
                 self._db_conn = None
             except Exception as e:
                 # Log error but don't raise during close typically
-
+                self.logger.error(f"Error closing DuckDB connection for {self.file_path}: {e}")
                 self._db_conn = None  # Assume closed even if error occurred
 
     def __enter__(self):
```
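Both handlers now lean on the shared base in `parqv/data_sources/base/handler.py` (renamed from `handlers/base_handler.py` per the file list) for `self.logger` and the `super().__init__(file_path)` call. The base class itself is not shown in this diff; a hypothetical sketch of the relevant fragment, for orientation only (everything below except the two imported names is a guess):

```python
import logging
from pathlib import Path


class DataHandlerError(Exception):
    """Shared error type that JsonHandlerError/ParquetHandlerError extend."""


class DataHandler:
    """Hypothetical fragment: what a base __init__ would need to provide
    for the subclasses' self.logger.* calls to work."""

    def __init__(self, file_path: Path):
        self.file_path = file_path
        # A per-class logger replaces the old module-level
        # `log = logging.getLogger(__name__)` that this release removes.
        self.logger = logging.getLogger(self.__class__.__name__)
```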
parqv/{handlers → data_sources/formats}/parquet.py
CHANGED
```diff
@@ -1,4 +1,3 @@
-import logging
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Optional, Union
 
@@ -7,9 +6,7 @@ import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
 
-from
-
-log = logging.getLogger(__name__)
+from ..base import DataHandler, DataHandlerError
 
 
 class ParquetHandlerError(DataHandlerError):
```
```diff
@@ -49,17 +46,17 @@ class ParquetHandler(DataHandler):
             self.pq_file = pq.ParquetFile(self.file_path)
             self.schema = self.pq_file.schema_arrow
             self.metadata = self.pq_file.metadata
-
+            self.logger.info(f"Successfully initialized ParquetHandler for: {self.file_path.name}")
 
         except FileNotFoundError as fnf_e:
-
+            self.logger.error(f"File not found during ParquetHandler initialization: {fnf_e}")
             raise ParquetHandlerError(str(fnf_e)) from fnf_e
         except pa.lib.ArrowIOError as arrow_io_e:
-
+            self.logger.error(f"Arrow IO Error initializing ParquetHandler for {self.file_path.name}: {arrow_io_e}")
             raise ParquetHandlerError(
                 f"Failed to open Parquet file '{self.file_path.name}': {arrow_io_e}") from arrow_io_e
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error initializing ParquetHandler for {self.file_path.name}")
             self.close()
             raise ParquetHandlerError(f"Failed to initialize Parquet handler '{self.file_path.name}': {e}") from e
 
@@ -71,10 +68,10 @@ class ParquetHandler(DataHandler):
             # ParquetFile might not have a close method depending on source, check first
             if hasattr(self.pq_file, 'close'):
                 self.pq_file.close()
-
+                self.logger.info(f"Closed Parquet file: {self.file_path.name}")
         except Exception as e:
             # Log error during close but don't raise, as we're cleaning up
-
+            self.logger.warning(f"Exception while closing Parquet file {self.file_path.name}: {e}")
         finally:
             self.pq_file = None
             self.schema = None
@@ -102,7 +99,7 @@ class ParquetHandler(DataHandler):
             A dictionary containing key metadata attributes, or an error dictionary.
         """
         if not self.metadata or not self.schema:
-
+            self.logger.warning(f"Metadata or schema not available for summary: {self.file_path.name}")
             return {"error": "Metadata or schema not available"}
 
         try:
@@ -126,7 +123,7 @@ class ParquetHandler(DataHandler):
 
             return summary
         except Exception as e:
-
+            self.logger.exception(f"Error generating metadata summary for {self.file_path.name}")
             return {"error": f"Error getting metadata summary: {e}"}
 
     def get_schema_data(self) -> Optional[List[Dict[str, Any]]]:
@@ -138,7 +135,7 @@ class ParquetHandler(DataHandler):
             or None if the schema is unavailable.
         """
         if not self.schema:
-
+            self.logger.warning(f"Schema is not available for get_schema_data: {self.file_path.name}")
             return None
 
         schema_list = []
@@ -151,7 +148,7 @@ class ParquetHandler(DataHandler):
                     "nullable": field.nullable
                 })
             except Exception as e:
-
+                self.logger.error(f"Error processing field '{field.name}' for schema data: {e}", exc_info=True)
                 schema_list.append({
                     "name": field.name,
                     "type": f"[Error: {e}]",
@@ -172,11 +169,11 @@ class ParquetHandler(DataHandler):
         Returns a DataFrame with an 'error' column on failure.
         """
         if not self.pq_file:
-
+            self.logger.warning(f"ParquetFile handler not available for data preview: {self.file_path.name}")
             return pd.DataFrame({"error": ["Parquet handler not initialized or closed."]})
 
         if self.metadata and self.metadata.num_rows == 0:
-
+            self.logger.info(f"Parquet file is empty based on metadata: {self.file_path.name}")
             if self.schema:
                 return pd.DataFrame(columns=self.schema.names)
             else:
@@ -206,10 +203,10 @@ class ParquetHandler(DataHandler):
             if not batches:
                 # Check if file might have rows but reading yielded nothing
                 if self.metadata and self.metadata.num_rows > 0:
-
+                    self.logger.warning(
                         f"No batches read for preview, though metadata indicates {self.metadata.num_rows} rows: {self.file_path.name}")
                 else:
-
+                    self.logger.info(f"No data read for preview (file likely empty): {self.file_path.name}")
                 # Return empty DF with columns if schema available
                 if self.schema:
                     return pd.DataFrame(columns=self.schema.names)
@@ -223,11 +220,11 @@ class ParquetHandler(DataHandler):
                 self_destruct=True,
                 types_mapper=pd.ArrowDtype
             )
-
+            self.logger.info(f"Generated preview of {len(df)} rows for {self.file_path.name}")
             return df
 
         except Exception as e:
-
+            self.logger.exception(f"Error generating data preview from Parquet file: {self.file_path.name}")
             return pd.DataFrame({"error": [f"Failed to fetch preview: {e}"]})
 
     def get_column_stats(self, column_name: str) -> Dict[str, Any]:
@@ -242,13 +239,13 @@ class ParquetHandler(DataHandler):
             and potential error or message keys.
         """
         if not self.pq_file or not self.schema:
-
+            self.logger.warning(f"Parquet file/schema unavailable for column stats: {self.file_path.name}")
             return self._create_stats_result(column_name, None, error="File or schema not available")
 
         try:
             field = self.schema.field(column_name)
         except KeyError:
-
+            self.logger.warning(f"Column '{column_name}' not found in schema: {self.file_path.name}")
             return self._create_stats_result(column_name, None, error=f"Column '{column_name}' not found in schema")
 
         calculated_stats: Dict[str, Any] = {}
@@ -261,7 +258,7 @@ class ParquetHandler(DataHandler):
             # Data Reading
             table = self.pq_file.read(columns=[column_name])
             column_data = table.column(0)
-
+            self.logger.debug(
                 f"Finished reading column '{column_name}'. Rows: {len(column_data)}, Nulls: {column_data.null_count}")
 
             # Basic Counts
```
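The read-then-count flow above is plain PyArrow, and it is easy to reproduce outside the handler. A minimal sketch (the path and column name are illustrative):

```python
import pyarrow.compute as pc
import pyarrow.parquet as pq

pf = pq.ParquetFile("example.parquet")  # illustrative path
table = pf.read(columns=["price"])      # read a single column
column_data = table.column(0)           # a pyarrow ChunkedArray

total_count = len(column_data)
null_count = column_data.null_count
valid_count = total_count - null_count

# The same aggregation kernels the handler routes through _safe_compute:
minimum = pc.min(column_data).as_py()
maximum = pc.max(column_data).as_py()
mean = pc.mean(column_data).as_py()
```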
```diff
@@ -274,14 +271,14 @@ class ParquetHandler(DataHandler):
                 calculated_stats["Null Count"] = f"{null_count:,}"
                 calculated_stats["Null Percentage"] = f"{(null_count / total_count * 100):.2f}%"
             else:
-
+                self.logger.info(f"Column '{column_name}' read resulted in 0 rows.")
                 message = "Column is empty (0 rows)."
                 valid_count = 0  # Ensure valid_count is 0 for later checks
 
             # Type-Specific Calculations
             if valid_count > 0:
                 col_type = field.type
-
+                self.logger.debug(f"Calculating stats for type: {self._format_pyarrow_type(col_type)}")
                 try:
                     if pa.types.is_floating(col_type) or pa.types.is_integer(col_type):
                         calculated_stats.update(self._calculate_numeric_stats(column_data))
@@ -300,11 +297,12 @@ class ParquetHandler(DataHandler):
                         calculated_stats.update(self._calculate_complex_type_stats(column_data, col_type))
                         message = f"Basic aggregate stats (min/max/mean) not applicable for complex type '{self._format_pyarrow_type(col_type)}'."
                     else:
-
+                        self.logger.warning(f"Statistics calculation not fully implemented for type: {col_type}")
                         message = f"Statistics calculation not implemented for type '{self._format_pyarrow_type(col_type)}'."
 
                 except Exception as calc_err:
-
+                    self.logger.exception(
+                        f"Error during type-specific calculation for column '{column_name}': {calc_err}")
                     error_msg = f"Calculation error for type {field.type}: {calc_err}"
                     calculated_stats["Calculation Error"] = str(calc_err)  # Add specific error key
 
@@ -315,10 +313,10 @@ class ParquetHandler(DataHandler):
             metadata_stats, metadata_stats_error = self._get_stats_from_metadata(column_name)
 
         except pa.lib.ArrowException as arrow_e:
-
+            self.logger.exception(f"Arrow error during stats processing for column '{column_name}': {arrow_e}")
             error_msg = f"Arrow processing error: {arrow_e}"
         except Exception as e:
-
+            self.logger.exception(f"Unexpected error during stats calculation for column '{column_name}'")
             error_msg = f"Calculation failed unexpectedly: {e}"
 
         return self._create_stats_result(
@@ -331,7 +329,7 @@ class ParquetHandler(DataHandler):
             try:
                 return value.decode('utf-8', errors='replace')
             except Exception as e:
-
+                self.logger.warning(f"Could not decode metadata bytes: {e}. Value: {value!r}")
                 return f"[Decode Error: {value!r}]"
         return str(value) if value is not None else None
 
@@ -348,7 +346,7 @@ class ParquetHandler(DataHandler):
                 decoded_kv[key_str] = val_str
             return decoded_kv
         except Exception as e:
-
+            self.logger.warning(f"Could not decode key-value metadata: {e}")
             return {"error": f"Error decoding key-value metadata: {e}"}
 
     def _format_pyarrow_type(self, field_type: pa.DataType) -> str:
@@ -425,6 +423,32 @@ class ParquetHandler(DataHandler):
         variance_val, err_var = self._safe_compute(pc.variance, column_data, ddof=1)
         stats["Variance"] = f"{variance_val:.4f}" if variance_val is not None and err_var is None else (
             err_var or "N/A")
+        distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
+        stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")
+
+        # Add histogram data for visualization
+        try:
+            # Convert to Python list for histogram calculation (sample if too large)
+            data_length = len(column_data)
+            sample_size = min(10000, data_length)  # Limit to 10k samples for performance
+
+            if data_length > sample_size:
+                # Sample the data
+                import random
+                indices = sorted(random.sample(range(data_length), sample_size))
+                sampled_data = [column_data[i].as_py() for i in indices]
+            else:
+                sampled_data = column_data.to_pylist()
+
+            # Filter out None values
+            clean_data = [val for val in sampled_data if val is not None]
+
+            if len(clean_data) > 10:  # Only create histogram if we have enough data
+                stats["_histogram_data"] = clean_data
+                stats["_data_type"] = "numeric"
+
+        except Exception as e:
+            self.logger.warning(f"Failed to prepare histogram data: {e}")
 
         return stats
 
```
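As in the JSON handler, `_histogram_data` carries raw sampled values; turning them into bar heights is left to the view layer (presumably the new `views/utils/visualization.py`, which this diff lists but does not show). A sketch of the kind of fixed-width binning a renderer could apply (the 20-bin default is an assumption):

```python
from typing import List, Tuple


def bin_values(values: List[float], bins: int = 20) -> List[Tuple[float, int]]:
    """Fixed-width histogram: (bin_start, count) pairs over min..max."""
    lo, hi = min(values), max(values)
    if lo == hi:
        return [(lo, len(values))]  # all values identical: a single bar
    width = (hi - lo) / bins
    counts = [0] * bins
    for v in values:
        idx = min(int((v - lo) / width), bins - 1)  # clamp max into last bin
        counts[idx] += 1
    return [(lo + i * width, c) for i, c in enumerate(counts)]
```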
```diff
@@ -438,23 +462,10 @@ class ParquetHandler(DataHandler):
         return stats
 
     def _calculate_string_binary_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
-        """Calculates distinct count
+        """Calculates distinct count for string/binary columns."""
         stats: Dict[str, Any] = {}
         distinct_val, err = self._safe_compute(pc.count_distinct, column_data)
         stats["Distinct Count"] = f"{distinct_val:,}" if distinct_val is not None and err is None else (err or "N/A")
-
-        if pa.types.is_string(column_data.type) or pa.types.is_large_string(column_data.type):
-            lengths, err_len = self._safe_compute(pc.binary_length, column_data)
-            if err_len is None and lengths is not None:
-                min_len, err_min = self._safe_compute(pc.min, lengths)
-                stats["Min Length"] = min_len if err_min is None else err_min
-                max_len, err_max = self._safe_compute(pc.max, lengths)
-                stats["Max Length"] = max_len if err_max is None else err_max
-                avg_len, err_avg = self._safe_compute(pc.mean, lengths)
-                stats["Avg Length"] = f"{avg_len:.2f}" if avg_len is not None and err_avg is None else (
-                    err_avg or "N/A")
-            else:
-                stats.update({"Min Length": "Error", "Max Length": "Error", "Avg Length": "Error"})
         return stats
 
     def _calculate_boolean_stats(self, column_data: pa.ChunkedArray) -> Dict[str, Any]:
```
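`_safe_compute` predates this diff and is not shown, but its call sites pin down the contract: it runs a `pyarrow.compute` kernel and returns a `(value, error_message)` pair instead of raising, so a failed kernel degrades to an "Error" cell rather than a crashed stats panel. A hypothetical reconstruction consistent with those call sites (the real implementation may differ):

```python
from typing import Any, Callable, Optional, Tuple

import pyarrow as pa


def safe_compute(func: Callable, data, **kwargs) -> Tuple[Optional[Any], Optional[str]]:
    # Sketch of the method's likely shape; in parqv it is self._safe_compute.
    try:
        result = func(data, **kwargs)
        # Aggregates (pc.min, pc.mean, ...) return Scalars; element-wise
        # kernels (pc.binary_length) return arrays that callers reuse as-is.
        return (result.as_py() if isinstance(result, pa.Scalar) else result), None
    except Exception as e:
        return None, f"Error: {e}"
```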
```diff
@@ -480,7 +491,7 @@ class ParquetHandler(DataHandler):
             if 'False' not in stats["Value Counts"]: stats["Value Counts"]['False'] = "0"
 
         except Exception as vc_e:
-
+            self.logger.warning(f"Boolean value count calculation error: {vc_e}", exc_info=True)
             stats["Value Counts"] = "Error calculating"
         return stats
 
@@ -490,7 +501,7 @@ class ParquetHandler(DataHandler):
         try:
             unwrapped_data = column_data.dictionary_decode()
             value_type = col_type.value_type
-
+            self.logger.debug(f"Calculating dictionary stats based on value type: {value_type}")
 
             # Delegate calculation based on the *value* type
             if pa.types.is_floating(value_type) or pa.types.is_integer(value_type):
@@ -511,10 +522,10 @@ class ParquetHandler(DataHandler):
                 err or "N/A")
 
         except pa.lib.ArrowException as arrow_decode_err:
-
+            self.logger.warning(f"Arrow error decoding dictionary type for stats: {arrow_decode_err}")
             stats["Dictionary Error"] = f"Decode Error: {arrow_decode_err}"
         except Exception as dict_e:
-
+            self.logger.warning(f"Could not process dictionary type for stats: {dict_e}")
             stats["Dictionary Error"] = f"Processing Error: {dict_e}"
         return stats
 
@@ -545,17 +556,17 @@ class ParquetHandler(DataHandler):
                 rg_meta = self.metadata.row_group(i)
                 metadata_stats[group_key] = self._extract_stats_for_single_group(rg_meta, col_index)
             except IndexError:
-
+                self.logger.warning(f"Column index {col_index} out of bounds for row group {i}.")
                 metadata_stats[group_key] = "Index Error"
             except Exception as e:
-
+                self.logger.warning(f"Error processing metadata stats for RG {i}, column '{column_name}': {e}")
                 metadata_stats[group_key] = f"Read Error: {e}"
 
         except KeyError:
-
+            self.logger.warning(f"Column '{column_name}' not found in schema for metadata stats.")
             error_str = f"Column '{column_name}' not found in schema"
         except Exception as e:
-
+            self.logger.exception(f"Failed to get metadata statistics structure for column '{column_name}'.")
             error_str = f"Error accessing metadata structure: {e}"
 
         return metadata_stats, error_str
@@ -587,10 +598,10 @@ class ParquetHandler(DataHandler):
                     col_chunk_meta.total_uncompressed_size is not None),
             }
         except IndexError:
-
+            self.logger.warning(f"Column index {col_index} out of bounds for row group {rg_meta.num_columns} columns.")
             return "Index Error"
         except Exception as e:
-
+            self.logger.error(f"Error reading column chunk metadata stats for index {col_index}: {e}", exc_info=True)
             return f"Metadata Read Error: {e}"
 
     def _create_stats_result(
@@ -613,7 +624,7 @@ class ParquetHandler(DataHandler):
             col_type_str = self._format_pyarrow_type(field.type)
             col_nullable = field.nullable
         except Exception as e:
-
+            self.logger.error(f"Error formatting type for column {column_name}: {e}")
             col_type_str = f"[Error formatting: {field.type}]"
             col_nullable = None
 
```
parqv/views/__init__.py
CHANGED
```diff
@@ -0,0 +1,38 @@
+"""
+Views package for parqv application.
+
+This package contains all UI views and their supporting components and utilities.
+"""
+
+# Main views
+from .metadata_view import MetadataView
+from .data_view import DataView
+from .schema_view import SchemaView
+
+# Base classes
+from .base import BaseView
+
+# Components (optional, for advanced usage)
+from .components import ErrorDisplay, LoadingDisplay, EnhancedDataTable
+
+# Utilities (optional, for advanced usage)
+from .utils import format_metadata_for_display, format_stats_for_display
+
+__all__ = [
+    # Main views - these are the primary exports
+    "MetadataView",
+    "DataView",
+    "SchemaView",
+
+    # Base class - for extending functionality
+    "BaseView",
+
+    # Components - for custom view development
+    "ErrorDisplay",
+    "LoadingDisplay",
+    "EnhancedDataTable",
+
+    # Utilities - for data formatting
+    "format_metadata_for_display",
+    "format_stats_for_display",
+]
```
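With these re-exports in place, consumers can import everything views-related from the package root; e.g.:

```python
# Hypothetical consumer code; parqv's own wiring may import differently.
from parqv.views import BaseView, DataView, MetadataView, SchemaView
from parqv.views import format_stats_for_display
```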