sqlshell 0.2.3-py3-none-any.whl → 0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sqlshell might be problematic.
- sqlshell/__init__.py +34 -4
- sqlshell/db/__init__.py +2 -1
- sqlshell/db/database_manager.py +336 -23
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/main.py +570 -140
- sqlshell/query_tab.py +592 -7
- sqlshell/ui/filter_header.py +22 -1
- sqlshell/utils/profile_column.py +1586 -170
- sqlshell/utils/profile_foreign_keys.py +103 -11
- sqlshell/utils/profile_ohe.py +631 -0
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/METADATA +126 -7
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/RECORD +17 -13
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/WHEEL +1 -1
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/entry_points.txt +0 -0
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/top_level.txt +0 -0
sqlshell/__init__.py
CHANGED
@@ -5,10 +5,40 @@ SQLShell - A powerful SQL shell with GUI interface for data analysis
 __version__ = "0.2.3"
 __author__ = "SQLShell Team"
 
-from sqlshell.main import main
+from sqlshell.main import main, SQLShell
+from PyQt6.QtWidgets import QApplication
+import sys
 
-def start():
-    """Start the SQLShell application.
-
+def start(database_path=None):
+    """Start the SQLShell application.
+
+    Args:
+        database_path (str, optional): Path to a database file to open. If provided,
+            SQLShell will automatically open this database on startup.
+    """
+    app = QApplication(sys.argv)
+    window = SQLShell()
+
+    if database_path:
+        try:
+            # Open the database
+            window.db_manager.open_database(database_path, load_all_tables=True)
+
+            # Update UI with tables from the database
+            for table_name, source in window.db_manager.loaded_tables.items():
+                if source == 'database':
+                    window.tables_list.add_table_item(table_name, "database")
+
+            # Update the completer with table and column names
+            window.update_completer()
+
+            # Update status bar
+            window.statusBar().showMessage(f"Connected to database: {database_path}")
+            window.db_info_label.setText(window.db_manager.get_connection_info())
+        except Exception as e:
+            print(f"Error opening database: {e}")
+
+    window.show()
+    sys.exit(app.exec())
 
 # SQLShell package initialization
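In practice, the widened start() signature lets callers launch the GUI against an existing database in one line. A minimal sketch, assuming a local DuckDB file (the path is illustrative):

import sqlshell

# Launches the Qt application; if a path is given, SQLShell opens it,
# lists its tables in the sidebar, and refreshes autocompletion on startup.
sqlshell.start(database_path="sales.duckdb")

# Equivalent to the 0.2.3 behaviour: open with no database attached.
# sqlshell.start()

Note that start() ends with sys.exit(app.exec()), so the call blocks until the main window is closed and then terminates the interpreter.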
sqlshell/db/__init__.py
CHANGED
sqlshell/db/database_manager.py
CHANGED
@@ -219,7 +219,7 @@ class DatabaseManager:
         Load data from a file into the database.
 
         Args:
-            file_path: Path to the data file (Excel, CSV, Parquet, Delta)
+            file_path: Path to the data file (Excel, CSV, TXT, Parquet, Delta)
 
         Returns:
             Tuple of (table_name, DataFrame) for the loaded data
@@ -240,8 +240,29 @@ class DatabaseManager:
                 # Load the Delta table
                 import deltalake
                 delta_table = deltalake.DeltaTable(file_path)
-
+
+                # Get the schema to identify decimal columns
+                schema = delta_table.schema()
+                decimal_columns = []
+
+                # Identify decimal columns from schema
+                for field in schema.fields:
+                    # Use string representation to check for decimal
+                    if 'decimal' in str(field.type).lower():
+                        decimal_columns.append(field.name)
+
+                # Read the data
                 df = delta_table.to_pandas()
+
+                # Try to convert decimal columns to float64, warn if not possible
+                for col in decimal_columns:
+                    if col in df.columns:
+                        try:
+                            df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
+                            if df[col].isna().any():
+                                print(f"Warning: Some values in column '{col}' could not be converted to float64 and are set as NaN.")
+                        except Exception as e:
+                            print(f"Warning: Could not convert column '{col}' to float64: {e}")
             except Exception as e:
                 raise ValueError(f"Error loading Delta table: {str(e)}")
         elif file_path.endswith(('.xlsx', '.xls')):
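The decimal handling above matters because deltalake hands decimal-typed columns to pandas as Python decimal.Decimal objects, which many numeric operations handle poorly. A minimal sketch of the same conversion outside SQLShell, assuming a plain pandas DataFrame standing in for the output of delta_table.to_pandas():

from decimal import Decimal

import pandas as pd

# Hypothetical frame: the 'price' column arrives as object-dtype Decimal values.
df = pd.DataFrame({"price": [Decimal("19.99"), Decimal("5.50"), None]})

# Same approach as the diff: coerce to numeric, then force float64.
df["price"] = pd.to_numeric(df["price"], errors="coerce").astype("float64")

print(df.dtypes)   # price    float64
print(df)          # NaN where the value could not be converted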
@@ -267,37 +288,208 @@ class DatabaseManager:
             except Exception:
                 # Fallback to standard reading method
                 df = pd.read_excel(file_path)
-        elif file_path.endswith('.csv'):
-            # For CSV files,
+        elif file_path.endswith(('.csv', '.txt')):
+            # For CSV and TXT files, detect separator and use chunking for large files
             try:
                 # Check if it's a large file
                 file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
 
-
-
-
+                # Try multiple encodings if needed
+                encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+                # Detect the separator automatically
+                def detect_separator(sample_data):
+                    # Common separators to check
+                    separators = [',', ';', '\t']
+                    separator_scores = {}
+
+                    # Split into lines and analyze
+                    lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                    if not lines:
+                        return ','  # Default if no content
+
+                    # Check for quoted content with separators
+                    has_quotes = '"' in sample_data or "'" in sample_data
 
-                #
-
+                    # If we have quoted content, use a different approach
+                    if has_quotes:
+                        for sep in separators:
+                            # Look for patterns like "value";
+                            pattern_count = 0
+                            for line in lines:
+                                # Count occurrences of quote + separator
+                                double_quote_pattern = f'"{sep}'
+                                single_quote_pattern = f"'{sep}"
+                                pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                            # If we found clear quote+separator patterns, this is likely our separator
+                            if pattern_count > 0:
+                                separator_scores[sep] = pattern_count
 
-                #
-
-
-
-
-
+                    # Standard approach based on consistent column counts
+                    if not separator_scores:
+                        for sep in separators:
+                            # Count consistent occurrences across lines
+                            counts = [line.count(sep) for line in lines]
+                            if counts and all(c > 0 for c in counts):
+                                # Calculate consistency score: higher if all counts are the same
+                                consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                                # Score is average count * consistency
+                                separator_scores[sep] = sum(counts) / len(counts) * consistency
 
-
+                    # Choose the separator with the highest score
+                    if separator_scores:
+                        return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                    # Default to comma if we couldn't determine
+                    return ','
+
+                # First, sample the file to detect separator
+                with open(file_path, 'rb') as f:
+                    # Read first few KB to detect encoding and separator
+                    raw_sample = f.read(4096)
+
+                # Try to decode with various encodings
+                sample_text = None
+                detected_encoding = None
+
+                for encoding in encodings_to_try:
+                    try:
+                        sample_text = raw_sample.decode(encoding)
+                        detected_encoding = encoding
+                        break
+                    except UnicodeDecodeError:
+                        continue
+
+                if not sample_text:
+                    raise ValueError("Could not decode file with any of the attempted encodings")
+
+                # Detect separator from the sample
+                separator = detect_separator(sample_text)
+
+                # Determine quote character (default to double quote)
+                quotechar = '"'
+                if sample_text.count("'") > sample_text.count('"'):
+                    quotechar = "'"
+
+                if file_size > 50:  # If file is larger than 50MB
+                    # Read the first chunk to get column types
+                    try:
+                        df_preview = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            nrows=1000,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        )
+
+                        # Use optimized dtypes for better memory usage
+                        dtypes = {col: df_preview[col].dtype for col in df_preview.columns}
+
+                        # Read again with chunk processing, combining up to 100k rows
+                        chunks = []
+                        for chunk in pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            dtype=dtypes,
+                            chunksize=10000,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        ):
+                            chunks.append(chunk)
+                            if len(chunks) * 10000 >= 100000:  # Cap at 100k rows
+                                break
+
+                        df = pd.concat(chunks, ignore_index=True)
+                    except pd.errors.ParserError as e:
+                        # If parsing fails, try again with error recovery options
+                        print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                        # Try with Python engine which is more flexible
+                        try:
+                            # First try with pandas >= 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',  # Always use python engine for error recovery
+                                quotechar=quotechar,
+                                doublequote=True,
+                                on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+                        except TypeError:
+                            # Fall back to pandas < 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',
+                                quotechar=quotechar,
+                                doublequote=True,
+                                error_bad_lines=False,  # Old parameter
+                                warn_bad_lines=True,  # Old parameter
+                                na_values=[''],
+                                keep_default_na=True
+                            )
                 else:
                     # For smaller files, read everything at once
-
-
-
-
+                    try:
+                        df = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        )
+                    except pd.errors.ParserError as e:
+                        # If parsing fails, try again with error recovery options
+                        print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                        # Try with Python engine which is more flexible
+                        try:
+                            # First try with pandas >= 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',  # Always use python engine for error recovery
+                                quotechar=quotechar,
+                                doublequote=True,
+                                on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+                        except TypeError:
+                            # Fall back to pandas < 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',
+                                quotechar=quotechar,
+                                doublequote=True,
+                                error_bad_lines=False,  # Old parameter
+                                warn_bad_lines=True,  # Old parameter
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+            except Exception as e:
+                # Log the error for debugging
+                import traceback
+                print(f"Error loading CSV/TXT file: {str(e)}")
+                print(traceback.format_exc())
+                raise ValueError(f"Error loading CSV/TXT file: {str(e)}")
         elif file_path.endswith('.parquet'):
             df = pd.read_parquet(file_path)
         else:
-            raise ValueError("Unsupported file format")
+            raise ValueError("Unsupported file format. Supported formats: .xlsx, .xls, .csv, .txt, .parquet, and Delta tables.")
 
         # Generate table name from file name
         base_name = os.path.splitext(os.path.basename(file_path))[0]
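The separator detection added here is a heuristic over a 4 KB sample: quote-plus-separator patterns win first, otherwise the separator with the most consistent per-line count does. A condensed standalone re-implementation of that scoring idea for illustration (the shipped version additionally handles quoted fields):

def detect_separator(sample_data: str) -> str:
    # Score candidate separators by average per-line count, weighted by consistency.
    separators = [',', ';', '\t']
    scores = {}
    lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
    if not lines:
        return ','
    for sep in separators:
        counts = [line.count(sep) for line in lines]
        if counts and all(c > 0 for c in counts):
            consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
            scores[sep] = sum(counts) / len(counts) * consistency
    return max(scores, key=scores.get) if scores else ','

sample = "id;name;amount\n1;Alice;10,5\n2;Bob;20,0\n"
print(detect_separator(sample))  # ';' — commas appear, but not on every line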
@@ -448,8 +640,128 @@ class DatabaseManager:
                 df = delta_table.to_pandas()
             elif file_path.endswith(('.xlsx', '.xls')):
                 df = pd.read_excel(file_path)
-            elif file_path.endswith('.csv'):
-
+            elif file_path.endswith(('.csv', '.txt')):
+                # Try multiple encodings for CSV/TXT files
+                encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+                # Detect the separator automatically
+                def detect_separator(sample_data):
+                    # Common separators to check
+                    separators = [',', ';', '\t']
+                    separator_scores = {}
+
+                    # Split into lines and analyze
+                    lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                    if not lines:
+                        return ','  # Default if no content
+
+                    # Check for quoted content with separators
+                    has_quotes = '"' in sample_data or "'" in sample_data
+
+                    # If we have quoted content, use a different approach
+                    if has_quotes:
+                        for sep in separators:
+                            # Look for patterns like "value";
+                            pattern_count = 0
+                            for line in lines:
+                                # Count occurrences of quote + separator
+                                double_quote_pattern = f'"{sep}'
+                                single_quote_pattern = f"'{sep}"
+                                pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                            # If we found clear quote+separator patterns, this is likely our separator
+                            if pattern_count > 0:
+                                separator_scores[sep] = pattern_count
+
+                    # Standard approach based on consistent column counts
+                    if not separator_scores:
+                        for sep in separators:
+                            # Count consistent occurrences across lines
+                            counts = [line.count(sep) for line in lines]
+                            if counts and all(c > 0 for c in counts):
+                                # Calculate consistency score: higher if all counts are the same
+                                consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                                # Score is average count * consistency
+                                separator_scores[sep] = sum(counts) / len(counts) * consistency
+
+                    # Choose the separator with the highest score
+                    if separator_scores:
+                        return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                    # Default to comma if we couldn't determine
+                    return ','
+
+                # First, sample the file to detect separator and encoding
+                with open(file_path, 'rb') as f:
+                    # Read first few KB to detect encoding and separator
+                    raw_sample = f.read(4096)
+
+                # Try to decode with various encodings
+                sample_text = None
+                detected_encoding = None
+
+                for encoding in encodings_to_try:
+                    try:
+                        sample_text = raw_sample.decode(encoding)
+                        detected_encoding = encoding
+                        break
+                    except UnicodeDecodeError:
+                        # If this encoding fails, try the next one
+                        continue
+
+                if not sample_text:
+                    raise ValueError("Could not decode file with any of the attempted encodings")
+
+                # Detect separator from the sample
+                separator = detect_separator(sample_text)
+
+                # Determine quote character (default to double quote)
+                quotechar = '"'
+                if sample_text.count("'") > sample_text.count('"'):
+                    quotechar = "'"
+
+                # Read with detected parameters
+                try:
+                    df = pd.read_csv(
+                        file_path,
+                        sep=separator,
+                        encoding=detected_encoding,
+                        engine='python' if separator != ',' else 'c',
+                        quotechar=quotechar,
+                        doublequote=True
+                    )
+                except pd.errors.ParserError as e:
+                    # If parsing fails, try again with error recovery options
+                    print(f"Initial parsing failed on reload: {str(e)}. Trying with error recovery options...")
+
+                    # Try with Python engine which is more flexible
+                    try:
+                        # First try with pandas >= 1.3 parameters
+                        df = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            encoding=detected_encoding,
+                            engine='python',  # Always use python engine for error recovery
+                            quotechar=quotechar,
+                            doublequote=True,
+                            on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                            na_values=[''],
+                            keep_default_na=True
+                        )
+                    except TypeError:
+                        # Fall back to pandas < 1.3 parameters
+                        df = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            encoding=detected_encoding,
+                            engine='python',
+                            quotechar=quotechar,
+                            doublequote=True,
+                            error_bad_lines=False,  # Old parameter
+                            warn_bad_lines=True,  # Old parameter
+                            na_values=[''],
+                            keep_default_na=True
+                        )
             elif file_path.endswith('.parquet'):
                 df = pd.read_parquet(file_path)
             else:
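The reload path repeats the same encoding probe: decode a small byte sample with each candidate encoding and keep the first that succeeds. A standalone sketch of that probe (the file name is illustrative):

# Probe a byte sample against several encodings and keep the first that decodes.
encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']

with open("export.txt", "rb") as f:    # illustrative path
    raw_sample = f.read(4096)

detected_encoding = None
for encoding in encodings_to_try:
    try:
        raw_sample.decode(encoding)
        detected_encoding = encoding
        break
    except UnicodeDecodeError:
        continue

print(detected_encoding or "no candidate encoding matched")

Worth noting as a design consequence: latin-1 maps every possible byte, so the loop can never fall through past the second candidate; the ordering of the list therefore matters more than its length.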
@@ -547,6 +859,7 @@ class DatabaseManager:
         if self.connection_type == 'sqlite':
            df.to_sql(table_name, self.conn, index=False, if_exists='replace')
         else:  # duckdb
+            # Register the DataFrame directly
             self.conn.register(table_name, df)
 
         # Track the table
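For the DuckDB path, conn.register exposes the pandas DataFrame as a queryable view without copying it into a database file. A minimal sketch of that behaviour outside SQLShell:

import duckdb
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3], "amount": [10.0, 20.0, 30.0]})

conn = duckdb.connect()       # in-memory database
conn.register("sales", df)    # register the DataFrame as a view named 'sales'

result = conn.execute("SELECT SUM(amount) AS total FROM sales").df()
print(result)                 # total = 60.0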
sqlshell/db/export_manager.py
ADDED
@@ -0,0 +1,188 @@
+"""Export functionality for SQLShell application."""
+
+import os
+import pandas as pd
+import numpy as np
+from typing import Optional, Tuple, Dict, Any
+
+class ExportManager:
+    """Manages data export functionality for SQLShell."""
+
+    def __init__(self, db_manager):
+        """Initialize the export manager.
+
+        Args:
+            db_manager: The database manager instance to use for table registration
+        """
+        self.db_manager = db_manager
+
+    def export_to_excel(self, df: pd.DataFrame, file_name: str) -> Tuple[str, Dict[str, Any]]:
+        """Export data to Excel format.
+
+        Args:
+            df: The DataFrame to export
+            file_name: The target file path
+
+        Returns:
+            Tuple containing:
+            - The generated table name
+            - Dictionary with export metadata
+        """
+        try:
+            # Export to Excel
+            df.to_excel(file_name, index=False)
+
+            # Generate table name from file name
+            base_name = os.path.splitext(os.path.basename(file_name))[0]
+            table_name = self.db_manager.sanitize_table_name(base_name)
+
+            # Ensure unique table name
+            original_name = table_name
+            counter = 1
+            while table_name in self.db_manager.loaded_tables:
+                table_name = f"{original_name}_{counter}"
+                counter += 1
+
+            # Register the table in the database manager
+            self.db_manager.register_dataframe(df, table_name, file_name)
+
+            # Update tracking
+            self.db_manager.loaded_tables[table_name] = file_name
+            self.db_manager.table_columns[table_name] = df.columns.tolist()
+
+            return table_name, {
+                'file_path': file_name,
+                'columns': df.columns.tolist(),
+                'row_count': len(df)
+            }
+
+        except Exception as e:
+            raise Exception(f"Failed to export to Excel: {str(e)}")
+
+    def export_to_parquet(self, df: pd.DataFrame, file_name: str) -> Tuple[str, Dict[str, Any]]:
+        """Export data to Parquet format.
+
+        Args:
+            df: The DataFrame to export
+            file_name: The target file path
+
+        Returns:
+            Tuple containing:
+            - The generated table name
+            - Dictionary with export metadata
+        """
+        try:
+            # Export to Parquet
+            df.to_parquet(file_name, index=False)
+
+            # Generate table name from file name
+            base_name = os.path.splitext(os.path.basename(file_name))[0]
+            table_name = self.db_manager.sanitize_table_name(base_name)
+
+            # Ensure unique table name
+            original_name = table_name
+            counter = 1
+            while table_name in self.db_manager.loaded_tables:
+                table_name = f"{original_name}_{counter}"
+                counter += 1
+
+            # Register the table in the database manager
+            self.db_manager.register_dataframe(df, table_name, file_name)
+
+            # Update tracking
+            self.db_manager.loaded_tables[table_name] = file_name
+            self.db_manager.table_columns[table_name] = df.columns.tolist()
+
+            return table_name, {
+                'file_path': file_name,
+                'columns': df.columns.tolist(),
+                'row_count': len(df)
+            }
+
+        except Exception as e:
+            raise Exception(f"Failed to export to Parquet: {str(e)}")
+
+    def convert_table_to_dataframe(self, table_widget) -> Optional[pd.DataFrame]:
+        """Convert a QTableWidget to a pandas DataFrame with proper data types.
+
+        Args:
+            table_widget: The QTableWidget containing the data
+
+        Returns:
+            DataFrame with properly typed data, or None if conversion fails
+        """
+        if not table_widget or table_widget.rowCount() == 0:
+            return None
+
+        # Get headers
+        headers = [table_widget.horizontalHeaderItem(i).text()
+                   for i in range(table_widget.columnCount())]
+
+        # Get data
+        data = []
+        for row in range(table_widget.rowCount()):
+            row_data = []
+            for column in range(table_widget.columnCount()):
+                item = table_widget.item(row, column)
+                row_data.append(item.text() if item else '')
+            data.append(row_data)
+
+        # Create DataFrame from raw string data
+        df_raw = pd.DataFrame(data, columns=headers)
+
+        # Try to use the original dataframe's dtypes if available
+        if hasattr(table_widget, 'current_df') and table_widget.current_df is not None:
+            original_df = table_widget.current_df
+
+            # Create a new DataFrame with appropriate types
+            df_typed = pd.DataFrame()
+
+            for col in df_raw.columns:
+                if col in original_df.columns:
+                    # Get the original column type
+                    orig_type = original_df[col].dtype
+
+                    # Special handling for different data types
+                    if pd.api.types.is_numeric_dtype(orig_type):
+                        try:
+                            numeric_col = pd.to_numeric(
+                                df_raw[col].str.replace(',', '').replace('NULL', np.nan)
+                            )
+                            df_typed[col] = numeric_col
+                        except:
+                            df_typed[col] = df_raw[col]
+                    elif pd.api.types.is_datetime64_dtype(orig_type):
+                        try:
+                            df_typed[col] = pd.to_datetime(df_raw[col].replace('NULL', np.nan))
+                        except:
+                            df_typed[col] = df_raw[col]
+                    elif pd.api.types.is_bool_dtype(orig_type):
+                        try:
+                            df_typed[col] = df_raw[col].map({'True': True, 'False': False}).replace('NULL', np.nan)
+                        except:
+                            df_typed[col] = df_raw[col]
+                    else:
+                        df_typed[col] = df_raw[col]
+                else:
+                    df_typed[col] = df_raw[col]
+
+            return df_typed
+
+        else:
+            # If we don't have the original dataframe, try to infer types
+            df_raw.replace('NULL', np.nan, inplace=True)
+
+            for col in df_raw.columns:
+                try:
+                    df_raw[col] = pd.to_numeric(df_raw[col].str.replace(',', ''))
+                except:
+                    try:
+                        df_raw[col] = pd.to_datetime(df_raw[col])
+                    except:
+                        try:
+                            if df_raw[col].dropna().isin(['True', 'False']).all():
+                                df_raw[col] = df_raw[col].map({'True': True, 'False': False})
+                        except:
+                            pass
+
+            return df_raw
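Taken together, the new ExportManager writes the current result set to disk and immediately re-registers it as a queryable table. A hedged usage sketch, assuming a DatabaseManager instance that exposes the sanitize_table_name/register_dataframe/loaded_tables/table_columns attributes the class relies on (constructor arguments are not shown in this diff and may differ):

import pandas as pd

from sqlshell.db.database_manager import DatabaseManager
from sqlshell.db.export_manager import ExportManager

# Assumption: DatabaseManager() can be created without arguments here;
# adjust to whatever constructor your installation actually uses.
db_manager = DatabaseManager()
exporter = ExportManager(db_manager)

results = pd.DataFrame({"region": ["EU", "US"], "revenue": [1200.5, 980.0]})

# Write the results to Parquet and register them as a new table.
table_name, info = exporter.export_to_parquet(results, "query_results.parquet")
print(table_name)           # e.g. 'query_results'
print(info["row_count"])    # 2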