sqlshell-0.2.2-py3-none-any.whl → sqlshell-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of sqlshell might be problematic.
- sqlshell/README.md +5 -1
- sqlshell/__init__.py +35 -5
- sqlshell/create_test_data.py +29 -0
- sqlshell/db/__init__.py +2 -1
- sqlshell/db/database_manager.py +336 -23
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/main.py +784 -143
- sqlshell/query_tab.py +592 -7
- sqlshell/table_list.py +90 -1
- sqlshell/ui/filter_header.py +36 -1
- sqlshell/utils/profile_column.py +2515 -0
- sqlshell/utils/profile_distributions.py +613 -0
- sqlshell/utils/profile_foreign_keys.py +547 -0
- sqlshell/utils/profile_ohe.py +631 -0
- sqlshell-0.3.0.dist-info/METADATA +400 -0
- {sqlshell-0.2.2.dist-info → sqlshell-0.3.0.dist-info}/RECORD +21 -14
- {sqlshell-0.2.2.dist-info → sqlshell-0.3.0.dist-info}/WHEEL +1 -1
- sqlshell-0.2.2.dist-info/METADATA +0 -198
- {sqlshell-0.2.2.dist-info → sqlshell-0.3.0.dist-info}/entry_points.txt +0 -0
- {sqlshell-0.2.2.dist-info → sqlshell-0.3.0.dist-info}/top_level.txt +0 -0
sqlshell/README.md
CHANGED
@@ -1,4 +1,6 @@
-# SQLShell
+# SQLShell - DEPRECATED README
+
+**NOTE: This README is deprecated. Please refer to the main README.md file in the root directory of the repository for the most up-to-date information.**
 
 A powerful SQL shell with GUI interface for data analysis. SQLShell provides an intuitive interface for working with various data formats (CSV, Excel, Parquet) using SQL queries powered by DuckDB.
 
@@ -12,6 +14,7 @@ A powerful SQL shell with GUI interface for data analysis. SQLShell provides an
 - Table preview functionality
 - Built-in test data generation
 - Support for multiple concurrent table views
+- "Explain Column" feature for analyzing relationships between data columns
 
 ## Installation
 
@@ -45,6 +48,7 @@ This will open the GUI interface where you can:
 3. Execute queries using the "Execute" button or Ctrl+Enter
 4. View results in the table view below
 5. Load sample test data using the "Test" button
+6. Right-click on column headers in the results to access features like sorting, filtering, and the "Explain Column" analysis tool
 
 ## Requirements
 
sqlshell/__init__.py
CHANGED
@@ -2,13 +2,43 @@
 SQLShell - A powerful SQL shell with GUI interface for data analysis
 """
 
-__version__ = "0.2.2"
+__version__ = "0.2.3"
 __author__ = "SQLShell Team"
 
-from sqlshell.main import main
+from sqlshell.main import main, SQLShell
+from PyQt6.QtWidgets import QApplication
+import sys
 
-def start():
-    """Start the SQLShell application.
-
+def start(database_path=None):
+    """Start the SQLShell application.
+
+    Args:
+        database_path (str, optional): Path to a database file to open. If provided,
+            SQLShell will automatically open this database on startup.
+    """
+    app = QApplication(sys.argv)
+    window = SQLShell()
+
+    if database_path:
+        try:
+            # Open the database
+            window.db_manager.open_database(database_path, load_all_tables=True)
+
+            # Update UI with tables from the database
+            for table_name, source in window.db_manager.loaded_tables.items():
+                if source == 'database':
+                    window.tables_list.add_table_item(table_name, "database")
+
+            # Update the completer with table and column names
+            window.update_completer()
+
+            # Update status bar
+            window.statusBar().showMessage(f"Connected to database: {database_path}")
+            window.db_info_label.setText(window.db_manager.get_connection_info())
+        except Exception as e:
+            print(f"Error opening database: {e}")
+
+    window.show()
+    sys.exit(app.exec())
 
 # SQLShell package initialization
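The reworked `start()` entry point means a database can now be opened straight from Python. A minimal usage sketch based on the signature above (the database path is illustrative, not a file shipped with the package):

```python
# Hypothetical usage of the new start() entry point added in this release.
# "analytics.duckdb" is an illustrative path.
import sqlshell

sqlshell.start("analytics.duckdb")  # open the GUI with this database pre-loaded
# sqlshell.start()                  # or launch with no database attached
```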
sqlshell/create_test_data.py
CHANGED
@@ -10,6 +10,35 @@ np.random.seed(42)
 OUTPUT_DIR = 'test_data'
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
+def create_california_housing_data(output_file='california_housing_data.parquet'):
+    """Use the real world california housing dataset"""
+    # Load the dataset
+    df = pd.read_csv('https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv')
+
+    # Save to Parquet
+    df.to_parquet(output_file)
+    return df
+
+def create_large_customer_data(num_customers=1_000_000, chunk_size=100_000, output_file='large_customer_data.parquet'):
+    """Create a large customer dataset"""
+    # Generate customer data
+    data = {
+        'CustomerID': range(1, num_customers + 1),
+        'FirstName': [f'Customer{i}' for i in range(1, num_customers + 1)],
+        'LastName': [f'Lastname{i}' for i in range(1, num_customers + 1)],
+        'Email': [f'customer{i}@example.com' for i in range(1, num_customers + 1)],
+        'JoinDate': [datetime.now() - timedelta(days=np.random.randint(1, 1000))
+                     for _ in range(num_customers)],
+        'CustomerType': np.random.choice(['Regular', 'Premium', 'VIP'], num_customers),
+        'CreditScore': np.random.randint(300, 851, num_customers)
+    }
+
+    # Create DataFrame
+    df = pd.DataFrame(data)
+
+    return df
+
+
 def create_sales_data(num_records=1000):
     # Generate dates for the last 365 days
     end_date = datetime.now()
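For orientation, a small usage sketch of the two new generators (not part of the package; `num_customers` is reduced so the example runs quickly, and note that `create_large_customer_data` only returns the DataFrame despite its `output_file` parameter):

```python
# Usage sketch for the new test-data generators shown above.
from sqlshell.create_test_data import (
    create_california_housing_data,
    create_large_customer_data,
)

housing = create_california_housing_data()            # downloads the CSV, writes Parquet
customers = create_large_customer_data(num_customers=10_000)
customers.to_parquet('large_customer_data.parquet')   # saving is left to the caller
print(housing.shape, customers.shape)
```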
sqlshell/db/__init__.py
CHANGED
sqlshell/db/database_manager.py
CHANGED
@@ -219,7 +219,7 @@ class DatabaseManager:
         Load data from a file into the database.
 
         Args:
-            file_path: Path to the data file (Excel, CSV, Parquet, Delta)
+            file_path: Path to the data file (Excel, CSV, TXT, Parquet, Delta)
 
         Returns:
             Tuple of (table_name, DataFrame) for the loaded data
@@ -240,8 +240,29 @@
                 # Load the Delta table
                 import deltalake
                 delta_table = deltalake.DeltaTable(file_path)
-
+
+                # Get the schema to identify decimal columns
+                schema = delta_table.schema()
+                decimal_columns = []
+
+                # Identify decimal columns from schema
+                for field in schema.fields:
+                    # Use string representation to check for decimal
+                    if 'decimal' in str(field.type).lower():
+                        decimal_columns.append(field.name)
+
+                # Read the data
                 df = delta_table.to_pandas()
+
+                # Try to convert decimal columns to float64, warn if not possible
+                for col in decimal_columns:
+                    if col in df.columns:
+                        try:
+                            df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
+                            if df[col].isna().any():
+                                print(f"Warning: Some values in column '{col}' could not be converted to float64 and are set as NaN.")
+                        except Exception as e:
+                            print(f"Warning: Could not convert column '{col}' to float64: {e}")
             except Exception as e:
                 raise ValueError(f"Error loading Delta table: {str(e)}")
         elif file_path.endswith(('.xlsx', '.xls')):
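DECIMAL columns in a Delta table typically materialize as Python decimal.Decimal objects in pandas, which is what the conversion pass above works around. A minimal standalone sketch of that conversion, using illustrative in-memory data rather than a real Delta table:

```python
# Sketch of the decimal -> float64 conversion performed in the hunk above.
from decimal import Decimal
import pandas as pd

df = pd.DataFrame({"price": [Decimal("19.99"), Decimal("5.25"), None]})
df["price"] = pd.to_numeric(df["price"], errors="coerce").astype("float64")
print(df["price"].dtype)  # float64; unconvertible values become NaN
```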
@@ -267,37 +288,208 @@
             except Exception:
                 # Fallback to standard reading method
                 df = pd.read_excel(file_path)
-        elif file_path.endswith('.csv'):
-            # For CSV files,
+        elif file_path.endswith(('.csv', '.txt')):
+            # For CSV and TXT files, detect separator and use chunking for large files
             try:
                 # Check if it's a large file
                 file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
 
-
-
-
+                # Try multiple encodings if needed
+                encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+                # Detect the separator automatically
+                def detect_separator(sample_data):
+                    # Common separators to check
+                    separators = [',', ';', '\t']
+                    separator_scores = {}
+
+                    # Split into lines and analyze
+                    lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                    if not lines:
+                        return ','  # Default if no content
+
+                    # Check for quoted content with separators
+                    has_quotes = '"' in sample_data or "'" in sample_data
 
-            #
-
+                    # If we have quoted content, use a different approach
+                    if has_quotes:
+                        for sep in separators:
+                            # Look for patterns like "value";
+                            pattern_count = 0
+                            for line in lines:
+                                # Count occurrences of quote + separator
+                                double_quote_pattern = f'"{sep}'
+                                single_quote_pattern = f"'{sep}"
+                                pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                            # If we found clear quote+separator patterns, this is likely our separator
+                            if pattern_count > 0:
+                                separator_scores[sep] = pattern_count
 
-            #
-
-
-
-
-
+                    # Standard approach based on consistent column counts
+                    if not separator_scores:
+                        for sep in separators:
+                            # Count consistent occurrences across lines
+                            counts = [line.count(sep) for line in lines]
+                            if counts and all(c > 0 for c in counts):
+                                # Calculate consistency score: higher if all counts are the same
+                                consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                                # Score is average count * consistency
+                                separator_scores[sep] = sum(counts) / len(counts) * consistency
 
-
+                    # Choose the separator with the highest score
+                    if separator_scores:
+                        return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                    # Default to comma if we couldn't determine
+                    return ','
+
+                # First, sample the file to detect separator
+                with open(file_path, 'rb') as f:
+                    # Read first few KB to detect encoding and separator
+                    raw_sample = f.read(4096)
+
+                # Try to decode with various encodings
+                sample_text = None
+                detected_encoding = None
+
+                for encoding in encodings_to_try:
+                    try:
+                        sample_text = raw_sample.decode(encoding)
+                        detected_encoding = encoding
+                        break
+                    except UnicodeDecodeError:
+                        continue
+
+                if not sample_text:
+                    raise ValueError("Could not decode file with any of the attempted encodings")
+
+                # Detect separator from the sample
+                separator = detect_separator(sample_text)
+
+                # Determine quote character (default to double quote)
+                quotechar = '"'
+                if sample_text.count("'") > sample_text.count('"'):
+                    quotechar = "'"
+
+                if file_size > 50:  # If file is larger than 50MB
+                    # Read the first chunk to get column types
+                    try:
+                        df_preview = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            nrows=1000,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        )
+
+                        # Use optimized dtypes for better memory usage
+                        dtypes = {col: df_preview[col].dtype for col in df_preview.columns}
+
+                        # Read again with chunk processing, combining up to 100k rows
+                        chunks = []
+                        for chunk in pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            dtype=dtypes,
+                            chunksize=10000,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        ):
+                            chunks.append(chunk)
+                            if len(chunks) * 10000 >= 100000:  # Cap at 100k rows
+                                break
+
+                        df = pd.concat(chunks, ignore_index=True)
+                    except pd.errors.ParserError as e:
+                        # If parsing fails, try again with error recovery options
+                        print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                        # Try with Python engine which is more flexible
+                        try:
+                            # First try with pandas >= 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',  # Always use python engine for error recovery
+                                quotechar=quotechar,
+                                doublequote=True,
+                                on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+                        except TypeError:
+                            # Fall back to pandas < 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',
+                                quotechar=quotechar,
+                                doublequote=True,
+                                error_bad_lines=False,  # Old parameter
+                                warn_bad_lines=True,  # Old parameter
+                                na_values=[''],
+                                keep_default_na=True
+                            )
                 else:
                     # For smaller files, read everything at once
-
-
-
-
+                    try:
+                        df = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        )
+                    except pd.errors.ParserError as e:
+                        # If parsing fails, try again with error recovery options
+                        print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                        # Try with Python engine which is more flexible
+                        try:
+                            # First try with pandas >= 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',  # Always use python engine for error recovery
+                                quotechar=quotechar,
+                                doublequote=True,
+                                on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+                        except TypeError:
+                            # Fall back to pandas < 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',
+                                quotechar=quotechar,
+                                doublequote=True,
+                                error_bad_lines=False,  # Old parameter
+                                warn_bad_lines=True,  # Old parameter
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+            except Exception as e:
+                # Log the error for debugging
+                import traceback
+                print(f"Error loading CSV/TXT file: {str(e)}")
+                print(traceback.format_exc())
+                raise ValueError(f"Error loading CSV/TXT file: {str(e)}")
         elif file_path.endswith('.parquet'):
             df = pd.read_parquet(file_path)
         else:
-            raise ValueError("Unsupported file format")
+            raise ValueError("Unsupported file format. Supported formats: .xlsx, .xls, .csv, .txt, .parquet, and Delta tables.")
 
         # Generate table name from file name
         base_name = os.path.splitext(os.path.basename(file_path))[0]
@@ -448,8 +640,128 @@ class DatabaseManager:
             df = delta_table.to_pandas()
         elif file_path.endswith(('.xlsx', '.xls')):
             df = pd.read_excel(file_path)
-        elif file_path.endswith('.csv'):
-
+        elif file_path.endswith(('.csv', '.txt')):
+            # Try multiple encodings for CSV/TXT files
+            encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+            # Detect the separator automatically
+            def detect_separator(sample_data):
+                # Common separators to check
+                separators = [',', ';', '\t']
+                separator_scores = {}
+
+                # Split into lines and analyze
+                lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                if not lines:
+                    return ','  # Default if no content
+
+                # Check for quoted content with separators
+                has_quotes = '"' in sample_data or "'" in sample_data
+
+                # If we have quoted content, use a different approach
+                if has_quotes:
+                    for sep in separators:
+                        # Look for patterns like "value";
+                        pattern_count = 0
+                        for line in lines:
+                            # Count occurrences of quote + separator
+                            double_quote_pattern = f'"{sep}'
+                            single_quote_pattern = f"'{sep}"
+                            pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                        # If we found clear quote+separator patterns, this is likely our separator
+                        if pattern_count > 0:
+                            separator_scores[sep] = pattern_count
+
+                # Standard approach based on consistent column counts
+                if not separator_scores:
+                    for sep in separators:
+                        # Count consistent occurrences across lines
+                        counts = [line.count(sep) for line in lines]
+                        if counts and all(c > 0 for c in counts):
+                            # Calculate consistency score: higher if all counts are the same
+                            consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                            # Score is average count * consistency
+                            separator_scores[sep] = sum(counts) / len(counts) * consistency
+
+                # Choose the separator with the highest score
+                if separator_scores:
+                    return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                # Default to comma if we couldn't determine
+                return ','
+
+            # First, sample the file to detect separator and encoding
+            with open(file_path, 'rb') as f:
+                # Read first few KB to detect encoding and separator
+                raw_sample = f.read(4096)
+
+            # Try to decode with various encodings
+            sample_text = None
+            detected_encoding = None
+
+            for encoding in encodings_to_try:
+                try:
+                    sample_text = raw_sample.decode(encoding)
+                    detected_encoding = encoding
+                    break
+                except UnicodeDecodeError:
+                    # If this encoding fails, try the next one
+                    continue
+
+            if not sample_text:
+                raise ValueError("Could not decode file with any of the attempted encodings")
+
+            # Detect separator from the sample
+            separator = detect_separator(sample_text)
+
+            # Determine quote character (default to double quote)
+            quotechar = '"'
+            if sample_text.count("'") > sample_text.count('"'):
+                quotechar = "'"
+
+            # Read with detected parameters
+            try:
+                df = pd.read_csv(
+                    file_path,
+                    sep=separator,
+                    encoding=detected_encoding,
+                    engine='python' if separator != ',' else 'c',
+                    quotechar=quotechar,
+                    doublequote=True
+                )
+            except pd.errors.ParserError as e:
+                # If parsing fails, try again with error recovery options
+                print(f"Initial parsing failed on reload: {str(e)}. Trying with error recovery options...")
+
+                # Try with Python engine which is more flexible
+                try:
+                    # First try with pandas >= 1.3 parameters
+                    df = pd.read_csv(
+                        file_path,
+                        sep=separator,
+                        encoding=detected_encoding,
+                        engine='python',  # Always use python engine for error recovery
+                        quotechar=quotechar,
+                        doublequote=True,
+                        on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                        na_values=[''],
+                        keep_default_na=True
+                    )
+                except TypeError:
+                    # Fall back to pandas < 1.3 parameters
+                    df = pd.read_csv(
+                        file_path,
+                        sep=separator,
+                        encoding=detected_encoding,
+                        engine='python',
+                        quotechar=quotechar,
+                        doublequote=True,
+                        error_bad_lines=False,  # Old parameter
+                        warn_bad_lines=True,  # Old parameter
+                        na_values=[''],
+                        keep_default_na=True
+                    )
         elif file_path.endswith('.parquet'):
             df = pd.read_parquet(file_path)
         else:
@@ -547,6 +859,7 @@ class DatabaseManager:
         if self.connection_type == 'sqlite':
            df.to_sql(table_name, self.conn, index=False, if_exists='replace')
         else:  # duckdb
+            # Register the DataFrame directly
            self.conn.register(table_name, df)
 
         # Track the table
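One pattern repeated throughout these CSV/TXT hunks is probing for the pandas error-handling API at call time: pandas >= 1.3 accepts `on_bad_lines`, while older releases expect `error_bad_lines`/`warn_bad_lines`. A condensed standalone sketch of that fallback (the helper name is illustrative, not part of sqlshell):

```python
# Condensed sketch of the version fallback used above: try the pandas >= 1.3
# keyword first, and fall back to the pre-1.3 keywords if pandas rejects it.
import pandas as pd

def read_csv_tolerant(path, sep=',', encoding='utf-8', quotechar='"'):
    common = dict(sep=sep, encoding=encoding, engine='python',
                  quotechar=quotechar, doublequote=True,
                  na_values=[''], keep_default_na=True)
    try:
        return pd.read_csv(path, on_bad_lines='warn', **common)   # pandas >= 1.3
    except TypeError:
        return pd.read_csv(path, error_bad_lines=False,           # pandas < 1.3
                           warn_bad_lines=True, **common)
```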