sqlshell-0.2.2-py3-none-any.whl → sqlshell-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sqlshell might be problematic.

sqlshell/README.md CHANGED
@@ -1,4 +1,6 @@
- # SQLShell
+ # SQLShell - DEPRECATED README
+
+ **NOTE: This README is deprecated. Please refer to the main README.md file in the root directory of the repository for the most up-to-date information.**
 
  A powerful SQL shell with GUI interface for data analysis. SQLShell provides an intuitive interface for working with various data formats (CSV, Excel, Parquet) using SQL queries powered by DuckDB.
 
@@ -12,6 +14,7 @@ A powerful SQL shell with GUI interface for data analysis. SQLShell provides an
  - Table preview functionality
  - Built-in test data generation
  - Support for multiple concurrent table views
+ - "Explain Column" feature for analyzing relationships between data columns
 
  ## Installation
 
@@ -45,6 +48,7 @@ This will open the GUI interface where you can:
  3. Execute queries using the "Execute" button or Ctrl+Enter
  4. View results in the table view below
  5. Load sample test data using the "Test" button
+ 6. Right-click on column headers in the results to access features like sorting, filtering, and the "Explain Column" analysis tool
 
  ## Requirements
 
sqlshell/__init__.py CHANGED
@@ -2,13 +2,43 @@
  SQLShell - A powerful SQL shell with GUI interface for data analysis
  """
 
- __version__ = "0.2.2"
+ __version__ = "0.2.3"
  __author__ = "SQLShell Team"
 
- from sqlshell.main import main
+ from sqlshell.main import main, SQLShell
+ from PyQt6.QtWidgets import QApplication
+ import sys
 
- def start():
-     """Start the SQLShell application."""
-     main()
+ def start(database_path=None):
+     """Start the SQLShell application.
+
+     Args:
+         database_path (str, optional): Path to a database file to open. If provided,
+             SQLShell will automatically open this database on startup.
+     """
+     app = QApplication(sys.argv)
+     window = SQLShell()
+
+     if database_path:
+         try:
+             # Open the database
+             window.db_manager.open_database(database_path, load_all_tables=True)
+
+             # Update UI with tables from the database
+             for table_name, source in window.db_manager.loaded_tables.items():
+                 if source == 'database':
+                     window.tables_list.add_table_item(table_name, "database")
+
+             # Update the completer with table and column names
+             window.update_completer()
+
+             # Update status bar
+             window.statusBar().showMessage(f"Connected to database: {database_path}")
+             window.db_info_label.setText(window.db_manager.get_connection_info())
+         except Exception as e:
+             print(f"Error opening database: {e}")
+
+     window.show()
+     sys.exit(app.exec())
 
  # SQLShell package initialization
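
Usage note (not part of the diff): with the new `start(database_path=None)` signature shown above, launching SQLShell against an existing database file from Python would look roughly like the sketch below. The file name is a made-up placeholder.

```python
# Minimal sketch based on the start() signature in the hunk above.
import sqlshell

# The argument is optional; omit it to start with no database attached.
sqlshell.start(database_path="analytics.duckdb")  # placeholder path
```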
@@ -10,6 +10,35 @@ np.random.seed(42)
  OUTPUT_DIR = 'test_data'
  os.makedirs(OUTPUT_DIR, exist_ok=True)
 
+ def create_california_housing_data(output_file='california_housing_data.parquet'):
+     """Use the real world california housing dataset"""
+     # Load the dataset
+     df = pd.read_csv('https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv')
+
+     # Save to Parquet
+     df.to_parquet(output_file)
+     return df
+
+ def create_large_customer_data(num_customers=1_000_000, chunk_size=100_000, output_file='large_customer_data.parquet'):
+     """Create a large customer dataset"""
+     # Generate customer data
+     data = {
+         'CustomerID': range(1, num_customers + 1),
+         'FirstName': [f'Customer{i}' for i in range(1, num_customers + 1)],
+         'LastName': [f'Lastname{i}' for i in range(1, num_customers + 1)],
+         'Email': [f'customer{i}@example.com' for i in range(1, num_customers + 1)],
+         'JoinDate': [datetime.now() - timedelta(days=np.random.randint(1, 1000))
+                      for _ in range(num_customers)],
+         'CustomerType': np.random.choice(['Regular', 'Premium', 'VIP'], num_customers),
+         'CreditScore': np.random.randint(300, 851, num_customers)
+     }
+
+     # Create DataFrame
+     df = pd.DataFrame(data)
+
+     return df
+
+
  def create_sales_data(num_records=1000):
      # Generate dates for the last 365 days
      end_date = datetime.now()
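
Aside, not from the package: as added above, create_large_customer_data only builds and returns the DataFrame (the chunk_size and output_file parameters are accepted but unused), so writing the Parquet file is left to the caller. A small usage sketch with a reduced row count:

```python
# Illustrative call of the helper defined in the hunk above.
df = create_large_customer_data(num_customers=10_000)
df.to_parquet('large_customer_data.parquet')  # the helper itself does not write the file
print(df.head())
```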
sqlshell/db/__init__.py CHANGED
@@ -1,5 +1,6 @@
  """Database management components for SQLShell application."""
 
  from sqlshell.db.database_manager import DatabaseManager
+ from sqlshell.db.export_manager import ExportManager
 
- __all__ = ['DatabaseManager']
+ __all__ = ['DatabaseManager', 'ExportManager']
@@ -219,7 +219,7 @@ class DatabaseManager:
          Load data from a file into the database.
 
          Args:
-             file_path: Path to the data file (Excel, CSV, Parquet, Delta)
+             file_path: Path to the data file (Excel, CSV, TXT, Parquet, Delta)
 
          Returns:
              Tuple of (table_name, DataFrame) for the loaded data
@@ -240,8 +240,29 @@
                  # Load the Delta table
                  import deltalake
                  delta_table = deltalake.DeltaTable(file_path)
-                 # Convert to pandas DataFrame
+
+                 # Get the schema to identify decimal columns
+                 schema = delta_table.schema()
+                 decimal_columns = []
+
+                 # Identify decimal columns from schema
+                 for field in schema.fields:
+                     # Use string representation to check for decimal
+                     if 'decimal' in str(field.type).lower():
+                         decimal_columns.append(field.name)
+
+                 # Read the data
                  df = delta_table.to_pandas()
+
+                 # Try to convert decimal columns to float64, warn if not possible
+                 for col in decimal_columns:
+                     if col in df.columns:
+                         try:
+                             df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
+                             if df[col].isna().any():
+                                 print(f"Warning: Some values in column '{col}' could not be converted to float64 and are set as NaN.")
+                         except Exception as e:
+                             print(f"Warning: Could not convert column '{col}' to float64: {e}")
             except Exception as e:
                 raise ValueError(f"Error loading Delta table: {str(e)}")
         elif file_path.endswith(('.xlsx', '.xls')):
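
For illustration only (not from the package): the decimal handling above reduces to coercing Decimal-typed columns to float64 with pandas, turning unconvertible values into NaN rather than raising. A standalone sketch with a made-up column name:

```python
from decimal import Decimal

import pandas as pd

# Hypothetical frame standing in for delta_table.to_pandas() output.
df = pd.DataFrame({"price": [Decimal("1.25"), Decimal("2.50"), None]})

# Same coercion as in the hunk: errors='coerce' maps unconvertible values to NaN.
df["price"] = pd.to_numeric(df["price"], errors="coerce").astype("float64")
print(df.dtypes)  # price    float64
```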
@@ -267,37 +288,208 @@ class DatabaseManager:
              except Exception:
                  # Fallback to standard reading method
                  df = pd.read_excel(file_path)
-         elif file_path.endswith('.csv'):
-             # For CSV files, we can use chunking for large files
+         elif file_path.endswith(('.csv', '.txt')):
+             # For CSV and TXT files, detect separator and use chunking for large files
              try:
                  # Check if it's a large file
                  file_size = os.path.getsize(file_path) / (1024 * 1024) # Size in MB
 
-                 if file_size > 50: # If file is larger than 50MB
-                     # Read the first chunk to get column types
-                     df_preview = pd.read_csv(file_path, nrows=1000)
+                 # Try multiple encodings if needed
+                 encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+                 # Detect the separator automatically
+                 def detect_separator(sample_data):
+                     # Common separators to check
+                     separators = [',', ';', '\t']
+                     separator_scores = {}
+
+                     # Split into lines and analyze
+                     lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                     if not lines:
+                         return ',' # Default if no content
+
+                     # Check for quoted content with separators
+                     has_quotes = '"' in sample_data or "'" in sample_data
 
-                     # Use optimized dtypes for better memory usage
-                     dtypes = {col: df_preview[col].dtype for col in df_preview.columns}
+                     # If we have quoted content, use a different approach
+                     if has_quotes:
+                         for sep in separators:
+                             # Look for patterns like "value";
+                             pattern_count = 0
+                             for line in lines:
+                                 # Count occurrences of quote + separator
+                                 double_quote_pattern = f'"{sep}'
+                                 single_quote_pattern = f"'{sep}"
+                                 pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                             # If we found clear quote+separator patterns, this is likely our separator
+                             if pattern_count > 0:
+                                 separator_scores[sep] = pattern_count
 
-                     # Read again with chunk processing, combining up to 100k rows
-                     chunks = []
-                     for chunk in pd.read_csv(file_path, dtype=dtypes, chunksize=10000):
-                         chunks.append(chunk)
-                         if len(chunks) * 10000 >= 100000: # Cap at 100k rows
-                             break
+                     # Standard approach based on consistent column counts
+                     if not separator_scores:
+                         for sep in separators:
+                             # Count consistent occurrences across lines
+                             counts = [line.count(sep) for line in lines]
+                             if counts and all(c > 0 for c in counts):
+                                 # Calculate consistency score: higher if all counts are the same
+                                 consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                                 # Score is average count * consistency
+                                 separator_scores[sep] = sum(counts) / len(counts) * consistency
 
-                     df = pd.concat(chunks, ignore_index=True)
+                     # Choose the separator with the highest score
+                     if separator_scores:
+                         return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                     # Default to comma if we couldn't determine
+                     return ','
+
+                 # First, sample the file to detect separator
+                 with open(file_path, 'rb') as f:
+                     # Read first few KB to detect encoding and separator
+                     raw_sample = f.read(4096)
+
+                 # Try to decode with various encodings
+                 sample_text = None
+                 detected_encoding = None
+
+                 for encoding in encodings_to_try:
+                     try:
+                         sample_text = raw_sample.decode(encoding)
+                         detected_encoding = encoding
+                         break
+                     except UnicodeDecodeError:
+                         continue
+
+                 if not sample_text:
+                     raise ValueError("Could not decode file with any of the attempted encodings")
+
+                 # Detect separator from the sample
+                 separator = detect_separator(sample_text)
+
+                 # Determine quote character (default to double quote)
+                 quotechar = '"'
+                 if sample_text.count("'") > sample_text.count('"'):
+                     quotechar = "'"
+
+                 if file_size > 50: # If file is larger than 50MB
+                     # Read the first chunk to get column types
+                     try:
+                         df_preview = pd.read_csv(
+                             file_path,
+                             sep=separator,
+                             nrows=1000,
+                             encoding=detected_encoding,
+                             engine='python' if separator != ',' else 'c',
+                             quotechar=quotechar,
+                             doublequote=True
+                         )
+
+                         # Use optimized dtypes for better memory usage
+                         dtypes = {col: df_preview[col].dtype for col in df_preview.columns}
+
+                         # Read again with chunk processing, combining up to 100k rows
+                         chunks = []
+                         for chunk in pd.read_csv(
+                             file_path,
+                             sep=separator,
+                             dtype=dtypes,
+                             chunksize=10000,
+                             encoding=detected_encoding,
+                             engine='python' if separator != ',' else 'c',
+                             quotechar=quotechar,
+                             doublequote=True
+                         ):
+                             chunks.append(chunk)
+                             if len(chunks) * 10000 >= 100000: # Cap at 100k rows
+                                 break
+
+                         df = pd.concat(chunks, ignore_index=True)
+                     except pd.errors.ParserError as e:
+                         # If parsing fails, try again with error recovery options
+                         print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                         # Try with Python engine which is more flexible
+                         try:
+                             # First try with pandas >= 1.3 parameters
+                             df = pd.read_csv(
+                                 file_path,
+                                 sep=separator,
+                                 encoding=detected_encoding,
+                                 engine='python', # Always use python engine for error recovery
+                                 quotechar=quotechar,
+                                 doublequote=True,
+                                 on_bad_lines='warn', # New parameter in pandas >= 1.3
+                                 na_values=[''],
+                                 keep_default_na=True
+                             )
+                         except TypeError:
+                             # Fall back to pandas < 1.3 parameters
+                             df = pd.read_csv(
+                                 file_path,
+                                 sep=separator,
+                                 encoding=detected_encoding,
+                                 engine='python',
+                                 quotechar=quotechar,
+                                 doublequote=True,
+                                 error_bad_lines=False, # Old parameter
+                                 warn_bad_lines=True, # Old parameter
+                                 na_values=[''],
+                                 keep_default_na=True
+                             )
                  else:
                      # For smaller files, read everything at once
-                     df = pd.read_csv(file_path)
-             except Exception:
-                 # Fallback to standard reading method
-                 df = pd.read_csv(file_path)
+                     try:
+                         df = pd.read_csv(
+                             file_path,
+                             sep=separator,
+                             encoding=detected_encoding,
+                             engine='python' if separator != ',' else 'c',
+                             quotechar=quotechar,
+                             doublequote=True
+                         )
+                     except pd.errors.ParserError as e:
+                         # If parsing fails, try again with error recovery options
+                         print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                         # Try with Python engine which is more flexible
+                         try:
+                             # First try with pandas >= 1.3 parameters
+                             df = pd.read_csv(
+                                 file_path,
+                                 sep=separator,
+                                 encoding=detected_encoding,
+                                 engine='python', # Always use python engine for error recovery
+                                 quotechar=quotechar,
+                                 doublequote=True,
+                                 on_bad_lines='warn', # New parameter in pandas >= 1.3
+                                 na_values=[''],
+                                 keep_default_na=True
+                             )
+                         except TypeError:
+                             # Fall back to pandas < 1.3 parameters
+                             df = pd.read_csv(
+                                 file_path,
+                                 sep=separator,
+                                 encoding=detected_encoding,
+                                 engine='python',
+                                 quotechar=quotechar,
+                                 doublequote=True,
+                                 error_bad_lines=False, # Old parameter
+                                 warn_bad_lines=True, # Old parameter
+                                 na_values=[''],
+                                 keep_default_na=True
+                             )
+             except Exception as e:
+                 # Log the error for debugging
+                 import traceback
+                 print(f"Error loading CSV/TXT file: {str(e)}")
+                 print(traceback.format_exc())
+                 raise ValueError(f"Error loading CSV/TXT file: {str(e)}")
         elif file_path.endswith('.parquet'):
             df = pd.read_parquet(file_path)
         else:
-             raise ValueError("Unsupported file format")
+             raise ValueError("Unsupported file format. Supported formats: .xlsx, .xls, .csv, .txt, .parquet, and Delta tables.")
 
         # Generate table name from file name
         base_name = os.path.splitext(os.path.basename(file_path))[0]
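
Aside (not part of the diff): the hand-rolled detection above (read a byte sample, try several encodings, score candidate separators) is close to what the standard library's csv.Sniffer provides. A minimal sketch of the same sample-then-parse flow using the stdlib, assuming a 4 KB sample is representative of the file:

```python
import csv

import pandas as pd

def read_delimited(file_path: str) -> pd.DataFrame:
    # Sample raw bytes and try a few encodings, as in the hunk above.
    with open(file_path, "rb") as f:
        raw_sample = f.read(4096)
    for encoding in ("utf-8", "latin-1", "cp1252"):
        try:
            sample_text = raw_sample.decode(encoding)
            break
        except UnicodeDecodeError:
            continue
    else:
        raise ValueError("Could not decode file sample")

    # Let the stdlib infer the delimiter and quote character from the sample.
    dialect = csv.Sniffer().sniff(sample_text, delimiters=",;\t")
    return pd.read_csv(file_path, sep=dialect.delimiter,
                       quotechar=dialect.quotechar, encoding=encoding)
```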
@@ -448,8 +640,128 @@ class DatabaseManager:
             df = delta_table.to_pandas()
         elif file_path.endswith(('.xlsx', '.xls')):
             df = pd.read_excel(file_path)
-         elif file_path.endswith('.csv'):
-             df = pd.read_csv(file_path)
+         elif file_path.endswith(('.csv', '.txt')):
+             # Try multiple encodings for CSV/TXT files
+             encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+             # Detect the separator automatically
+             def detect_separator(sample_data):
+                 # Common separators to check
+                 separators = [',', ';', '\t']
+                 separator_scores = {}
+
+                 # Split into lines and analyze
+                 lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                 if not lines:
+                     return ',' # Default if no content
+
+                 # Check for quoted content with separators
+                 has_quotes = '"' in sample_data or "'" in sample_data
+
+                 # If we have quoted content, use a different approach
+                 if has_quotes:
+                     for sep in separators:
+                         # Look for patterns like "value";
+                         pattern_count = 0
+                         for line in lines:
+                             # Count occurrences of quote + separator
+                             double_quote_pattern = f'"{sep}'
+                             single_quote_pattern = f"'{sep}"
+                             pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                         # If we found clear quote+separator patterns, this is likely our separator
+                         if pattern_count > 0:
+                             separator_scores[sep] = pattern_count
+
+                 # Standard approach based on consistent column counts
+                 if not separator_scores:
+                     for sep in separators:
+                         # Count consistent occurrences across lines
+                         counts = [line.count(sep) for line in lines]
+                         if counts and all(c > 0 for c in counts):
+                             # Calculate consistency score: higher if all counts are the same
+                             consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                             # Score is average count * consistency
+                             separator_scores[sep] = sum(counts) / len(counts) * consistency
+
+                 # Choose the separator with the highest score
+                 if separator_scores:
+                     return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                 # Default to comma if we couldn't determine
+                 return ','
+
+             # First, sample the file to detect separator and encoding
+             with open(file_path, 'rb') as f:
+                 # Read first few KB to detect encoding and separator
+                 raw_sample = f.read(4096)
+
+             # Try to decode with various encodings
+             sample_text = None
+             detected_encoding = None
+
+             for encoding in encodings_to_try:
+                 try:
+                     sample_text = raw_sample.decode(encoding)
+                     detected_encoding = encoding
+                     break
+                 except UnicodeDecodeError:
+                     # If this encoding fails, try the next one
+                     continue
+
+             if not sample_text:
+                 raise ValueError("Could not decode file with any of the attempted encodings")
+
+             # Detect separator from the sample
+             separator = detect_separator(sample_text)
+
+             # Determine quote character (default to double quote)
+             quotechar = '"'
+             if sample_text.count("'") > sample_text.count('"'):
+                 quotechar = "'"
+
+             # Read with detected parameters
+             try:
+                 df = pd.read_csv(
+                     file_path,
+                     sep=separator,
+                     encoding=detected_encoding,
+                     engine='python' if separator != ',' else 'c',
+                     quotechar=quotechar,
+                     doublequote=True
+                 )
+             except pd.errors.ParserError as e:
+                 # If parsing fails, try again with error recovery options
+                 print(f"Initial parsing failed on reload: {str(e)}. Trying with error recovery options...")
+
+                 # Try with Python engine which is more flexible
+                 try:
+                     # First try with pandas >= 1.3 parameters
+                     df = pd.read_csv(
+                         file_path,
+                         sep=separator,
+                         encoding=detected_encoding,
+                         engine='python', # Always use python engine for error recovery
+                         quotechar=quotechar,
+                         doublequote=True,
+                         on_bad_lines='warn', # New parameter in pandas >= 1.3
+                         na_values=[''],
+                         keep_default_na=True
+                     )
+                 except TypeError:
+                     # Fall back to pandas < 1.3 parameters
+                     df = pd.read_csv(
+                         file_path,
+                         sep=separator,
+                         encoding=detected_encoding,
+                         engine='python',
+                         quotechar=quotechar,
+                         doublequote=True,
+                         error_bad_lines=False, # Old parameter
+                         warn_bad_lines=True, # Old parameter
+                         na_values=[''],
+                         keep_default_na=True
+                     )
         elif file_path.endswith('.parquet'):
             df = pd.read_parquet(file_path)
         else:
@@ -547,6 +859,7 @@ class DatabaseManager:
         if self.connection_type == 'sqlite':
             df.to_sql(table_name, self.conn, index=False, if_exists='replace')
         else:  # duckdb
+             # Register the DataFrame directly
             self.conn.register(table_name, df)
 
         # Track the table
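
Background on that last hunk: DuckDB's register exposes a pandas DataFrame under a name so it can be queried directly, without the to_sql copy the SQLite branch needs. A small standalone illustration (table and column names are made up):

```python
import duckdb
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3], "amount": [9.5, 3.2, 7.8]})

con = duckdb.connect()        # in-memory DuckDB database
con.register("sales", df)     # the DataFrame is now queryable as "sales"
print(con.execute("SELECT SUM(amount) FROM sales").fetchall())
```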