sqlshell 0.2.3-py3-none-any.whl → 0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sqlshell might be problematic.
- sqlshell/__init__.py +34 -4
- sqlshell/db/__init__.py +2 -1
- sqlshell/db/database_manager.py +336 -23
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/main.py +570 -140
- sqlshell/query_tab.py +592 -7
- sqlshell/ui/filter_header.py +22 -1
- sqlshell/utils/profile_column.py +1586 -170
- sqlshell/utils/profile_foreign_keys.py +103 -11
- sqlshell/utils/profile_ohe.py +631 -0
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/METADATA +126 -7
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/RECORD +17 -13
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/WHEEL +1 -1
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/entry_points.txt +0 -0
- {sqlshell-0.2.3.dist-info → sqlshell-0.3.0.dist-info}/top_level.txt +0 -0
sqlshell/__init__.py
CHANGED
@@ -5,10 +5,40 @@ SQLShell - A powerful SQL shell with GUI interface for data analysis
 __version__ = "0.2.3"
 __author__ = "SQLShell Team"
 
-from sqlshell.main import main
+from sqlshell.main import main, SQLShell
+from PyQt6.QtWidgets import QApplication
+import sys
 
-def start():
-    """Start the SQLShell application.
-
+def start(database_path=None):
+    """Start the SQLShell application.
+
+    Args:
+        database_path (str, optional): Path to a database file to open. If provided,
+            SQLShell will automatically open this database on startup.
+    """
+    app = QApplication(sys.argv)
+    window = SQLShell()
+
+    if database_path:
+        try:
+            # Open the database
+            window.db_manager.open_database(database_path, load_all_tables=True)
+
+            # Update UI with tables from the database
+            for table_name, source in window.db_manager.loaded_tables.items():
+                if source == 'database':
+                    window.tables_list.add_table_item(table_name, "database")
+
+            # Update the completer with table and column names
+            window.update_completer()
+
+            # Update status bar
+            window.statusBar().showMessage(f"Connected to database: {database_path}")
+            window.db_info_label.setText(window.db_manager.get_connection_info())
+        except Exception as e:
+            print(f"Error opening database: {e}")
+
+    window.show()
+    sys.exit(app.exec())
 
 # SQLShell package initialization
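In practice, the widened start() signature lets callers launch the GUI against an existing database in one line. A minimal sketch, assuming a local DuckDB file (the path is illustrative):

import sqlshell

# Launches the Qt application; if a path is given, SQLShell opens it,
# lists its tables in the sidebar, and refreshes autocompletion on startup.
sqlshell.start(database_path="sales.duckdb")

# Equivalent to the 0.2.3 behaviour: open with no database attached.
# sqlshell.start()

Note that start() ends with sys.exit(app.exec()), so the call blocks until the main window is closed and then terminates the interpreter.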
sqlshell/db/__init__.py
CHANGED
sqlshell/db/database_manager.py
CHANGED
@@ -219,7 +219,7 @@ class DatabaseManager:
         Load data from a file into the database.
 
         Args:
-            file_path: Path to the data file (Excel, CSV, Parquet, Delta)
+            file_path: Path to the data file (Excel, CSV, TXT, Parquet, Delta)
 
         Returns:
             Tuple of (table_name, DataFrame) for the loaded data
@@ -240,8 +240,29 @@ class DatabaseManager:
                 # Load the Delta table
                 import deltalake
                 delta_table = deltalake.DeltaTable(file_path)
-
+
+                # Get the schema to identify decimal columns
+                schema = delta_table.schema()
+                decimal_columns = []
+
+                # Identify decimal columns from schema
+                for field in schema.fields:
+                    # Use string representation to check for decimal
+                    if 'decimal' in str(field.type).lower():
+                        decimal_columns.append(field.name)
+
+                # Read the data
                 df = delta_table.to_pandas()
+
+                # Try to convert decimal columns to float64, warn if not possible
+                for col in decimal_columns:
+                    if col in df.columns:
+                        try:
+                            df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
+                            if df[col].isna().any():
+                                print(f"Warning: Some values in column '{col}' could not be converted to float64 and are set as NaN.")
+                        except Exception as e:
+                            print(f"Warning: Could not convert column '{col}' to float64: {e}")
             except Exception as e:
                 raise ValueError(f"Error loading Delta table: {str(e)}")
         elif file_path.endswith(('.xlsx', '.xls')):
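The decimal handling above matters because deltalake hands decimal-typed columns to pandas as Python decimal.Decimal objects, which many numeric operations handle poorly. A minimal sketch of the same conversion outside SQLShell, assuming a plain pandas DataFrame standing in for the output of delta_table.to_pandas():

from decimal import Decimal

import pandas as pd

# Hypothetical frame: the 'price' column arrives as object-dtype Decimal values.
df = pd.DataFrame({"price": [Decimal("19.99"), Decimal("5.50"), None]})

# Same approach as the diff: coerce to numeric, then force float64.
df["price"] = pd.to_numeric(df["price"], errors="coerce").astype("float64")

print(df.dtypes)   # price    float64
print(df)          # NaN where the value could not be converted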
@@ -267,37 +288,208 @@ class DatabaseManager:
             except Exception:
                 # Fallback to standard reading method
                 df = pd.read_excel(file_path)
-        elif file_path.endswith('.csv'):
-            # For CSV files,
+        elif file_path.endswith(('.csv', '.txt')):
+            # For CSV and TXT files, detect separator and use chunking for large files
             try:
                 # Check if it's a large file
                 file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
 
-
-
-
+                # Try multiple encodings if needed
+                encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+                # Detect the separator automatically
+                def detect_separator(sample_data):
+                    # Common separators to check
+                    separators = [',', ';', '\t']
+                    separator_scores = {}
+
+                    # Split into lines and analyze
+                    lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                    if not lines:
+                        return ','  # Default if no content
+
+                    # Check for quoted content with separators
+                    has_quotes = '"' in sample_data or "'" in sample_data
 
-                #
-
+                    # If we have quoted content, use a different approach
+                    if has_quotes:
+                        for sep in separators:
+                            # Look for patterns like "value";
+                            pattern_count = 0
+                            for line in lines:
+                                # Count occurrences of quote + separator
+                                double_quote_pattern = f'"{sep}'
+                                single_quote_pattern = f"'{sep}"
+                                pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                            # If we found clear quote+separator patterns, this is likely our separator
+                            if pattern_count > 0:
+                                separator_scores[sep] = pattern_count
 
-                #
-
-
-
-
-
+                    # Standard approach based on consistent column counts
+                    if not separator_scores:
+                        for sep in separators:
+                            # Count consistent occurrences across lines
+                            counts = [line.count(sep) for line in lines]
+                            if counts and all(c > 0 for c in counts):
+                                # Calculate consistency score: higher if all counts are the same
+                                consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                                # Score is average count * consistency
+                                separator_scores[sep] = sum(counts) / len(counts) * consistency
 
-
+                    # Choose the separator with the highest score
+                    if separator_scores:
+                        return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                    # Default to comma if we couldn't determine
+                    return ','
+
+                # First, sample the file to detect separator
+                with open(file_path, 'rb') as f:
+                    # Read first few KB to detect encoding and separator
+                    raw_sample = f.read(4096)
+
+                # Try to decode with various encodings
+                sample_text = None
+                detected_encoding = None
+
+                for encoding in encodings_to_try:
+                    try:
+                        sample_text = raw_sample.decode(encoding)
+                        detected_encoding = encoding
+                        break
+                    except UnicodeDecodeError:
+                        continue
+
+                if not sample_text:
+                    raise ValueError("Could not decode file with any of the attempted encodings")
+
+                # Detect separator from the sample
+                separator = detect_separator(sample_text)
+
+                # Determine quote character (default to double quote)
+                quotechar = '"'
+                if sample_text.count("'") > sample_text.count('"'):
+                    quotechar = "'"
+
+                if file_size > 50:  # If file is larger than 50MB
+                    # Read the first chunk to get column types
+                    try:
+                        df_preview = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            nrows=1000,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        )
+
+                        # Use optimized dtypes for better memory usage
+                        dtypes = {col: df_preview[col].dtype for col in df_preview.columns}
+
+                        # Read again with chunk processing, combining up to 100k rows
+                        chunks = []
+                        for chunk in pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            dtype=dtypes,
+                            chunksize=10000,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        ):
+                            chunks.append(chunk)
+                            if len(chunks) * 10000 >= 100000:  # Cap at 100k rows
+                                break
+
+                        df = pd.concat(chunks, ignore_index=True)
+                    except pd.errors.ParserError as e:
+                        # If parsing fails, try again with error recovery options
+                        print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                        # Try with Python engine which is more flexible
+                        try:
+                            # First try with pandas >= 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',  # Always use python engine for error recovery
+                                quotechar=quotechar,
+                                doublequote=True,
+                                on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+                        except TypeError:
+                            # Fall back to pandas < 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',
+                                quotechar=quotechar,
+                                doublequote=True,
+                                error_bad_lines=False,  # Old parameter
+                                warn_bad_lines=True,  # Old parameter
+                                na_values=[''],
+                                keep_default_na=True
+                            )
                 else:
                     # For smaller files, read everything at once
-
-
-
-
+                    try:
+                        df = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        )
+                    except pd.errors.ParserError as e:
+                        # If parsing fails, try again with error recovery options
+                        print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                        # Try with Python engine which is more flexible
+                        try:
+                            # First try with pandas >= 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',  # Always use python engine for error recovery
+                                quotechar=quotechar,
+                                doublequote=True,
+                                on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+                        except TypeError:
+                            # Fall back to pandas < 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',
+                                quotechar=quotechar,
+                                doublequote=True,
+                                error_bad_lines=False,  # Old parameter
+                                warn_bad_lines=True,  # Old parameter
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+            except Exception as e:
+                # Log the error for debugging
+                import traceback
+                print(f"Error loading CSV/TXT file: {str(e)}")
+                print(traceback.format_exc())
+                raise ValueError(f"Error loading CSV/TXT file: {str(e)}")
         elif file_path.endswith('.parquet'):
             df = pd.read_parquet(file_path)
         else:
-            raise ValueError("Unsupported file format")
+            raise ValueError("Unsupported file format. Supported formats: .xlsx, .xls, .csv, .txt, .parquet, and Delta tables.")
 
         # Generate table name from file name
         base_name = os.path.splitext(os.path.basename(file_path))[0]
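The separator detection added here is a heuristic over a 4 KB sample: quote-plus-separator patterns win first, otherwise the separator with the most consistent per-line count does. A condensed standalone re-implementation of that scoring idea for illustration (the shipped version additionally handles quoted fields):

def detect_separator(sample_data: str) -> str:
    # Score candidate separators by average per-line count, weighted by consistency.
    separators = [',', ';', '\t']
    scores = {}
    lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
    if not lines:
        return ','
    for sep in separators:
        counts = [line.count(sep) for line in lines]
        if counts and all(c > 0 for c in counts):
            consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
            scores[sep] = sum(counts) / len(counts) * consistency
    return max(scores, key=scores.get) if scores else ','

sample = "id;name;amount\n1;Alice;10,5\n2;Bob;20,0\n"
print(detect_separator(sample))  # ';' — commas appear, but not on every line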
@@ -448,8 +640,128 @@ class DatabaseManager:
                 df = delta_table.to_pandas()
             elif file_path.endswith(('.xlsx', '.xls')):
                 df = pd.read_excel(file_path)
-            elif file_path.endswith('.csv'):
-
+            elif file_path.endswith(('.csv', '.txt')):
+                # Try multiple encodings for CSV/TXT files
+                encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+                # Detect the separator automatically
+                def detect_separator(sample_data):
+                    # Common separators to check
+                    separators = [',', ';', '\t']
+                    separator_scores = {}
+
+                    # Split into lines and analyze
+                    lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                    if not lines:
+                        return ','  # Default if no content
+
+                    # Check for quoted content with separators
+                    has_quotes = '"' in sample_data or "'" in sample_data
+
+                    # If we have quoted content, use a different approach
+                    if has_quotes:
+                        for sep in separators:
+                            # Look for patterns like "value";
+                            pattern_count = 0
+                            for line in lines:
+                                # Count occurrences of quote + separator
+                                double_quote_pattern = f'"{sep}'
+                                single_quote_pattern = f"'{sep}"
+                                pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                            # If we found clear quote+separator patterns, this is likely our separator
+                            if pattern_count > 0:
+                                separator_scores[sep] = pattern_count
+
+                    # Standard approach based on consistent column counts
+                    if not separator_scores:
+                        for sep in separators:
+                            # Count consistent occurrences across lines
+                            counts = [line.count(sep) for line in lines]
+                            if counts and all(c > 0 for c in counts):
+                                # Calculate consistency score: higher if all counts are the same
+                                consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                                # Score is average count * consistency
+                                separator_scores[sep] = sum(counts) / len(counts) * consistency
+
+                    # Choose the separator with the highest score
+                    if separator_scores:
+                        return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                    # Default to comma if we couldn't determine
+                    return ','
+
+                # First, sample the file to detect separator and encoding
+                with open(file_path, 'rb') as f:
+                    # Read first few KB to detect encoding and separator
+                    raw_sample = f.read(4096)
+
+                # Try to decode with various encodings
+                sample_text = None
+                detected_encoding = None
+
+                for encoding in encodings_to_try:
+                    try:
+                        sample_text = raw_sample.decode(encoding)
+                        detected_encoding = encoding
+                        break
+                    except UnicodeDecodeError:
+                        # If this encoding fails, try the next one
+                        continue
+
+                if not sample_text:
+                    raise ValueError("Could not decode file with any of the attempted encodings")
+
+                # Detect separator from the sample
+                separator = detect_separator(sample_text)
+
+                # Determine quote character (default to double quote)
+                quotechar = '"'
+                if sample_text.count("'") > sample_text.count('"'):
+                    quotechar = "'"
+
+                # Read with detected parameters
+                try:
+                    df = pd.read_csv(
+                        file_path,
+                        sep=separator,
+                        encoding=detected_encoding,
+                        engine='python' if separator != ',' else 'c',
+                        quotechar=quotechar,
+                        doublequote=True
+                    )
+                except pd.errors.ParserError as e:
+                    # If parsing fails, try again with error recovery options
+                    print(f"Initial parsing failed on reload: {str(e)}. Trying with error recovery options...")
+
+                    # Try with Python engine which is more flexible
+                    try:
+                        # First try with pandas >= 1.3 parameters
+                        df = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            encoding=detected_encoding,
+                            engine='python',  # Always use python engine for error recovery
+                            quotechar=quotechar,
+                            doublequote=True,
+                            on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                            na_values=[''],
+                            keep_default_na=True
+                        )
+                    except TypeError:
+                        # Fall back to pandas < 1.3 parameters
+                        df = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            encoding=detected_encoding,
+                            engine='python',
+                            quotechar=quotechar,
+                            doublequote=True,
+                            error_bad_lines=False,  # Old parameter
+                            warn_bad_lines=True,  # Old parameter
+                            na_values=[''],
+                            keep_default_na=True
+                        )
             elif file_path.endswith('.parquet'):
                 df = pd.read_parquet(file_path)
             else:
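The reload path repeats the same encoding probe: decode a small byte sample with each candidate encoding and keep the first that succeeds. A standalone sketch of that probe (the file name is illustrative):

# Probe a byte sample against several encodings and keep the first that decodes.
encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']

with open("export.txt", "rb") as f:    # illustrative path
    raw_sample = f.read(4096)

detected_encoding = None
for encoding in encodings_to_try:
    try:
        raw_sample.decode(encoding)
        detected_encoding = encoding
        break
    except UnicodeDecodeError:
        continue

print(detected_encoding or "no candidate encoding matched")

Worth noting as a design consequence: latin-1 maps every possible byte, so the loop can never fall through past the second candidate; the ordering of the list therefore matters more than its length.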
@@ -547,6 +859,7 @@ class DatabaseManager:
         if self.connection_type == 'sqlite':
            df.to_sql(table_name, self.conn, index=False, if_exists='replace')
         else:  # duckdb
+            # Register the DataFrame directly
             self.conn.register(table_name, df)
 
         # Track the table
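For the DuckDB path, conn.register exposes the pandas DataFrame as a queryable view without copying it into a database file. A minimal sketch of that behaviour outside SQLShell:

import duckdb
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3], "amount": [10.0, 20.0, 30.0]})

conn = duckdb.connect()       # in-memory database
conn.register("sales", df)    # register the DataFrame as a view named 'sales'

result = conn.execute("SELECT SUM(amount) AS total FROM sales").df()
print(result)                 # total = 60.0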
sqlshell/db/export_manager.py
ADDED
@@ -0,0 +1,188 @@
+"""Export functionality for SQLShell application."""
+
+import os
+import pandas as pd
+import numpy as np
+from typing import Optional, Tuple, Dict, Any
+
+class ExportManager:
+    """Manages data export functionality for SQLShell."""
+
+    def __init__(self, db_manager):
+        """Initialize the export manager.
+
+        Args:
+            db_manager: The database manager instance to use for table registration
+        """
+        self.db_manager = db_manager
+
+    def export_to_excel(self, df: pd.DataFrame, file_name: str) -> Tuple[str, Dict[str, Any]]:
+        """Export data to Excel format.
+
+        Args:
+            df: The DataFrame to export
+            file_name: The target file path
+
+        Returns:
+            Tuple containing:
+            - The generated table name
+            - Dictionary with export metadata
+        """
+        try:
+            # Export to Excel
+            df.to_excel(file_name, index=False)
+
+            # Generate table name from file name
+            base_name = os.path.splitext(os.path.basename(file_name))[0]
+            table_name = self.db_manager.sanitize_table_name(base_name)
+
+            # Ensure unique table name
+            original_name = table_name
+            counter = 1
+            while table_name in self.db_manager.loaded_tables:
+                table_name = f"{original_name}_{counter}"
+                counter += 1
+
+            # Register the table in the database manager
+            self.db_manager.register_dataframe(df, table_name, file_name)
+
+            # Update tracking
+            self.db_manager.loaded_tables[table_name] = file_name
+            self.db_manager.table_columns[table_name] = df.columns.tolist()
+
+            return table_name, {
+                'file_path': file_name,
+                'columns': df.columns.tolist(),
+                'row_count': len(df)
+            }
+
+        except Exception as e:
+            raise Exception(f"Failed to export to Excel: {str(e)}")
+
+    def export_to_parquet(self, df: pd.DataFrame, file_name: str) -> Tuple[str, Dict[str, Any]]:
+        """Export data to Parquet format.
+
+        Args:
+            df: The DataFrame to export
+            file_name: The target file path
+
+        Returns:
+            Tuple containing:
+            - The generated table name
+            - Dictionary with export metadata
+        """
+        try:
+            # Export to Parquet
+            df.to_parquet(file_name, index=False)
+
+            # Generate table name from file name
+            base_name = os.path.splitext(os.path.basename(file_name))[0]
+            table_name = self.db_manager.sanitize_table_name(base_name)
+
+            # Ensure unique table name
+            original_name = table_name
+            counter = 1
+            while table_name in self.db_manager.loaded_tables:
+                table_name = f"{original_name}_{counter}"
+                counter += 1
+
+            # Register the table in the database manager
+            self.db_manager.register_dataframe(df, table_name, file_name)
+
+            # Update tracking
+            self.db_manager.loaded_tables[table_name] = file_name
+            self.db_manager.table_columns[table_name] = df.columns.tolist()
+
+            return table_name, {
+                'file_path': file_name,
+                'columns': df.columns.tolist(),
+                'row_count': len(df)
+            }
+
+        except Exception as e:
+            raise Exception(f"Failed to export to Parquet: {str(e)}")
+
+    def convert_table_to_dataframe(self, table_widget) -> Optional[pd.DataFrame]:
+        """Convert a QTableWidget to a pandas DataFrame with proper data types.
+
+        Args:
+            table_widget: The QTableWidget containing the data
+
+        Returns:
+            DataFrame with properly typed data, or None if conversion fails
+        """
+        if not table_widget or table_widget.rowCount() == 0:
+            return None
+
+        # Get headers
+        headers = [table_widget.horizontalHeaderItem(i).text()
+                   for i in range(table_widget.columnCount())]
+
+        # Get data
+        data = []
+        for row in range(table_widget.rowCount()):
+            row_data = []
+            for column in range(table_widget.columnCount()):
+                item = table_widget.item(row, column)
+                row_data.append(item.text() if item else '')
+            data.append(row_data)
+
+        # Create DataFrame from raw string data
+        df_raw = pd.DataFrame(data, columns=headers)
+
+        # Try to use the original dataframe's dtypes if available
+        if hasattr(table_widget, 'current_df') and table_widget.current_df is not None:
+            original_df = table_widget.current_df
+
+            # Create a new DataFrame with appropriate types
+            df_typed = pd.DataFrame()
+
+            for col in df_raw.columns:
+                if col in original_df.columns:
+                    # Get the original column type
+                    orig_type = original_df[col].dtype
+
+                    # Special handling for different data types
+                    if pd.api.types.is_numeric_dtype(orig_type):
+                        try:
+                            numeric_col = pd.to_numeric(
+                                df_raw[col].str.replace(',', '').replace('NULL', np.nan)
+                            )
+                            df_typed[col] = numeric_col
+                        except:
+                            df_typed[col] = df_raw[col]
+                    elif pd.api.types.is_datetime64_dtype(orig_type):
+                        try:
+                            df_typed[col] = pd.to_datetime(df_raw[col].replace('NULL', np.nan))
+                        except:
+                            df_typed[col] = df_raw[col]
+                    elif pd.api.types.is_bool_dtype(orig_type):
+                        try:
+                            df_typed[col] = df_raw[col].map({'True': True, 'False': False}).replace('NULL', np.nan)
+                        except:
+                            df_typed[col] = df_raw[col]
+                    else:
+                        df_typed[col] = df_raw[col]
+                else:
+                    df_typed[col] = df_raw[col]
+
+            return df_typed
+
+        else:
+            # If we don't have the original dataframe, try to infer types
+            df_raw.replace('NULL', np.nan, inplace=True)
+
+            for col in df_raw.columns:
+                try:
+                    df_raw[col] = pd.to_numeric(df_raw[col].str.replace(',', ''))
+                except:
+                    try:
+                        df_raw[col] = pd.to_datetime(df_raw[col])
+                    except:
+                        try:
+                            if df_raw[col].dropna().isin(['True', 'False']).all():
+                                df_raw[col] = df_raw[col].map({'True': True, 'False': False})
+                        except:
+                            pass
+
+            return df_raw
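Taken together, the new ExportManager writes the current result set to disk and immediately re-registers it as a queryable table. A hedged usage sketch, assuming a DatabaseManager instance that exposes the sanitize_table_name/register_dataframe/loaded_tables/table_columns attributes the class relies on (constructor arguments are not shown in this diff and may differ):

import pandas as pd

from sqlshell.db.database_manager import DatabaseManager
from sqlshell.db.export_manager import ExportManager

# Assumption: DatabaseManager() can be created without arguments here;
# adjust to whatever constructor your installation actually uses.
db_manager = DatabaseManager()
exporter = ExportManager(db_manager)

results = pd.DataFrame({"region": ["EU", "US"], "revenue": [1200.5, 980.0]})

# Write the results to Parquet and register them as a new table.
table_name, info = exporter.export_to_parquet(results, "query_results.parquet")
print(table_name)           # e.g. 'query_results'
print(info["row_count"])    # 2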