sqlshell-0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. sqlshell/__init__.py +84 -0
  2. sqlshell/__main__.py +4926 -0
  3. sqlshell/ai_autocomplete.py +392 -0
  4. sqlshell/ai_settings_dialog.py +337 -0
  5. sqlshell/context_suggester.py +768 -0
  6. sqlshell/create_test_data.py +152 -0
  7. sqlshell/data/create_test_data.py +137 -0
  8. sqlshell/db/__init__.py +6 -0
  9. sqlshell/db/database_manager.py +1318 -0
  10. sqlshell/db/export_manager.py +188 -0
  11. sqlshell/editor.py +1166 -0
  12. sqlshell/editor_integration.py +127 -0
  13. sqlshell/execution_handler.py +421 -0
  14. sqlshell/menus.py +262 -0
  15. sqlshell/notification_manager.py +370 -0
  16. sqlshell/query_tab.py +904 -0
  17. sqlshell/resources/__init__.py +1 -0
  18. sqlshell/resources/icon.png +0 -0
  19. sqlshell/resources/logo_large.png +0 -0
  20. sqlshell/resources/logo_medium.png +0 -0
  21. sqlshell/resources/logo_small.png +0 -0
  22. sqlshell/resources/splash_screen.gif +0 -0
  23. sqlshell/space_invaders.py +501 -0
  24. sqlshell/splash_screen.py +405 -0
  25. sqlshell/sqlshell/__init__.py +5 -0
  26. sqlshell/sqlshell/create_test_data.py +118 -0
  27. sqlshell/sqlshell/create_test_databases.py +96 -0
  28. sqlshell/sqlshell_demo.png +0 -0
  29. sqlshell/styles.py +257 -0
  30. sqlshell/suggester_integration.py +330 -0
  31. sqlshell/syntax_highlighter.py +124 -0
  32. sqlshell/table_list.py +996 -0
  33. sqlshell/ui/__init__.py +6 -0
  34. sqlshell/ui/bar_chart_delegate.py +49 -0
  35. sqlshell/ui/filter_header.py +469 -0
  36. sqlshell/utils/__init__.py +16 -0
  37. sqlshell/utils/profile_cn2.py +1661 -0
  38. sqlshell/utils/profile_column.py +2635 -0
  39. sqlshell/utils/profile_distributions.py +616 -0
  40. sqlshell/utils/profile_entropy.py +347 -0
  41. sqlshell/utils/profile_foreign_keys.py +779 -0
  42. sqlshell/utils/profile_keys.py +2834 -0
  43. sqlshell/utils/profile_ohe.py +934 -0
  44. sqlshell/utils/profile_ohe_advanced.py +754 -0
  45. sqlshell/utils/profile_ohe_comparison.py +237 -0
  46. sqlshell/utils/profile_prediction.py +926 -0
  47. sqlshell/utils/profile_similarity.py +876 -0
  48. sqlshell/utils/search_in_df.py +90 -0
  49. sqlshell/widgets.py +400 -0
  50. sqlshell-0.4.4.dist-info/METADATA +441 -0
  51. sqlshell-0.4.4.dist-info/RECORD +54 -0
  52. sqlshell-0.4.4.dist-info/WHEEL +5 -0
  53. sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
  54. sqlshell-0.4.4.dist-info/top_level.txt +1 -0
sqlshell/db/database_manager.py
@@ -0,0 +1,1318 @@
1
+ import os
2
+ import sqlite3
3
+ import pandas as pd
4
+ import duckdb
5
+ from pathlib import Path
6
+
7
+ class DatabaseManager:
8
+ """
9
+ Manages database connections and operations for SQLShell.
10
+ Uses an in-memory DuckDB as the primary connection and can attach external
11
+ SQLite and DuckDB databases for querying alongside loaded files.
12
+ """
13
+
14
+ def __init__(self):
15
+ """Initialize the database manager with an in-memory DuckDB connection."""
16
+ self.conn = None
17
+ self.connection_type = 'duckdb'
18
+ self.loaded_tables = {} # Maps table_name to file_path or 'database:alias'/'query_result'
19
+ self.table_columns = {} # Maps table_name to list of column names
20
+ self.database_path = None # Track the path to the primary attached database (for display)
21
+ self.attached_databases = {} # Maps alias to {'path': path, 'type': 'sqlite'/'duckdb', 'tables': []}
22
+ self._sqlite_scanner_loaded = False
23
+
24
+ # Initialize the in-memory DuckDB connection
25
+ self._init_connection()
26
+
27
+ def _init_connection(self):
28
+ """Initialize the in-memory DuckDB connection."""
29
+ self.conn = duckdb.connect(':memory:')
30
+ self.connection_type = 'duckdb'
31
+
32
+ def _ensure_sqlite_scanner(self):
33
+ """Load the sqlite_scanner extension if not already loaded."""
34
+ if not self._sqlite_scanner_loaded:
35
+ try:
36
+ self.conn.execute("INSTALL sqlite_scanner")
37
+ self.conn.execute("LOAD sqlite_scanner")
38
+ self._sqlite_scanner_loaded = True
39
+ except Exception as e:
40
+ raise Exception(f"Failed to load sqlite_scanner extension: {str(e)}")
41
+
42
+ def is_connected(self):
43
+ """Check if there is an active database connection."""
44
+ return self.conn is not None
45
+
46
+ def get_connection_info(self):
47
+ """Get information about the current connection."""
48
+ if not self.is_connected():
49
+ return "No database connected"
50
+
51
+ info_parts = ["In-memory DuckDB"]
52
+
53
+ if self.attached_databases:
54
+ db_info = []
55
+ for alias, db_data in self.attached_databases.items():
56
+ db_type = db_data['type'].upper()
57
+ db_info.append(f"{alias} ({db_type})")
58
+ info_parts.append(f"Attached: {', '.join(db_info)}")
59
+
60
+ return " | ".join(info_parts)
61
+
62
+ def close_connection(self):
63
+ """Close the current database connection if one exists."""
64
+ if self.conn:
65
+ try:
66
+ # Detach all attached databases first
67
+ for alias in list(self.attached_databases.keys()):
68
+ try:
69
+ self.conn.execute(f"DETACH {alias}")
70
+ except Exception:
71
+ pass
72
+ self.conn.close()
73
+ except Exception:
74
+ pass # Ignore errors when closing
75
+ finally:
76
+ self.conn = None
77
+ self.connection_type = None
78
+ self.database_path = None
79
+ self.attached_databases = {}
80
+ self._sqlite_scanner_loaded = False
81
+
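As a point of reference, a minimal usage sketch for the connection-management methods above (illustrative only, not part of the package; the import path is assumed from the file layout listed earlier):

    from sqlshell.db.database_manager import DatabaseManager

    manager = DatabaseManager()           # opens an in-memory DuckDB connection
    print(manager.is_connected())         # True
    print(manager.get_connection_info())  # "In-memory DuckDB"
    manager.close_connection()            # detaches any attached databases and drops the connection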
82
+ def open_database(self, filename, load_all_tables=True):
83
+ """
84
+ Attach a database file to the in-memory connection.
85
+ Detects whether it's a SQLite or DuckDB database.
86
+ This preserves any existing loaded files/tables.
87
+
88
+ Args:
89
+ filename: Path to the database file
90
+ load_all_tables: Whether to automatically load all tables from the database
91
+
92
+ Returns:
93
+ True if successful, False otherwise
94
+
95
+ Raises:
96
+ Exception: If there's an error opening the database
97
+ """
98
+ # Ensure we have a connection
99
+ if not self.is_connected():
100
+ self._init_connection()
101
+
102
+ # First, detach any existing database with the same alias and remove its tables
103
+ if 'db' in self.attached_databases:
104
+ self.detach_database('db')
105
+
106
+ abs_path = os.path.abspath(filename)
107
+
108
+ try:
109
+ if self.is_sqlite_db(filename):
110
+ # Attach SQLite database using sqlite_scanner
111
+ self._ensure_sqlite_scanner()
112
+ self.conn.execute(f"ATTACH '{abs_path}' AS db (TYPE SQLITE, READ_ONLY)")
113
+ db_type = 'sqlite'
114
+ else:
115
+ # Attach DuckDB database in read-only mode
116
+ self.conn.execute(f"ATTACH '{abs_path}' AS db (READ_ONLY)")
117
+ db_type = 'duckdb'
118
+
119
+ # Store the database path for display
120
+ self.database_path = abs_path
121
+
122
+ # Track this attached database
123
+ self.attached_databases['db'] = {
124
+ 'path': abs_path,
125
+ 'type': db_type,
126
+ 'tables': []
127
+ }
128
+
129
+ # Load tables from the database if requested
130
+ if load_all_tables:
131
+ self._load_attached_database_tables('db')
132
+
133
+ return True
134
+
135
+ except Exception as e:
136
+ raise Exception(f"Failed to open database: {str(e)}")
137
+
138
+ def _load_attached_database_tables(self, alias):
139
+ """
140
+ Load all tables from an attached database.
141
+
142
+ Args:
143
+ alias: The alias of the attached database
144
+
145
+ Returns:
146
+ A list of table names loaded
147
+ """
148
+ if alias not in self.attached_databases:
149
+ return []
150
+
151
+ try:
152
+ table_names = []
153
+
154
+ # Query for tables in the attached database using duckdb_tables()
155
+ # This works for attached databases unlike information_schema.tables
156
+ query = f"SELECT table_name FROM duckdb_tables() WHERE database_name='{alias}'"
157
+ result = self.conn.execute(query).fetchdf()
158
+
159
+ for table_name in result['table_name']:
160
+ # Store with 'database:alias' as source
161
+ self.loaded_tables[table_name] = f'database:{alias}'
162
+ table_names.append(table_name)
163
+
164
+ # Get column names for each table using duckdb_columns()
165
+ try:
166
+ column_query = f"SELECT column_name FROM duckdb_columns() WHERE database_name='{alias}' AND table_name='{table_name}'"
167
+ columns = self.conn.execute(column_query).fetchdf()
168
+ self.table_columns[table_name] = columns['column_name'].tolist()
169
+ except Exception:
170
+ self.table_columns[table_name] = []
171
+
172
+ # Track which tables came from this database
173
+ self.attached_databases[alias]['tables'] = table_names
174
+
175
+ return table_names
176
+
177
+ except Exception as e:
178
+ raise Exception(f'Error loading tables from {alias}: {str(e)}')
179
+
180
+ def detach_database(self, alias):
181
+ """
182
+ Detach a database and remove its tables from tracking.
183
+
184
+ Args:
185
+ alias: The alias of the database to detach
186
+ """
187
+ if alias not in self.attached_databases:
188
+ return
189
+
190
+ # Remove all tables that came from this database
191
+ tables_to_remove = self.attached_databases[alias].get('tables', [])
192
+ for table_name in tables_to_remove:
193
+ if table_name in self.loaded_tables:
194
+ del self.loaded_tables[table_name]
195
+ if table_name in self.table_columns:
196
+ del self.table_columns[table_name]
197
+
198
+ # Detach the database
199
+ try:
200
+ self.conn.execute(f"DETACH {alias}")
201
+ except Exception:
202
+ pass
203
+
204
+ # Remove from tracking
205
+ del self.attached_databases[alias]
206
+
207
+ # Clear database_path if this was the main database
208
+ if alias == 'db':
209
+ self.database_path = None
210
+
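A sketch of attaching and later detaching an external database file with open_database/detach_database (hypothetical file path; illustrative only):

    manager = DatabaseManager()
    manager.open_database('/path/to/sales.db', load_all_tables=True)   # attached under alias 'db'
    print(manager.loaded_tables)           # e.g. {'customers': 'database:db', 'orders': 'database:db'}
    print(manager.get_connection_info())   # e.g. "In-memory DuckDB | Attached: db (SQLITE)"
    manager.detach_database('db')          # removes the attachment and its tracked tables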
211
+ def create_memory_connection(self):
212
+ """Create/reset the in-memory DuckDB connection, preserving nothing."""
213
+ self.close_connection()
214
+ self._init_connection()
215
+ self.loaded_tables = {}
216
+ self.table_columns = {}
217
+ return "Connected to: in-memory DuckDB"
218
+
219
+ def is_sqlite_db(self, filename):
220
+ """
221
+ Check if the file is a SQLite database by examining its header.
222
+
223
+ Args:
224
+ filename: Path to the database file
225
+
226
+ Returns:
227
+ Boolean indicating if the file is a SQLite database
228
+ """
229
+ try:
230
+ with open(filename, 'rb') as f:
231
+ header = f.read(16)
232
+ return header[:16] == b'SQLite format 3\x00'
233
+ except Exception:
234
+ return False
235
+
236
+ def load_database_tables(self):
237
+ """
238
+ Load all tables from the attached database (alias 'db').
239
+ This is a convenience method that calls _load_attached_database_tables.
240
+
241
+ Returns:
242
+ A list of table names loaded
243
+ """
244
+ if 'db' in self.attached_databases:
245
+ return self._load_attached_database_tables('db')
246
+ return []
247
+
248
+ def execute_query(self, query):
249
+ """
250
+ Execute a SQL query against the current database connection.
251
+ Tables from attached databases are automatically qualified with their alias.
252
+
253
+ Args:
254
+ query: SQL query string to execute
255
+
256
+ Returns:
257
+ Pandas DataFrame with the query results
258
+
259
+ Raises:
260
+ Exception: If there's an error executing the query
261
+ """
262
+ if not query.strip():
263
+ raise ValueError("Empty query")
264
+
265
+ if not self.is_connected():
266
+ self._init_connection()
267
+
268
+ try:
269
+ # Preprocess query to qualify table names from attached databases
270
+ processed_query = self._qualify_table_names(query)
271
+ result = self.conn.execute(processed_query).fetchdf()
272
+ return result
273
+
274
+ except duckdb.Error as e:
275
+ error_msg = str(e).lower()
276
+ if "syntax error" in error_msg:
277
+ raise SyntaxError(f"SQL syntax error: {str(e)}")
278
+ elif "does not exist" in error_msg or "not found" in error_msg:
279
+ # Extract the table name from the error message when possible
280
+ import re
281
+ table_match = re.search(r"Table[^']*'([^']+)'|\"([^\"]+)\"", str(e), re.IGNORECASE)
282
+ table_name = (table_match.group(1) or table_match.group(2)) if table_match else "unknown"
283
+
284
+ # Check if this table is in our loaded_tables dict but came from a database
285
+ source = self.loaded_tables.get(table_name, '')
286
+ if source.startswith('database:'):
287
+ raise ValueError(f"Table '{table_name}' was part of a database but is not accessible. "
288
+ f"Please reconnect to the original database using the 'Open Database' button.")
289
+ else:
290
+ raise ValueError(f"Table not found: {str(e)}")
291
+ elif "no such column" in error_msg or "column" in error_msg and "not found" in error_msg:
292
+ raise ValueError(f"Column not found: {str(e)}")
293
+ else:
294
+ raise Exception(f"Database error: {str(e)}")
295
+
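A hedged example of calling execute_query and handling the error types it raises, following the error mapping above (table and column names are hypothetical; continuing the earlier sketch):

    try:
        df = manager.execute_query("SELECT customer_id, COUNT(*) AS n FROM orders GROUP BY customer_id")
        print(df.head())
    except SyntaxError as e:   # SQL syntax errors
        print(f"Bad SQL: {e}")
    except ValueError as e:    # empty query, missing table, or missing column
        print(f"Query problem: {e}")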
296
+ def _qualify_table_names(self, query):
297
+ """
298
+ Qualify unqualified table names in the query with their database alias.
299
+ This allows users to write 'SELECT * FROM customers' instead of 'SELECT * FROM db.customers'.
300
+
301
+ Args:
302
+ query: The SQL query to process
303
+
304
+ Returns:
305
+ The processed query with qualified table names
306
+ """
307
+ import re
308
+
309
+ # Build a mapping of table names to their qualified names
310
+ table_qualifications = {}
311
+ for table_name, source in self.loaded_tables.items():
312
+ if source.startswith('database:'):
313
+ alias = source.split(':')[1]
314
+ table_qualifications[table_name.lower()] = f"{alias}.{table_name}"
315
+
316
+ if not table_qualifications:
317
+ return query
318
+
319
+ # Pattern to match table names in common SQL contexts
320
+ # This is a simplified approach - handles most common cases
321
+ # Look for: FROM table, JOIN table, INTO table, UPDATE table
322
+ def replace_table(match):
323
+ keyword = match.group(1)
324
+ table = match.group(2)
325
+ rest = match.group(3) if match.lastindex >= 3 else ''
326
+
327
+ # Don't replace if already qualified (contains a dot)
328
+ if '.' in table:
329
+ return match.group(0)
330
+
331
+ # Check if this table needs qualification
332
+ qualified = table_qualifications.get(table.lower())
333
+ if qualified:
334
+ return f"{keyword}{qualified}{rest}"
335
+ return match.group(0)
336
+
337
+ # Pattern for FROM, JOIN, INTO, UPDATE followed by table name
338
+ pattern = r'(FROM\s+|JOIN\s+|INTO\s+|UPDATE\s+)([a-zA-Z_][a-zA-Z0-9_]*)(\s|$|,|\))'
339
+ processed = re.sub(pattern, replace_table, query, flags=re.IGNORECASE)
340
+
341
+ return processed
342
+
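To make the rewriting above concrete, a small sketch of what _qualify_table_names does for a table tracked as coming from the attached database 'db' (illustrative only; it calls a private method purely for demonstration):

    mgr = DatabaseManager()
    mgr.loaded_tables['customers'] = 'database:db'   # simulate a table from an attached database
    print(mgr._qualify_table_names("SELECT * FROM customers WHERE id = 1"))
    # -> SELECT * FROM db.customers WHERE id = 1   (already-qualified names are left untouched)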
343
+ def load_file(self, file_path, table_prefix=""):
344
+ """
345
+ Load data from a file into the database.
346
+
347
+ Args:
348
+ file_path: Path to the data file (Excel, CSV, TXT, Parquet, Delta)
349
+ table_prefix: Optional prefix to prepend to the table name (e.g., "prod_")
350
+
351
+ Returns:
352
+ Tuple of (table_name, DataFrame) for the loaded data
353
+
354
+ Raises:
355
+ ValueError: If the file format is unsupported or there's an error
356
+ """
357
+ try:
358
+ # Check if this is a Delta table (folder with _delta_log)
359
+ delta_path = Path(file_path)
360
+ is_delta_table = (delta_path.is_dir() and
361
+ (delta_path / '_delta_log').exists()) or file_path.endswith('.delta')
362
+
363
+ # Read the file into a DataFrame, using optimized loading strategies
364
+ if is_delta_table:
365
+ # Read as Delta table using deltalake library
366
+ try:
367
+ # Load the Delta table
368
+ import deltalake
369
+ delta_table = deltalake.DeltaTable(file_path)
370
+
371
+ # Get the schema to identify decimal columns
372
+ schema = delta_table.schema()
373
+ decimal_columns = []
374
+
375
+ # Identify decimal columns from schema
376
+ for field in schema.fields:
377
+ # Use string representation to check for decimal
378
+ if 'decimal' in str(field.type).lower():
379
+ decimal_columns.append(field.name)
380
+
381
+ # Read the data
382
+ df = delta_table.to_pandas()
383
+
384
+ # Try to convert decimal columns to float64, warn if not possible
385
+ for col in decimal_columns:
386
+ if col in df.columns:
387
+ try:
388
+ df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
389
+ if df[col].isna().any():
390
+ print(f"Warning: Some values in column '{col}' could not be converted to float64 and are set as NaN.")
391
+ except Exception as e:
392
+ print(f"Warning: Could not convert column '{col}' to float64: {e}")
393
+ except Exception as e:
394
+ raise ValueError(f"Error loading Delta table: {str(e)}")
395
+ elif file_path.endswith(('.xlsx', '.xls')):
396
+ # Try to use a streaming approach for Excel files
397
+ try:
398
+ # For Excel files, we first check if it's a large file
399
+ # If it's large, we may want to show only a subset
400
+ excel_file = pd.ExcelFile(file_path)
401
+ sheet_name = excel_file.sheet_names[0] # Default to first sheet
402
+
403
+ # Read the first row to get column names
404
+ df_preview = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=5)
405
+
406
+ # If the file is very large, use chunksize
407
+ file_size = os.path.getsize(file_path) / (1024 * 1024) # Size in MB
408
+
409
+ if file_size > 50: # If file is larger than 50MB
410
+ # Use a limited subset for large files to avoid memory issues
411
+ df = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=100000) # Cap at 100k rows
412
+ else:
413
+ # For smaller files, read everything
414
+ df = pd.read_excel(excel_file, sheet_name=sheet_name)
415
+ except Exception:
416
+ # Fallback to standard reading method
417
+ df = pd.read_excel(file_path)
418
+ elif file_path.endswith(('.csv', '.txt')):
419
+ # For CSV and TXT files, detect separator and use chunking for large files
420
+ try:
421
+ # Check if it's a large file
422
+ file_size = os.path.getsize(file_path) / (1024 * 1024) # Size in MB
423
+
424
+ # Try multiple encodings if needed
425
+ encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
426
+
427
+ # Detect the separator automatically
428
+ def detect_separator(sample_data):
429
+ # Common separators to check
430
+ separators = [',', ';', '\t']
431
+ separator_scores = {}
432
+
433
+ # Split into lines and analyze
434
+ lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
435
+ if not lines:
436
+ return ',' # Default if no content
437
+
438
+ # Check for quoted content with separators
439
+ has_quotes = '"' in sample_data or "'" in sample_data
440
+
441
+ # If we have quoted content, use a different approach
442
+ if has_quotes:
443
+ for sep in separators:
444
+ # Look for patterns like "value";
445
+ pattern_count = 0
446
+ for line in lines:
447
+ # Count occurrences of quote + separator
448
+ double_quote_pattern = f'"{sep}'
449
+ single_quote_pattern = f"'{sep}"
450
+ pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
451
+
452
+ # If we found clear quote+separator patterns, this is likely our separator
453
+ if pattern_count > 0:
454
+ separator_scores[sep] = pattern_count
455
+
456
+ # Standard approach based on consistent column counts
457
+ if not separator_scores:
458
+ for sep in separators:
459
+ # Count consistent occurrences across lines
460
+ counts = [line.count(sep) for line in lines]
461
+ if counts and all(c > 0 for c in counts):
462
+ # Calculate consistency score: higher if all counts are the same
463
+ consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
464
+ # Score is average count * consistency
465
+ separator_scores[sep] = sum(counts) / len(counts) * consistency
466
+
467
+ # Choose the separator with the highest score
468
+ if separator_scores:
469
+ return max(separator_scores.items(), key=lambda x: x[1])[0]
470
+
471
+ # Default to comma if we couldn't determine
472
+ return ','
473
+
474
+ # First, sample the file to detect separator
475
+ with open(file_path, 'rb') as f:
476
+ # Read first few KB to detect encoding and separator
477
+ raw_sample = f.read(4096)
478
+
479
+ # Try to decode with various encodings
480
+ sample_text = None
481
+ detected_encoding = None
482
+
483
+ for encoding in encodings_to_try:
484
+ try:
485
+ sample_text = raw_sample.decode(encoding)
486
+ detected_encoding = encoding
487
+ break
488
+ except UnicodeDecodeError:
489
+ continue
490
+
491
+ if not sample_text:
492
+ raise ValueError("Could not decode file with any of the attempted encodings")
493
+
494
+ # Detect separator from the sample
495
+ separator = detect_separator(sample_text)
496
+
497
+ # Determine quote character (default to double quote)
498
+ quotechar = '"'
499
+ if sample_text.count("'") > sample_text.count('"'):
500
+ quotechar = "'"
501
+
502
+ if file_size > 50: # If file is larger than 50MB
503
+ # Read the first chunk to get column types
504
+ try:
505
+ df_preview = pd.read_csv(
506
+ file_path,
507
+ sep=separator,
508
+ nrows=1000,
509
+ encoding=detected_encoding,
510
+ engine='python' if separator != ',' else 'c',
511
+ quotechar=quotechar,
512
+ doublequote=True
513
+ )
514
+
515
+ # Use optimized dtypes for better memory usage
516
+ dtypes = {col: df_preview[col].dtype for col in df_preview.columns}
517
+
518
+ # Read again with chunk processing, combining up to 100k rows
519
+ chunks = []
520
+ for chunk in pd.read_csv(
521
+ file_path,
522
+ sep=separator,
523
+ dtype=dtypes,
524
+ chunksize=10000,
525
+ encoding=detected_encoding,
526
+ engine='python' if separator != ',' else 'c',
527
+ quotechar=quotechar,
528
+ doublequote=True
529
+ ):
530
+ chunks.append(chunk)
531
+ if len(chunks) * 10000 >= 100000: # Cap at 100k rows
532
+ break
533
+
534
+ df = pd.concat(chunks, ignore_index=True)
535
+ except pd.errors.ParserError as e:
536
+ # If parsing fails, try again with error recovery options
537
+ print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
538
+
539
+ # Try with Python engine which is more flexible
540
+ try:
541
+ # First try with pandas >= 1.3 parameters
542
+ df = pd.read_csv(
543
+ file_path,
544
+ sep=separator,
545
+ encoding=detected_encoding,
546
+ engine='python', # Always use python engine for error recovery
547
+ quotechar=quotechar,
548
+ doublequote=True,
549
+ on_bad_lines='warn', # New parameter in pandas >= 1.3
550
+ na_values=[''],
551
+ keep_default_na=True
552
+ )
553
+ except TypeError:
554
+ # Fall back to pandas < 1.3 parameters
555
+ df = pd.read_csv(
556
+ file_path,
557
+ sep=separator,
558
+ encoding=detected_encoding,
559
+ engine='python',
560
+ quotechar=quotechar,
561
+ doublequote=True,
562
+ error_bad_lines=False, # Old parameter
563
+ warn_bad_lines=True, # Old parameter
564
+ na_values=[''],
565
+ keep_default_na=True
566
+ )
567
+ else:
568
+ # For smaller files, read everything at once
569
+ try:
570
+ df = pd.read_csv(
571
+ file_path,
572
+ sep=separator,
573
+ encoding=detected_encoding,
574
+ engine='python' if separator != ',' else 'c',
575
+ quotechar=quotechar,
576
+ doublequote=True
577
+ )
578
+ except pd.errors.ParserError as e:
579
+ # If parsing fails, try again with error recovery options
580
+ print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
581
+
582
+ # Try with Python engine which is more flexible
583
+ try:
584
+ # First try with pandas >= 1.3 parameters
585
+ df = pd.read_csv(
586
+ file_path,
587
+ sep=separator,
588
+ encoding=detected_encoding,
589
+ engine='python', # Always use python engine for error recovery
590
+ quotechar=quotechar,
591
+ doublequote=True,
592
+ on_bad_lines='warn', # New parameter in pandas >= 1.3
593
+ na_values=[''],
594
+ keep_default_na=True
595
+ )
596
+ except TypeError:
597
+ # Fall back to pandas < 1.3 parameters
598
+ df = pd.read_csv(
599
+ file_path,
600
+ sep=separator,
601
+ encoding=detected_encoding,
602
+ engine='python',
603
+ quotechar=quotechar,
604
+ doublequote=True,
605
+ error_bad_lines=False, # Old parameter
606
+ warn_bad_lines=True, # Old parameter
607
+ na_values=[''],
608
+ keep_default_na=True
609
+ )
610
+ except Exception as e:
611
+ # Log the error for debugging
612
+ import traceback
613
+ print(f"Error loading CSV/TXT file: {str(e)}")
614
+ print(traceback.format_exc())
615
+ raise ValueError(f"Error loading CSV/TXT file: {str(e)}")
616
+ elif file_path.endswith('.parquet'):
617
+ # Use fastparquet engine (lighter than pyarrow - saves 147MB in builds)
618
+ df = pd.read_parquet(file_path, engine='fastparquet')
619
+ else:
620
+ raise ValueError("Unsupported file format. Supported formats: .xlsx, .xls, .csv, .txt, .parquet, and Delta tables.")
621
+
622
+ # Generate table name from file name
623
+ base_name = os.path.splitext(os.path.basename(file_path))[0]
624
+
625
+ # For directories like Delta tables, use the directory name
626
+ if os.path.isdir(file_path):
627
+ base_name = os.path.basename(file_path)
628
+
629
+ # Apply prefix if provided
630
+ if table_prefix:
631
+ base_name = f"{table_prefix}{base_name}"
632
+
633
+ table_name = self.sanitize_table_name(base_name)
634
+
635
+ # Ensure unique table name
636
+ original_name = table_name
637
+ counter = 1
638
+ while table_name in self.loaded_tables:
639
+ table_name = f"{original_name}_{counter}"
640
+ counter += 1
641
+
642
+ # Ensure we have a connection (always in-memory DuckDB)
643
+ if not self.is_connected():
644
+ self._init_connection()
645
+
646
+ # Register the DataFrame as a view in DuckDB
647
+ # This preserves any attached databases and their tables
648
+ self.conn.register(table_name, df)
649
+
650
+ # Store information about the table
651
+ self.loaded_tables[table_name] = file_path
652
+ self.table_columns[table_name] = [str(col) for col in df.columns.tolist()]
653
+
654
+ return table_name, df
655
+
656
+ except MemoryError:
657
+ raise ValueError("Not enough memory to load this file. Try using a smaller file or increasing available memory.")
658
+ except Exception as e:
659
+ raise ValueError(f"Error loading file: {str(e)}")
660
+
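A sketch of loading a local file and querying the resulting view (hypothetical path and prefix; illustrative only):

    manager = DatabaseManager()
    table_name, df = manager.load_file('/data/orders.csv', table_prefix='raw_')
    print(table_name, len(df))       # e.g. 'raw_orders' and the number of rows loaded
    result = manager.execute_query(f"SELECT COUNT(*) AS n FROM {table_name}")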
661
+ def remove_table(self, table_name):
662
+ """
663
+ Remove a table from the database.
664
+
665
+ Args:
666
+ table_name: Name of the table to remove
667
+
668
+ Returns:
669
+ Boolean indicating success
670
+ """
671
+ if not table_name in self.loaded_tables:
672
+ return False
673
+
674
+ try:
675
+ source = self.loaded_tables[table_name]
676
+
677
+ # For file-based tables (registered DataFrames), drop the view
678
+ if not source.startswith('database:'):
679
+ self.conn.execute(f'DROP VIEW IF EXISTS {table_name}')
680
+ else:
681
+ # For database tables, we just remove from tracking
682
+ # The actual table remains in the attached database
683
+ # Also remove from the attached database's table list
684
+ alias = source.split(':')[1]
685
+ if alias in self.attached_databases:
686
+ tables = self.attached_databases[alias].get('tables', [])
687
+ if table_name in tables:
688
+ tables.remove(table_name)
689
+
690
+ # Remove from tracking
691
+ del self.loaded_tables[table_name]
692
+ if table_name in self.table_columns:
693
+ del self.table_columns[table_name]
694
+
695
+ return True
696
+ except Exception:
697
+ return False
698
+
699
+ def remove_multiple_tables(self, table_names):
700
+ """
701
+ Remove multiple tables from the database.
702
+
703
+ Args:
704
+ table_names: List of table names to remove
705
+
706
+ Returns:
707
+ Tuple of (successful_removals, failed_removals) as lists of table names
708
+ """
709
+ successful_removals = []
710
+ failed_removals = []
711
+
712
+ for table_name in table_names:
713
+ if self.remove_table(table_name):
714
+ successful_removals.append(table_name)
715
+ else:
716
+ failed_removals.append(table_name)
717
+
718
+ return successful_removals, failed_removals
719
+
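For completeness, a short sketch of bulk removal with remove_multiple_tables (names are hypothetical; continuing the earlier sketch):

    removed, failed = manager.remove_multiple_tables(['raw_orders', 'no_such_table'])
    # removed -> ['raw_orders'], failed -> ['no_such_table']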
720
+ def get_table_preview(self, table_name, limit=5):
721
+ """
722
+ Get a preview of the data in a table.
723
+
724
+ Args:
725
+ table_name: Name of the table to preview
726
+ limit: Number of rows to preview
727
+
728
+ Returns:
729
+ Pandas DataFrame with the preview data
730
+ """
731
+ if not table_name in self.loaded_tables:
732
+ raise ValueError(f"Table '{table_name}' not found")
733
+
734
+ try:
735
+ source = self.loaded_tables[table_name]
736
+
737
+ # For database tables, use the qualified name
738
+ if source.startswith('database:'):
739
+ alias = source.split(':')[1]
740
+ return self.conn.execute(f'SELECT * FROM {alias}.{table_name} LIMIT {limit}').fetchdf()
741
+ else:
742
+ # For file-based tables (registered views)
743
+ return self.conn.execute(f'SELECT * FROM {table_name} LIMIT {limit}').fetchdf()
744
+ except Exception as e:
745
+ raise Exception(f"Error previewing table: {str(e)}")
746
+
747
+ def get_full_table(self, table_name):
748
+ """
749
+ Get all data from a table (no row limit).
750
+
751
+ Args:
752
+ table_name: Name of the table to retrieve
753
+
754
+ Returns:
755
+ Pandas DataFrame with all the table data
756
+ """
757
+ if not table_name in self.loaded_tables:
758
+ raise ValueError(f"Table '{table_name}' not found")
759
+
760
+ try:
761
+ source = self.loaded_tables[table_name]
762
+
763
+ # For database tables, use the qualified name
764
+ if source.startswith('database:'):
765
+ alias = source.split(':')[1]
766
+ return self.conn.execute(f'SELECT * FROM {alias}.{table_name}').fetchdf()
767
+ else:
768
+ # For file-based tables (registered views)
769
+ return self.conn.execute(f'SELECT * FROM {table_name}').fetchdf()
770
+ except Exception as e:
771
+ raise Exception(f"Error getting table data: {str(e)}")
772
+
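A short sketch of the two retrieval helpers above (table name is hypothetical; continuing the earlier sketch):

    preview = manager.get_table_preview('raw_orders', limit=5)   # first 5 rows as a DataFrame
    full = manager.get_full_table('raw_orders')                  # the entire table, no row limit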
773
+ def reload_table(self, table_name):
774
+ """
775
+ Reload a table's data from its source file.
776
+
777
+ Args:
778
+ table_name: Name of the table to reload
779
+
780
+ Returns:
781
+ Tuple of (bool, message) indicating success/failure and a message
782
+
783
+ Raises:
784
+ ValueError: If the table cannot be reloaded
785
+ """
786
+ if not table_name in self.loaded_tables:
787
+ return False, f"Table '{table_name}' not found"
788
+
789
+ file_path = self.loaded_tables[table_name]
790
+
791
+ # Check if this is a file-based table
792
+ if file_path in ('database', 'query_result') or file_path.startswith('database:'):
793
+ return False, f"Cannot reload '{table_name}' because it's not a file-based table"
794
+
795
+ try:
796
+ # Check if the file still exists
797
+ if not os.path.exists(file_path):
798
+ return False, f"Source file '{file_path}' no longer exists"
799
+
800
+ # Store the original table name
801
+ original_name = table_name
802
+
803
+ # Remove the existing table
804
+ self.remove_table(table_name)
805
+
806
+ # Check if this is a Delta table
807
+ delta_path = Path(file_path)
808
+ is_delta_table = (delta_path.is_dir() and
809
+ (delta_path / '_delta_log').exists()) or file_path.endswith('.delta')
810
+
811
+ # Load the file with the original table name
812
+ df = None
813
+ if is_delta_table:
814
+ # Read as Delta table
815
+ import deltalake
816
+ delta_table = deltalake.DeltaTable(file_path)
817
+ df = delta_table.to_pandas()
818
+ elif file_path.endswith(('.xlsx', '.xls')):
819
+ df = pd.read_excel(file_path)
820
+ elif file_path.endswith(('.csv', '.txt')):
821
+ # Try multiple encodings for CSV/TXT files
822
+ encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
823
+
824
+ # Detect the separator automatically
825
+ def detect_separator(sample_data):
826
+ # Common separators to check
827
+ separators = [',', ';', '\t']
828
+ separator_scores = {}
829
+
830
+ # Split into lines and analyze
831
+ lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
832
+ if not lines:
833
+ return ',' # Default if no content
834
+
835
+ # Check for quoted content with separators
836
+ has_quotes = '"' in sample_data or "'" in sample_data
837
+
838
+ # If we have quoted content, use a different approach
839
+ if has_quotes:
840
+ for sep in separators:
841
+ # Look for patterns like "value";
842
+ pattern_count = 0
843
+ for line in lines:
844
+ # Count occurrences of quote + separator
845
+ double_quote_pattern = f'"{sep}'
846
+ single_quote_pattern = f"'{sep}"
847
+ pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
848
+
849
+ # If we found clear quote+separator patterns, this is likely our separator
850
+ if pattern_count > 0:
851
+ separator_scores[sep] = pattern_count
852
+
853
+ # Standard approach based on consistent column counts
854
+ if not separator_scores:
855
+ for sep in separators:
856
+ # Count consistent occurrences across lines
857
+ counts = [line.count(sep) for line in lines]
858
+ if counts and all(c > 0 for c in counts):
859
+ # Calculate consistency score: higher if all counts are the same
860
+ consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
861
+ # Score is average count * consistency
862
+ separator_scores[sep] = sum(counts) / len(counts) * consistency
863
+
864
+ # Choose the separator with the highest score
865
+ if separator_scores:
866
+ return max(separator_scores.items(), key=lambda x: x[1])[0]
867
+
868
+ # Default to comma if we couldn't determine
869
+ return ','
870
+
871
+ # First, sample the file to detect separator and encoding
872
+ with open(file_path, 'rb') as f:
873
+ # Read first few KB to detect encoding and separator
874
+ raw_sample = f.read(4096)
875
+
876
+ # Try to decode with various encodings
877
+ sample_text = None
878
+ detected_encoding = None
879
+
880
+ for encoding in encodings_to_try:
881
+ try:
882
+ sample_text = raw_sample.decode(encoding)
883
+ detected_encoding = encoding
884
+ break
885
+ except UnicodeDecodeError:
886
+ # If this encoding fails, try the next one
887
+ continue
888
+
889
+ if not sample_text:
890
+ raise ValueError("Could not decode file with any of the attempted encodings")
891
+
892
+ # Detect separator from the sample
893
+ separator = detect_separator(sample_text)
894
+
895
+ # Determine quote character (default to double quote)
896
+ quotechar = '"'
897
+ if sample_text.count("'") > sample_text.count('"'):
898
+ quotechar = "'"
899
+
900
+ # Read with detected parameters
901
+ try:
902
+ df = pd.read_csv(
903
+ file_path,
904
+ sep=separator,
905
+ encoding=detected_encoding,
906
+ engine='python' if separator != ',' else 'c',
907
+ quotechar=quotechar,
908
+ doublequote=True
909
+ )
910
+ except pd.errors.ParserError as e:
911
+ # If parsing fails, try again with error recovery options
912
+ print(f"Initial parsing failed on reload: {str(e)}. Trying with error recovery options...")
913
+
914
+ # Try with Python engine which is more flexible
915
+ try:
916
+ # First try with pandas >= 1.3 parameters
917
+ df = pd.read_csv(
918
+ file_path,
919
+ sep=separator,
920
+ encoding=detected_encoding,
921
+ engine='python', # Always use python engine for error recovery
922
+ quotechar=quotechar,
923
+ doublequote=True,
924
+ on_bad_lines='warn', # New parameter in pandas >= 1.3
925
+ na_values=[''],
926
+ keep_default_na=True
927
+ )
928
+ except TypeError:
929
+ # Fall back to pandas < 1.3 parameters
930
+ df = pd.read_csv(
931
+ file_path,
932
+ sep=separator,
933
+ encoding=detected_encoding,
934
+ engine='python',
935
+ quotechar=quotechar,
936
+ doublequote=True,
937
+ error_bad_lines=False, # Old parameter
938
+ warn_bad_lines=True, # Old parameter
939
+ na_values=[''],
940
+ keep_default_na=True
941
+ )
942
+ elif file_path.endswith('.parquet'):
943
+ # Use fastparquet engine (lighter than pyarrow - saves 147MB in builds)
944
+ df = pd.read_parquet(file_path, engine='fastparquet')
945
+ else:
946
+ return False, "Unsupported file format"
947
+
948
+ # Register the dataframe with the original name
949
+ self.register_dataframe(df, original_name, file_path)
950
+
951
+ return True, f"Table '{table_name}' reloaded successfully"
952
+
953
+ except Exception as e:
954
+ return False, f"Error reloading table: {str(e)}"
955
+
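A sketch of refreshing a file-backed table from its source file with reload_table (illustrative only; continuing the earlier sketch):

    ok, message = manager.reload_table('raw_orders')
    print(ok, message)   # e.g. True, "Table 'raw_orders' reloaded successfully"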
956
+ def rename_table(self, old_name, new_name):
957
+ """
958
+ Rename a table in the database.
959
+ Only file-based tables can be renamed; database tables are read-only.
960
+
961
+ Args:
962
+ old_name: Current name of the table
963
+ new_name: New name for the table
964
+
965
+ Returns:
966
+ Boolean indicating success
967
+ """
968
+ if not old_name in self.loaded_tables:
969
+ return False
970
+
971
+ source = self.loaded_tables[old_name]
972
+
973
+ # Database tables cannot be renamed (read-only)
974
+ if source.startswith('database:'):
975
+ raise ValueError(f"Cannot rename table '{old_name}' because it's from an attached database (read-only)")
976
+
977
+ try:
978
+ # Sanitize the new name
979
+ new_name = self.sanitize_table_name(new_name)
980
+
981
+ # Check if new name already exists
982
+ if new_name in self.loaded_tables and new_name != old_name:
983
+ raise ValueError(f"Table '{new_name}' already exists")
984
+
985
+ # For file-based tables (registered views in DuckDB):
986
+ # 1. Get the data from the old view
987
+ df = self.conn.execute(f'SELECT * FROM {old_name}').fetchdf()
988
+ # 2. Drop the old view
989
+ self.conn.execute(f'DROP VIEW IF EXISTS {old_name}')
990
+ # 3. Register the data under the new name
991
+ self.conn.register(new_name, df)
992
+
993
+ # Update tracking
994
+ self.loaded_tables[new_name] = self.loaded_tables.pop(old_name)
995
+ self.table_columns[new_name] = self.table_columns.pop(old_name)
996
+
997
+ return True
998
+
999
+ except Exception as e:
1000
+ raise Exception(f"Failed to rename table: {str(e)}")
1001
+
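A one-line sketch of renaming a file-backed view (attached-database tables raise ValueError, as noted above; continuing the earlier sketch):

    manager.rename_table('raw_orders', 'orders_2024')   # returns True on success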
1002
+ def sanitize_table_name(self, name):
1003
+ """
1004
+ Sanitize a table name to be valid in SQL.
1005
+
1006
+ Args:
1007
+ name: The proposed table name
1008
+
1009
+ Returns:
1010
+ A sanitized table name
1011
+ """
1012
+ import re
1013
+ name = re.sub(r'[^a-zA-Z0-9_]', '_', name)
1014
+ # Ensure it starts with a letter
1015
+ if not name or not name[0].isalpha():
1016
+ name = 'table_' + name
1017
+ return name.lower()
1018
+
1019
+ def register_dataframe(self, df, table_name, source='query_result'):
1020
+ """
1021
+ Register a DataFrame as a table in the database.
1022
+
1023
+ Args:
1024
+ df: Pandas DataFrame to register
1025
+ table_name: Name for the table
1026
+ source: Source of the data (for tracking)
1027
+
1028
+ Returns:
1029
+ The table name used (may be different if there was a conflict)
1030
+ """
1031
+ # Ensure we have a connection
1032
+ if not self.is_connected():
1033
+ self._init_connection()
1034
+
1035
+ # Sanitize and ensure unique name
1036
+ table_name = self.sanitize_table_name(table_name)
1037
+ original_name = table_name
1038
+ counter = 1
1039
+ while table_name in self.loaded_tables:
1040
+ table_name = f"{original_name}_{counter}"
1041
+ counter += 1
1042
+
1043
+ # Register the DataFrame directly in DuckDB
1044
+ self.conn.register(table_name, df)
1045
+
1046
+ # Track the table
1047
+ self.loaded_tables[table_name] = source
1048
+ self.table_columns[table_name] = [str(col) for col in df.columns.tolist()]
1049
+
1050
+ return table_name
1051
+
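A sketch of registering an arbitrary DataFrame so it can be queried alongside loaded files (illustrative only; continuing the earlier sketch):

    import pandas as pd

    scores = pd.DataFrame({'name': ['a', 'b'], 'score': [0.9, 0.4]})
    name = manager.register_dataframe(scores, 'model scores')   # sanitized to 'model_scores'
    print(manager.execute_query(f"SELECT * FROM {name} ORDER BY score DESC"))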
1052
+ def get_all_table_columns(self):
1053
+ """
1054
+ Get all table and column names for autocompletion.
1055
+
1056
+ Returns:
1057
+ List of completion words (table names and column names)
1058
+ """
1059
+ # Start with table names
1060
+ completion_words = set(self.loaded_tables.keys())
1061
+
1062
+ # Track column data types for smarter autocompletion
1063
+ column_data_types = {} # {table.column: data_type}
1064
+
1065
+ # Detect potential table relationships for JOIN suggestions
1066
+ potential_relationships = [] # [(table1, column1, table2, column2)]
1067
+
1068
+ # Add column names with and without table prefixes, up to reasonable limits
1069
+ MAX_COLUMNS_PER_TABLE = 100 # Limit columns to prevent memory issues
1070
+ MAX_TABLES_WITH_COLUMNS = 20 # Limit the number of tables to process
1071
+
1072
+ # Sort tables by name to ensure consistent behavior
1073
+ table_items = sorted(list(self.table_columns.items()))
1074
+
1075
+ # Process only a limited number of tables
1076
+ for table, columns in table_items[:MAX_TABLES_WITH_COLUMNS]:
1077
+ # Add each column name by itself
1078
+ for col in columns[:MAX_COLUMNS_PER_TABLE]:
1079
+ completion_words.add(col)
1080
+
1081
+ # Add qualified column names (table.column)
1082
+ for col in columns[:MAX_COLUMNS_PER_TABLE]:
1083
+ completion_words.add(f"{table}.{col}")
1084
+
1085
+ # Try to infer table relationships based on column naming
1086
+ self._detect_relationships(table, columns, potential_relationships)
1087
+
1088
+ # Try to infer column data types when possible
1089
+ if self.is_connected():
1090
+ try:
1091
+ self._detect_column_types(table, column_data_types)
1092
+ except Exception:
1093
+ pass
1094
+
1095
+ # Add common SQL functions and aggregations with context-aware completions
1096
+ sql_functions = [
1097
+ # Aggregation functions with completed parentheses
1098
+ "COUNT(*)", "COUNT(DISTINCT ", "SUM(", "AVG(", "MIN(", "MAX(",
1099
+
1100
+ # String functions
1101
+ "CONCAT(", "SUBSTR(", "LOWER(", "UPPER(", "TRIM(", "REPLACE(", "LENGTH(",
1102
+ "REGEXP_REPLACE(", "REGEXP_EXTRACT(", "REGEXP_MATCH(",
1103
+
1104
+ # Date/time functions
1105
+ "CURRENT_DATE", "CURRENT_TIME", "CURRENT_TIMESTAMP", "NOW()",
1106
+ "EXTRACT(", "DATE_TRUNC(", "DATE_PART(", "DATEADD(", "DATEDIFF(",
1107
+
1108
+ # Type conversion
1109
+ "CAST( AS ", "CONVERT(", "TRY_CAST( AS ", "FORMAT(",
1110
+
1111
+ # Conditional functions
1112
+ "COALESCE(", "NULLIF(", "GREATEST(", "LEAST(", "IFF(", "IFNULL(",
1113
+
1114
+ # Window functions
1115
+ "ROW_NUMBER() OVER (", "RANK() OVER (", "DENSE_RANK() OVER (",
1116
+ "LEAD( OVER (", "LAG( OVER (", "FIRST_VALUE( OVER (", "LAST_VALUE( OVER ("
1117
+ ]
1118
+
1119
+ # Add common SQL patterns with context awareness
1120
+ sql_patterns = [
1121
+ # Basic query patterns
1122
+ "SELECT * FROM ", "SELECT COUNT(*) FROM ",
1123
+ "SELECT DISTINCT ", "GROUP BY ", "ORDER BY ", "HAVING ",
1124
+ "LIMIT ", "OFFSET ", "WHERE ",
1125
+
1126
+ # JOIN patterns - complete with ON and common join points
1127
+ "INNER JOIN ", "LEFT JOIN ", "RIGHT JOIN ", "FULL OUTER JOIN ",
1128
+ "LEFT OUTER JOIN ", "RIGHT OUTER JOIN ", "CROSS JOIN ",
1129
+
1130
+ # Advanced patterns
1131
+ "WITH _ AS (", "CASE WHEN _ THEN _ ELSE _ END",
1132
+ "OVER (PARTITION BY _ ORDER BY _)",
1133
+ "EXISTS (SELECT 1 FROM _ WHERE _)",
1134
+ "NOT EXISTS (SELECT 1 FROM _ WHERE _)",
1135
+
1136
+ # Common operator patterns
1137
+ "BETWEEN _ AND _", "IN (", "NOT IN (", "IS NULL", "IS NOT NULL",
1138
+ "LIKE '%_%'", "NOT LIKE ", "ILIKE ",
1139
+
1140
+ # Data manipulation patterns
1141
+ "INSERT INTO _ VALUES (", "INSERT INTO _ (_) VALUES (_)",
1142
+ "UPDATE _ SET _ = _ WHERE _", "DELETE FROM _ WHERE _"
1143
+ ]
1144
+
1145
+ # Add table relationships as suggested JOIN patterns
1146
+ for table1, col1, table2, col2 in potential_relationships:
1147
+ join_pattern = f"JOIN {table2} ON {table1}.{col1} = {table2}.{col2}"
1148
+ completion_words.add(join_pattern)
1149
+
1150
+ # Also add the reverse relationship
1151
+ join_pattern_rev = f"JOIN {table1} ON {table2}.{col2} = {table1}.{col1}"
1152
+ completion_words.add(join_pattern_rev)
1153
+
1154
+ # Add all SQL extras to the completion words
1155
+ completion_words.update(sql_functions)
1156
+ completion_words.update(sql_patterns)
1157
+
1158
+ # Add common data-specific comparison patterns based on column types
1159
+ for col_name, data_type in column_data_types.items():
1160
+ if 'INT' in data_type.upper() or 'NUM' in data_type.upper() or 'FLOAT' in data_type.upper():
1161
+ # Numeric columns
1162
+ completion_words.add(f"{col_name} > ")
1163
+ completion_words.add(f"{col_name} < ")
1164
+ completion_words.add(f"{col_name} >= ")
1165
+ completion_words.add(f"{col_name} <= ")
1166
+ completion_words.add(f"{col_name} BETWEEN ")
1167
+ elif 'DATE' in data_type.upper() or 'TIME' in data_type.upper():
1168
+ # Date/time columns
1169
+ completion_words.add(f"{col_name} > CURRENT_DATE")
1170
+ completion_words.add(f"{col_name} < CURRENT_DATE")
1171
+ completion_words.add(f"{col_name} BETWEEN CURRENT_DATE - INTERVAL ")
1172
+ completion_words.add(f"EXTRACT(YEAR FROM {col_name})")
1173
+ completion_words.add(f"DATE_TRUNC('month', {col_name})")
1174
+ elif 'CHAR' in data_type.upper() or 'TEXT' in data_type.upper() or 'VARCHAR' in data_type.upper():
1175
+ # String columns
1176
+ completion_words.add(f"{col_name} LIKE '%")
1177
+ completion_words.add(f"{col_name} ILIKE '%")
1178
+ completion_words.add(f"LOWER({col_name}) = ")
1179
+ completion_words.add(f"UPPER({col_name}) = ")
1180
+
1181
+ # Convert set back to list and sort for better usability
1182
+ completion_list = list(completion_words)
1183
+ completion_list.sort(key=lambda x: (not x.isupper(), x)) # Prioritize SQL keywords
1184
+
1185
+ return completion_list
1186
+
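A sketch of pulling the autocompletion vocabulary built above (illustrative only; continuing the earlier sketch):

    words = manager.get_all_table_columns()
    # a mix of table names, column names, table.column pairs, SQL functions and suggested JOINs
    print([w for w in words if 'raw_orders' in w][:10])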
1187
+ def _detect_relationships(self, table, columns, potential_relationships):
1188
+ """
1189
+ Detect potential relationships between tables based on column naming patterns.
1190
+
1191
+ Args:
1192
+ table: Current table name
1193
+ columns: List of column names in this table
1194
+ potential_relationships: List to populate with detected relationships
1195
+ """
1196
+ # Look for columns that might be foreign keys (common patterns)
1197
+ for col in columns:
1198
+ # Common ID patterns: table_id, tableId, TableID, etc.
1199
+ if col.lower().endswith('_id') or col.lower().endswith('id'):
1200
+ # Extract potential table name from column name
1201
+ if col.lower().endswith('_id'):
1202
+ potential_table = col[:-3] # Remove '_id'
1203
+ else:
1204
+ # Try to extract tablename from camelCase or PascalCase
1205
+ potential_table = col[:-2] # Remove 'Id'
1206
+
1207
+ # Normalize to lowercase for comparison
1208
+ potential_table = potential_table.lower()
1209
+
1210
+ # Check if this potential table exists in our loaded tables
1211
+ for existing_table in self.loaded_tables.keys():
1212
+ # Normalize for comparison
1213
+ existing_lower = existing_table.lower()
1214
+
1215
+ # If we find a matching table, it's likely a relationship
1216
+ if existing_lower == potential_table or existing_lower.endswith(f"_{potential_table}"):
1217
+ # Add this relationship
1218
+ # We assume the target column in the referenced table is 'id'
1219
+ potential_relationships.append((table, col, existing_table, 'id'))
1220
+ break
1221
+
1222
+ # Also detect columns with same name across tables (potential join points)
1223
+ for other_table, other_columns in self.table_columns.items():
1224
+ if other_table != table and col in other_columns:
1225
+ # Same column name in different tables - potential join point
1226
+ potential_relationships.append((table, col, other_table, col))
1227
+
1228
+ def _detect_column_types(self, table, column_data_types):
1229
+ """
1230
+ Detect column data types for a table to enable smarter autocompletion.
1231
+
1232
+ Args:
1233
+ table: Table name to analyze
1234
+ column_data_types: Dictionary to populate with column data types
1235
+ """
1236
+ if not self.is_connected():
1237
+ return
1238
+
1239
+ try:
1240
+ # Determine the database to query
1241
+ source = self.loaded_tables.get(table, '')
1242
+ if source.startswith('database:'):
1243
+ db_name = source.split(':')[1]
1244
+ # Use duckdb_columns() for attached databases
1245
+ query = f"""
1246
+ SELECT column_name, data_type
1247
+ FROM duckdb_columns()
1248
+ WHERE database_name='{db_name}' AND table_name='{table}'
1249
+ """
1250
+ else:
1251
+ # For in-memory tables, use information_schema
1252
+ query = f"""
1253
+ SELECT column_name, data_type
1254
+ FROM information_schema.columns
1255
+ WHERE table_name='{table}' AND table_schema='main'
1256
+ """
1257
+
1258
+ result = self.conn.execute(query).fetchdf()
1259
+
1260
+ for _, row in result.iterrows():
1261
+ col_name = row['column_name']
1262
+ data_type = row['data_type']
1263
+
1264
+ # Store as table.column: data_type for qualified lookups
1265
+ column_data_types[f"{table}.{col_name}"] = data_type
1266
+ # Also store just column: data_type for unqualified lookups
1267
+ column_data_types[col_name] = data_type
1268
+ except Exception:
1269
+ # Ignore errors in type detection - this is just for enhancement
1270
+ pass
1271
+
1272
+ def load_specific_table(self, table_name, database_alias='db'):
1273
+ """
1274
+ Load metadata for a specific table from an attached database.
1275
+ This is used when we know which tables we want to load rather than loading all tables.
1276
+
1277
+ Args:
1278
+ table_name: Name of the table to load
1279
+ database_alias: The alias of the attached database (default: 'db')
1280
+
1281
+ Returns:
1282
+ Boolean indicating if the table was found and loaded
1283
+ """
1284
+ if not self.is_connected():
1285
+ return False
1286
+
1287
+ if database_alias not in self.attached_databases:
1288
+ return False
1289
+
1290
+ try:
1291
+ # Check if the table exists in the attached database using duckdb_tables()
1292
+ query = f"SELECT table_name FROM duckdb_tables() WHERE table_name='{table_name}' AND database_name='{database_alias}'"
1293
+ result = self.conn.execute(query).fetchdf()
1294
+
1295
+ if not result.empty:
1296
+ # Get column names for the table using duckdb_columns()
1297
+ try:
1298
+ column_query = f"SELECT column_name FROM duckdb_columns() WHERE table_name='{table_name}' AND database_name='{database_alias}'"
1299
+ columns = self.conn.execute(column_query).fetchdf()
1300
+ self.table_columns[table_name] = columns['column_name'].tolist()
1301
+ except Exception:
1302
+ self.table_columns[table_name] = []
1303
+
1304
+ # Register the table
1305
+ self.loaded_tables[table_name] = f'database:{database_alias}'
1306
+
1307
+ # Add to the database's table list
1308
+ if 'tables' not in self.attached_databases[database_alias]:
1309
+ self.attached_databases[database_alias]['tables'] = []
1310
+ if table_name not in self.attached_databases[database_alias]['tables']:
1311
+ self.attached_databases[database_alias]['tables'].append(table_name)
1312
+
1313
+ return True
1314
+
1315
+ return False
1316
+
1317
+ except Exception:
1318
+ return False
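Finally, a sketch of attaching a database without importing every table and then registering a single table by name with load_specific_table (hypothetical path and table name; illustrative only):

    manager = DatabaseManager()
    manager.open_database('/path/to/warehouse.duckdb', load_all_tables=False)
    if manager.load_specific_table('orders'):
        print(manager.get_table_preview('orders', limit=3))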