sqlshell 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sqlshell has been flagged as potentially problematic; consult the linked security advisory for details.

@@ -0,0 +1,837 @@
1
+ import os
2
+ import sqlite3
3
+ import pandas as pd
4
+ import duckdb
5
+ from pathlib import Path
6
+
7
class DatabaseManager:
    """
    Manages database connections and operations for SQLShell.

    Supports both SQLite and DuckDB backends behind a single interface,
    and tracks which tables have been loaded and where each came from
    (a source file, the opened database, or a query result).
    """
12
+
13
+ def __init__(self):
14
+ """Initialize the database manager with no active connection."""
15
+ self.conn = None
16
+ self.connection_type = None
17
+ self.loaded_tables = {} # Maps table_name to file_path or 'database'/'query_result'
18
+ self.table_columns = {} # Maps table_name to list of column names
19
+ self.database_path = None # Track the path to the current database file
20
+
21
+ def is_connected(self):
22
+ """Check if there is an active database connection."""
23
+ return self.conn is not None
24
+
25
+ def get_connection_info(self):
26
+ """Get information about the current connection."""
27
+ if not self.is_connected():
28
+ return "No database connected"
29
+
30
+ if self.connection_type == "sqlite":
31
+ return "Connected to: SQLite database"
32
+ elif self.connection_type == "duckdb":
33
+ return "Connected to: DuckDB database"
34
+ return "Connected to: Unknown database type"
35
+
36
+ def close_connection(self):
37
+ """Close the current database connection if one exists."""
38
+ if self.conn:
39
+ try:
40
+ if self.connection_type == "duckdb":
41
+ self.conn.close()
42
+ else: # sqlite
43
+ self.conn.close()
44
+ except Exception:
45
+ pass # Ignore errors when closing
46
+ finally:
47
+ self.conn = None
48
+ self.connection_type = None
49
+ self.database_path = None # Clear the database path
50
+
51
+ def open_database(self, filename, load_all_tables=True):
52
+ """
53
+ Open a database connection to the specified file.
54
+ Detects whether it's a SQLite or DuckDB database.
55
+
56
+ Args:
57
+ filename: Path to the database file
58
+ load_all_tables: Whether to automatically load all tables from the database
59
+
60
+ Returns:
61
+ True if successful, False otherwise
62
+
63
+ Raises:
64
+ Exception: If there's an error opening the database
65
+ """
66
+ # Close any existing connection
67
+ self.close_connection()
68
+
69
+ # Clear any existing loaded tables
70
+ self.loaded_tables = {}
71
+ self.table_columns = {}
72
+
73
+ try:
74
+ if self.is_sqlite_db(filename):
75
+ self.conn = sqlite3.connect(filename)
76
+ self.connection_type = "sqlite"
77
+ else:
78
+ self.conn = duckdb.connect(filename)
79
+ self.connection_type = "duckdb"
80
+
81
+ # Store the database path
82
+ self.database_path = os.path.abspath(filename)
83
+
84
+ # Load tables from the database if requested
85
+ if load_all_tables:
86
+ self.load_database_tables()
87
+ return True
88
+ except (sqlite3.Error, duckdb.Error) as e:
89
+ self.conn = None
90
+ self.connection_type = None
91
+ self.database_path = None
92
+ raise Exception(f"Failed to open database: {str(e)}")
93
+
94
+ def create_memory_connection(self):
95
+ """Create an in-memory DuckDB connection."""
96
+ self.close_connection()
97
+ self.conn = duckdb.connect(':memory:')
98
+ self.connection_type = 'duckdb'
99
+ self.database_path = None # No file path for in-memory database
100
+ return "Connected to: in-memory DuckDB"
101
+
102
+ def is_sqlite_db(self, filename):
103
+ """
104
+ Check if the file is a SQLite database by examining its header.
105
+
106
+ Args:
107
+ filename: Path to the database file
108
+
109
+ Returns:
110
+ Boolean indicating if the file is a SQLite database
111
+ """
112
+ try:
113
+ with open(filename, 'rb') as f:
114
+ header = f.read(16)
115
+ return header[:16] == b'SQLite format 3\x00'
116
+ except:
117
+ return False
118
+
119
+ def load_database_tables(self):
120
+ """
121
+ Load all tables from the current database connection.
122
+
123
+ Returns:
124
+ A list of table names loaded
125
+ """
126
+ try:
127
+ if not self.is_connected():
128
+ return []
129
+
130
+ table_names = []
131
+
132
+ if self.connection_type == 'sqlite':
133
+ query = "SELECT name FROM sqlite_master WHERE type='table'"
134
+ cursor = self.conn.cursor()
135
+ tables = cursor.execute(query).fetchall()
136
+
137
+ for (table_name,) in tables:
138
+ self.loaded_tables[table_name] = 'database'
139
+ table_names.append(table_name)
140
+
141
+ # Get column names for each table
142
+ try:
143
+ column_query = f"PRAGMA table_info({table_name})"
144
+ columns = cursor.execute(column_query).fetchall()
145
+ self.table_columns[table_name] = [col[1] for col in columns] # Column name is at index 1
146
+ except Exception:
147
+ self.table_columns[table_name] = []
148
+
149
+ else: # duckdb
150
+ query = "SELECT table_name FROM information_schema.tables WHERE table_schema='main'"
151
+ result = self.conn.execute(query).fetchdf()
152
+
153
+ for table_name in result['table_name']:
154
+ self.loaded_tables[table_name] = 'database'
155
+ table_names.append(table_name)
156
+
157
+ # Get column names for each table
158
+ try:
159
+ column_query = f"SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}' AND table_schema='main'"
160
+ columns = self.conn.execute(column_query).fetchdf()
161
+ self.table_columns[table_name] = columns['column_name'].tolist()
162
+ except Exception:
163
+ self.table_columns[table_name] = []
164
+
165
+ return table_names
166
+
167
+ except Exception as e:
168
+ raise Exception(f'Error loading tables: {str(e)}')
169
+
170
+ def execute_query(self, query):
171
+ """
172
+ Execute a SQL query against the current database connection.
173
+
174
+ Args:
175
+ query: SQL query string to execute
176
+
177
+ Returns:
178
+ Pandas DataFrame with the query results
179
+
180
+ Raises:
181
+ Exception: If there's an error executing the query
182
+ """
183
+ if not query.strip():
184
+ raise ValueError("Empty query")
185
+
186
+ if not self.is_connected():
187
+ raise ValueError("No database connection")
188
+
189
+ try:
190
+ if self.connection_type == "duckdb":
191
+ result = self.conn.execute(query).fetchdf()
192
+ else: # sqlite
193
+ result = pd.read_sql_query(query, self.conn)
194
+
195
+ return result
196
+ except (duckdb.Error, sqlite3.Error) as e:
197
+ error_msg = str(e).lower()
198
+ if "syntax error" in error_msg:
199
+ raise SyntaxError(f"SQL syntax error: {str(e)}")
200
+ elif "no such table" in error_msg:
201
+ # Extract the table name from the error message when possible
202
+ import re
203
+ table_match = re.search(r"'([^']+)'", str(e))
204
+ table_name = table_match.group(1) if table_match else "unknown"
205
+
206
+ # Check if this table is in our loaded_tables dict but came from a database
207
+ if table_name in self.loaded_tables and self.loaded_tables[table_name] == 'database':
208
+ raise ValueError(f"Table '{table_name}' was part of a database but is not accessible. "
209
+ f"Please reconnect to the original database using the 'Open Database' button.")
210
+ else:
211
+ raise ValueError(f"Table not found: {str(e)}")
212
+ elif "no such column" in error_msg:
213
+ raise ValueError(f"Column not found: {str(e)}")
214
+ else:
215
+ raise Exception(f"Database error: {str(e)}")
216
+
217
    def load_file(self, file_path):
        """
        Load data from a file into the database.

        Supported inputs: Delta tables (a directory containing _delta_log,
        or a *.delta path), Excel (*.xlsx/*.xls), CSV, and Parquet. Files
        larger than 50MB are capped at roughly 100k rows to limit memory use.
        If no connection is open, an in-memory DuckDB connection is created.

        Args:
            file_path: Path to the data file (Excel, CSV, Parquet, Delta)

        Returns:
            Tuple of (table_name, DataFrame) for the loaded data

        Raises:
            ValueError: If the file format is unsupported or there's an error
        """
        try:
            # Check if this is a Delta table (folder with _delta_log)
            delta_path = Path(file_path)
            is_delta_table = (delta_path.is_dir() and
                              (delta_path / '_delta_log').exists()) or file_path.endswith('.delta')

            # Read the file into a DataFrame, using optimized loading strategies
            if is_delta_table:
                # Read as Delta table using deltalake library
                try:
                    # Load the Delta table
                    import deltalake
                    delta_table = deltalake.DeltaTable(file_path)
                    # Convert to pandas DataFrame
                    df = delta_table.to_pandas()
                except Exception as e:
                    raise ValueError(f"Error loading Delta table: {str(e)}")
            elif file_path.endswith(('.xlsx', '.xls')):
                # Try to use a streaming approach for Excel files
                try:
                    # For Excel files, we first check if it's a large file
                    # If it's large, we may want to show only a subset
                    excel_file = pd.ExcelFile(file_path)
                    sheet_name = excel_file.sheet_names[0]  # Default to first sheet

                    # Read the first rows to get column names
                    # NOTE(review): df_preview is never used afterwards — presumably
                    # intended for schema inspection; confirm before removing.
                    df_preview = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=5)

                    # If the file is very large, use chunksize
                    file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB

                    if file_size > 50:  # If file is larger than 50MB
                        # Use a limited subset for large files to avoid memory issues
                        df = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=100000)  # Cap at 100k rows
                    else:
                        # For smaller files, read everything
                        df = pd.read_excel(excel_file, sheet_name=sheet_name)
                except Exception:
                    # Fallback to standard reading method
                    df = pd.read_excel(file_path)
            elif file_path.endswith('.csv'):
                # For CSV files, we can use chunking for large files
                try:
                    # Check if it's a large file
                    file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB

                    if file_size > 50:  # If file is larger than 50MB
                        # Read the first chunk to get column types
                        df_preview = pd.read_csv(file_path, nrows=1000)

                        # Use optimized dtypes for better memory usage
                        dtypes = {col: df_preview[col].dtype for col in df_preview.columns}

                        # Read again with chunk processing, combining up to 100k rows
                        chunks = []
                        for chunk in pd.read_csv(file_path, dtype=dtypes, chunksize=10000):
                            chunks.append(chunk)
                            if len(chunks) * 10000 >= 100000:  # Cap at 100k rows
                                break

                        df = pd.concat(chunks, ignore_index=True)
                    else:
                        # For smaller files, read everything at once
                        df = pd.read_csv(file_path)
                except Exception:
                    # Fallback to standard reading method
                    df = pd.read_csv(file_path)
            elif file_path.endswith('.parquet'):
                df = pd.read_parquet(file_path)
            else:
                raise ValueError("Unsupported file format")

            # Generate table name from file name
            base_name = os.path.splitext(os.path.basename(file_path))[0]

            # For directories like Delta tables, use the directory name
            if os.path.isdir(file_path):
                base_name = os.path.basename(file_path)

            table_name = self.sanitize_table_name(base_name)

            # Ensure unique table name (suffix _1, _2, ... on collision)
            original_name = table_name
            counter = 1
            while table_name in self.loaded_tables:
                table_name = f"{original_name}_{counter}"
                counter += 1

            # Register the table in the database
            if not self.is_connected():
                self.create_memory_connection()

            # Handle table creation based on database type
            if self.connection_type == 'sqlite':
                # For SQLite, create a table from the DataFrame
                # For large dataframes, use a chunked approach to avoid memory issues
                if len(df) > 10000:
                    # Create the table with the first chunk
                    df.iloc[:1000].to_sql(table_name, self.conn, index=False, if_exists='replace')

                    # Append the rest in chunks
                    chunk_size = 5000
                    for i in range(1000, len(df), chunk_size):
                        end = min(i + chunk_size, len(df))
                        df.iloc[i:end].to_sql(table_name, self.conn, index=False, if_exists='append')
                else:
                    # For smaller dataframes, do it in one go
                    df.to_sql(table_name, self.conn, index=False, if_exists='replace')
            else:
                # For DuckDB, register the DataFrame as a view
                self.conn.register(table_name, df)

            # Store information about the table (source = the file path)
            self.loaded_tables[table_name] = file_path
            self.table_columns[table_name] = df.columns.tolist()

            return table_name, df

        except MemoryError:
            raise ValueError("Not enough memory to load this file. Try using a smaller file or increasing available memory.")
        except Exception as e:
            raise ValueError(f"Error loading file: {str(e)}")
352
+
353
+ def remove_table(self, table_name):
354
+ """
355
+ Remove a table from the database.
356
+
357
+ Args:
358
+ table_name: Name of the table to remove
359
+
360
+ Returns:
361
+ Boolean indicating success
362
+ """
363
+ if not table_name in self.loaded_tables:
364
+ return False
365
+
366
+ try:
367
+ # Remove from database
368
+ if self.connection_type == 'sqlite':
369
+ self.conn.execute(f'DROP TABLE IF EXISTS "{table_name}"')
370
+ else: # duckdb
371
+ self.conn.execute(f'DROP VIEW IF EXISTS {table_name}')
372
+
373
+ # Remove from tracking
374
+ del self.loaded_tables[table_name]
375
+ if table_name in self.table_columns:
376
+ del self.table_columns[table_name]
377
+
378
+ return True
379
+ except Exception:
380
+ return False
381
+
382
+ def get_table_preview(self, table_name, limit=5):
383
+ """
384
+ Get a preview of the data in a table.
385
+
386
+ Args:
387
+ table_name: Name of the table to preview
388
+ limit: Number of rows to preview
389
+
390
+ Returns:
391
+ Pandas DataFrame with the preview data
392
+ """
393
+ if not table_name in self.loaded_tables:
394
+ raise ValueError(f"Table '{table_name}' not found")
395
+
396
+ try:
397
+ if self.connection_type == 'sqlite':
398
+ return pd.read_sql_query(f'SELECT * FROM "{table_name}" LIMIT {limit}', self.conn)
399
+ else:
400
+ return self.conn.execute(f'SELECT * FROM {table_name} LIMIT {limit}').fetchdf()
401
+ except Exception as e:
402
+ raise Exception(f"Error previewing table: {str(e)}")
403
+
404
    def reload_table(self, table_name):
        """
        Reload a table's data from its source file.

        Only works for tables that were loaded from a file; tables whose
        source is 'database' or 'query_result' cannot be reloaded.

        Args:
            table_name: Name of the table to reload

        Returns:
            Tuple of (bool, message) indicating success/failure and a message

        Raises:
            ValueError: If the table cannot be reloaded
        """
        if not table_name in self.loaded_tables:
            return False, f"Table '{table_name}' not found"

        file_path = self.loaded_tables[table_name]

        # Check if this is a file-based table
        if file_path in ['database', 'query_result']:
            return False, f"Cannot reload '{table_name}' because it's not a file-based table"

        try:
            # Check if the file still exists
            if not os.path.exists(file_path):
                return False, f"Source file '{file_path}' no longer exists"

            # Store the original table name
            original_name = table_name

            # Remove the existing table first.
            # NOTE(review): if loading below fails, the table has already been
            # dropped and is not restored — confirm this is the intended
            # trade-off.
            self.remove_table(table_name)

            # Check if this is a Delta table (directory with _delta_log)
            delta_path = Path(file_path)
            is_delta_table = (delta_path.is_dir() and
                              (delta_path / '_delta_log').exists()) or file_path.endswith('.delta')

            # Load the file with the original table name
            df = None
            if is_delta_table:
                # Read as Delta table
                import deltalake
                delta_table = deltalake.DeltaTable(file_path)
                df = delta_table.to_pandas()
            elif file_path.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file_path)
            elif file_path.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif file_path.endswith('.parquet'):
                df = pd.read_parquet(file_path)
            else:
                return False, "Unsupported file format"

            # Register the dataframe with the original name, recording the
            # file path as the table's source
            self.register_dataframe(df, original_name, file_path)

            return True, f"Table '{table_name}' reloaded successfully"

        except Exception as e:
            return False, f"Error reloading table: {str(e)}"
465
+
466
+ def rename_table(self, old_name, new_name):
467
+ """
468
+ Rename a table in the database.
469
+
470
+ Args:
471
+ old_name: Current name of the table
472
+ new_name: New name for the table
473
+
474
+ Returns:
475
+ Boolean indicating success
476
+ """
477
+ if not old_name in self.loaded_tables:
478
+ return False
479
+
480
+ try:
481
+ # Sanitize the new name
482
+ new_name = self.sanitize_table_name(new_name)
483
+
484
+ # Check if new name already exists
485
+ if new_name in self.loaded_tables and new_name != old_name:
486
+ raise ValueError(f"Table '{new_name}' already exists")
487
+
488
+ # Rename in database
489
+ if self.connection_type == 'sqlite':
490
+ self.conn.execute(f'ALTER TABLE "{old_name}" RENAME TO "{new_name}"')
491
+ else: # duckdb
492
+ # For DuckDB, we need to:
493
+ # 1. Get the data from the old view/table
494
+ df = self.conn.execute(f'SELECT * FROM {old_name}').fetchdf()
495
+ # 2. Drop the old view
496
+ self.conn.execute(f'DROP VIEW IF EXISTS {old_name}')
497
+ # 3. Register the data under the new name
498
+ self.conn.register(new_name, df)
499
+
500
+ # Update tracking
501
+ self.loaded_tables[new_name] = self.loaded_tables.pop(old_name)
502
+ self.table_columns[new_name] = self.table_columns.pop(old_name)
503
+
504
+ return True
505
+
506
+ except Exception as e:
507
+ raise Exception(f"Failed to rename table: {str(e)}")
508
+
509
+ def sanitize_table_name(self, name):
510
+ """
511
+ Sanitize a table name to be valid in SQL.
512
+
513
+ Args:
514
+ name: The proposed table name
515
+
516
+ Returns:
517
+ A sanitized table name
518
+ """
519
+ import re
520
+ name = re.sub(r'[^a-zA-Z0-9_]', '_', name)
521
+ # Ensure it starts with a letter
522
+ if not name or not name[0].isalpha():
523
+ name = 'table_' + name
524
+ return name.lower()
525
+
526
+ def register_dataframe(self, df, table_name, source='query_result'):
527
+ """
528
+ Register a DataFrame as a table in the database.
529
+
530
+ Args:
531
+ df: Pandas DataFrame to register
532
+ table_name: Name for the table
533
+ source: Source of the data (for tracking)
534
+
535
+ Returns:
536
+ The table name used (may be different if there was a conflict)
537
+ """
538
+ # Sanitize and ensure unique name
539
+ table_name = self.sanitize_table_name(table_name)
540
+ original_name = table_name
541
+ counter = 1
542
+ while table_name in self.loaded_tables:
543
+ table_name = f"{original_name}_{counter}"
544
+ counter += 1
545
+
546
+ # Register in database
547
+ if self.connection_type == 'sqlite':
548
+ df.to_sql(table_name, self.conn, index=False, if_exists='replace')
549
+ else: # duckdb
550
+ self.conn.register(table_name, df)
551
+
552
+ # Track the table
553
+ self.loaded_tables[table_name] = source
554
+ self.table_columns[table_name] = df.columns.tolist()
555
+
556
+ return table_name
557
+
558
    def get_all_table_columns(self):
        """
        Get all table and column names for autocompletion.

        Combines loaded table names, bare and table-qualified column names,
        common SQL functions/patterns, inferred JOIN suggestions, and
        type-aware comparison snippets into one sorted completion list.

        Returns:
            List of completion words (table names and column names)
        """
        # Start with table names
        completion_words = set(self.loaded_tables.keys())

        # Track column data types for smarter autocompletion
        column_data_types = {}  # {table.column: data_type}

        # Detect potential table relationships for JOIN suggestions
        potential_relationships = []  # [(table1, column1, table2, column2)]

        # Add column names with and without table prefixes, up to reasonable limits
        MAX_COLUMNS_PER_TABLE = 100  # Limit columns to prevent memory issues
        MAX_TABLES_WITH_COLUMNS = 20  # Limit the number of tables to process

        # Sort tables by name to ensure consistent behavior
        table_items = sorted(list(self.table_columns.items()))

        # Process only a limited number of tables
        for table, columns in table_items[:MAX_TABLES_WITH_COLUMNS]:
            # Add each column name by itself
            for col in columns[:MAX_COLUMNS_PER_TABLE]:
                completion_words.add(col)

            # Add qualified column names (table.column)
            for col in columns[:MAX_COLUMNS_PER_TABLE]:
                completion_words.add(f"{table}.{col}")

            # Try to infer table relationships based on column naming
            self._detect_relationships(table, columns, potential_relationships)

            # Try to infer column data types when possible (best-effort)
            if self.is_connected():
                try:
                    self._detect_column_types(table, column_data_types)
                except Exception:
                    pass

        # Add common SQL functions and aggregations with context-aware completions
        sql_functions = [
            # Aggregation functions with completed parentheses
            "COUNT(*)", "COUNT(DISTINCT ", "SUM(", "AVG(", "MIN(", "MAX(",

            # String functions
            "CONCAT(", "SUBSTR(", "LOWER(", "UPPER(", "TRIM(", "REPLACE(", "LENGTH(",
            "REGEXP_REPLACE(", "REGEXP_EXTRACT(", "REGEXP_MATCH(",

            # Date/time functions
            "CURRENT_DATE", "CURRENT_TIME", "CURRENT_TIMESTAMP", "NOW()",
            "EXTRACT(", "DATE_TRUNC(", "DATE_PART(", "DATEADD(", "DATEDIFF(",

            # Type conversion
            "CAST( AS ", "CONVERT(", "TRY_CAST( AS ", "FORMAT(",

            # Conditional functions
            "COALESCE(", "NULLIF(", "GREATEST(", "LEAST(", "IFF(", "IFNULL(",

            # Window functions
            "ROW_NUMBER() OVER (", "RANK() OVER (", "DENSE_RANK() OVER (",
            "LEAD( OVER (", "LAG( OVER (", "FIRST_VALUE( OVER (", "LAST_VALUE( OVER ("
        ]

        # Add common SQL patterns with context awareness
        sql_patterns = [
            # Basic query patterns
            "SELECT * FROM ", "SELECT COUNT(*) FROM ",
            "SELECT DISTINCT ", "GROUP BY ", "ORDER BY ", "HAVING ",
            "LIMIT ", "OFFSET ", "WHERE ",

            # JOIN patterns - complete with ON and common join points
            "INNER JOIN ", "LEFT JOIN ", "RIGHT JOIN ", "FULL OUTER JOIN ",
            "LEFT OUTER JOIN ", "RIGHT OUTER JOIN ", "CROSS JOIN ",

            # Advanced patterns
            "WITH _ AS (", "CASE WHEN _ THEN _ ELSE _ END",
            "OVER (PARTITION BY _ ORDER BY _)",
            "EXISTS (SELECT 1 FROM _ WHERE _)",
            "NOT EXISTS (SELECT 1 FROM _ WHERE _)",

            # Common operator patterns
            "BETWEEN _ AND _", "IN (", "NOT IN (", "IS NULL", "IS NOT NULL",
            "LIKE '%_%'", "NOT LIKE ", "ILIKE ",

            # Data manipulation patterns
            "INSERT INTO _ VALUES (", "INSERT INTO _ (_) VALUES (_)",
            "UPDATE _ SET _ = _ WHERE _", "DELETE FROM _ WHERE _"
        ]

        # Add table relationships as suggested JOIN patterns
        for table1, col1, table2, col2 in potential_relationships:
            join_pattern = f"JOIN {table2} ON {table1}.{col1} = {table2}.{col2}"
            completion_words.add(join_pattern)

            # Also add the reverse relationship
            join_pattern_rev = f"JOIN {table1} ON {table2}.{col2} = {table1}.{col1}"
            completion_words.add(join_pattern_rev)

        # Add all SQL extras to the completion words
        completion_words.update(sql_functions)
        completion_words.update(sql_patterns)

        # Add common data-specific comparison patterns based on column types
        for col_name, data_type in column_data_types.items():
            if 'INT' in data_type.upper() or 'NUM' in data_type.upper() or 'FLOAT' in data_type.upper():
                # Numeric columns
                completion_words.add(f"{col_name} > ")
                completion_words.add(f"{col_name} < ")
                completion_words.add(f"{col_name} >= ")
                completion_words.add(f"{col_name} <= ")
                completion_words.add(f"{col_name} BETWEEN ")
            elif 'DATE' in data_type.upper() or 'TIME' in data_type.upper():
                # Date/time columns
                completion_words.add(f"{col_name} > CURRENT_DATE")
                completion_words.add(f"{col_name} < CURRENT_DATE")
                completion_words.add(f"{col_name} BETWEEN CURRENT_DATE - INTERVAL ")
                completion_words.add(f"EXTRACT(YEAR FROM {col_name})")
                completion_words.add(f"DATE_TRUNC('month', {col_name})")
            elif 'CHAR' in data_type.upper() or 'TEXT' in data_type.upper() or 'VARCHAR' in data_type.upper():
                # String columns
                completion_words.add(f"{col_name} LIKE '%")
                completion_words.add(f"{col_name} ILIKE '%")
                completion_words.add(f"LOWER({col_name}) = ")
                completion_words.add(f"UPPER({col_name}) = ")

        # Convert set back to list and sort for better usability
        completion_list = list(completion_words)
        completion_list.sort(key=lambda x: (not x.isupper(), x))  # Prioritize SQL keywords

        return completion_list
692
+
693
    def _detect_relationships(self, table, columns, potential_relationships):
        """
        Detect potential relationships between tables based on column naming patterns.

        Args:
            table: Current table name
            columns: List of column names in this table
            potential_relationships: List to populate (in place) with detected
                relationships as (table1, column1, table2, column2) tuples
        """
        # Look for columns that might be foreign keys (common patterns)
        for col in columns:
            # Common ID patterns: table_id, tableId, TableID, etc.
            # NOTE(review): endswith('id') also matches words like 'paid' or
            # 'grid' — confirm whether such false positives are acceptable.
            if col.lower().endswith('_id') or col.lower().endswith('id'):
                # Extract potential table name from column name
                if col.lower().endswith('_id'):
                    potential_table = col[:-3]  # Remove '_id'
                else:
                    # Try to extract tablename from camelCase or PascalCase
                    potential_table = col[:-2]  # Remove 'Id'

                # Normalize to lowercase for comparison
                potential_table = potential_table.lower()

                # Check if this potential table exists in our loaded tables
                for existing_table in self.loaded_tables.keys():
                    # Normalize for comparison
                    existing_lower = existing_table.lower()

                    # If we find a matching table, it's likely a relationship
                    if existing_lower == potential_table or existing_lower.endswith(f"_{potential_table}"):
                        # Add this relationship
                        # We assume the target column in the referenced table is 'id'
                        potential_relationships.append((table, col, existing_table, 'id'))
                        break

            # Also detect columns with same name across tables (potential join points)
            for other_table, other_columns in self.table_columns.items():
                if other_table != table and col in other_columns:
                    # Same column name in different tables - potential join point
                    potential_relationships.append((table, col, other_table, col))
733
+
734
+ def _detect_column_types(self, table, column_data_types):
735
+ """
736
+ Detect column data types for a table to enable smarter autocompletion.
737
+
738
+ Args:
739
+ table: Table name to analyze
740
+ column_data_types: Dictionary to populate with column data types
741
+ """
742
+ if not self.is_connected():
743
+ return
744
+
745
+ try:
746
+ if self.connection_type == 'sqlite':
747
+ # Get column info from SQLite
748
+ cursor = self.conn.cursor()
749
+ cursor.execute(f"PRAGMA table_info({table})")
750
+ columns_info = cursor.fetchall()
751
+
752
+ for column_info in columns_info:
753
+ col_name = column_info[1] # Column name is at index 1
754
+ data_type = column_info[2] # Data type is at index 2
755
+
756
+ # Store as table.column: data_type for qualified lookups
757
+ column_data_types[f"{table}.{col_name}"] = data_type
758
+ # Also store just column: data_type for unqualified lookups
759
+ column_data_types[col_name] = data_type
760
+
761
+ elif self.connection_type == 'duckdb':
762
+ # Get column info from DuckDB
763
+ query = f"""
764
+ SELECT column_name, data_type
765
+ FROM information_schema.columns
766
+ WHERE table_name='{table}' AND table_schema='main'
767
+ """
768
+ result = self.conn.execute(query).fetchdf()
769
+
770
+ for _, row in result.iterrows():
771
+ col_name = row['column_name']
772
+ data_type = row['data_type']
773
+
774
+ # Store as table.column: data_type for qualified lookups
775
+ column_data_types[f"{table}.{col_name}"] = data_type
776
+ # Also store just column: data_type for unqualified lookups
777
+ column_data_types[col_name] = data_type
778
+ except Exception:
779
+ # Ignore errors in type detection - this is just for enhancement
780
+ pass
781
+
782
+ def load_specific_table(self, table_name):
783
+ """
784
+ Load metadata for a specific table from the database.
785
+ This is used when we know which tables we want to load rather than loading all tables.
786
+
787
+ Args:
788
+ table_name: Name of the table to load
789
+
790
+ Returns:
791
+ Boolean indicating if the table was found and loaded
792
+ """
793
+ if not self.is_connected():
794
+ return False
795
+
796
+ try:
797
+ if self.connection_type == 'sqlite':
798
+ # Check if the table exists in SQLite
799
+ cursor = self.conn.cursor()
800
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,))
801
+ result = cursor.fetchone()
802
+
803
+ if result:
804
+ # Get column names for the table
805
+ try:
806
+ column_query = f"PRAGMA table_info({table_name})"
807
+ columns = cursor.execute(column_query).fetchall()
808
+ self.table_columns[table_name] = [col[1] for col in columns] # Column name is at index 1
809
+ except Exception:
810
+ self.table_columns[table_name] = []
811
+
812
+ # Register the table
813
+ self.loaded_tables[table_name] = 'database'
814
+ return True
815
+
816
+ else: # duckdb
817
+ # Check if the table exists in DuckDB
818
+ query = f"SELECT table_name FROM information_schema.tables WHERE table_name='{table_name}' AND table_schema='main'"
819
+ result = self.conn.execute(query).fetchdf()
820
+
821
+ if not result.empty:
822
+ # Get column names for the table
823
+ try:
824
+ column_query = f"SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}' AND table_schema='main'"
825
+ columns = self.conn.execute(column_query).fetchdf()
826
+ self.table_columns[table_name] = columns['column_name'].tolist()
827
+ except Exception:
828
+ self.table_columns[table_name] = []
829
+
830
+ # Register the table
831
+ self.loaded_tables[table_name] = 'database'
832
+ return True
833
+
834
+ return False
835
+
836
+ except Exception:
837
+ return False