sqlshell-0.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
sqlshell/db/database_manager.py
@@ -0,0 +1,1318 @@
import os
import sqlite3
import pandas as pd
import duckdb
from pathlib import Path

class DatabaseManager:
    """
    Manages database connections and operations for SQLShell.
    Uses an in-memory DuckDB as the primary connection and can attach external
    SQLite and DuckDB databases for querying alongside loaded files.
    """

    def __init__(self):
        """Initialize the database manager with an in-memory DuckDB connection."""
        self.conn = None
        self.connection_type = 'duckdb'
        self.loaded_tables = {}  # Maps table_name to file_path or 'database:alias'/'query_result'
        self.table_columns = {}  # Maps table_name to list of column names
        self.database_path = None  # Track the path to the primary attached database (for display)
        self.attached_databases = {}  # Maps alias to {'path': path, 'type': 'sqlite'/'duckdb', 'tables': []}
        self._sqlite_scanner_loaded = False

        # Initialize the in-memory DuckDB connection
        self._init_connection()

    def _init_connection(self):
        """Initialize the in-memory DuckDB connection."""
        self.conn = duckdb.connect(':memory:')
        self.connection_type = 'duckdb'

    def _ensure_sqlite_scanner(self):
        """Load the sqlite_scanner extension if not already loaded."""
        if not self._sqlite_scanner_loaded:
            try:
                self.conn.execute("INSTALL sqlite_scanner")
                self.conn.execute("LOAD sqlite_scanner")
                self._sqlite_scanner_loaded = True
            except Exception as e:
                raise Exception(f"Failed to load sqlite_scanner extension: {str(e)}")

    def is_connected(self):
        """Check if there is an active database connection."""
        return self.conn is not None

    def get_connection_info(self):
        """Get information about the current connection."""
        if not self.is_connected():
            return "No database connected"

        info_parts = ["In-memory DuckDB"]

        if self.attached_databases:
            db_info = []
            for alias, db_data in self.attached_databases.items():
                db_type = db_data['type'].upper()
                db_info.append(f"{alias} ({db_type})")
            info_parts.append(f"Attached: {', '.join(db_info)}")

        return " | ".join(info_parts)

    def close_connection(self):
        """Close the current database connection if one exists."""
        if self.conn:
            try:
                # Detach all attached databases first
                for alias in list(self.attached_databases.keys()):
                    try:
                        self.conn.execute(f"DETACH {alias}")
                    except Exception:
                        pass
                self.conn.close()
            except Exception:
                pass  # Ignore errors when closing
            finally:
                self.conn = None
                self.connection_type = None
                self.database_path = None
                self.attached_databases = {}
                self._sqlite_scanner_loaded = False

    def open_database(self, filename, load_all_tables=True):
        """
        Attach a database file to the in-memory connection.
        Detects whether it's a SQLite or DuckDB database.
        This preserves any existing loaded files/tables.

        Args:
            filename: Path to the database file
            load_all_tables: Whether to automatically load all tables from the database

        Returns:
            True if successful, False otherwise

        Raises:
            Exception: If there's an error opening the database
        """
        # Ensure we have a connection
        if not self.is_connected():
            self._init_connection()

        # First, detach any existing database with the same alias and remove its tables
        if 'db' in self.attached_databases:
            self.detach_database('db')

        abs_path = os.path.abspath(filename)

        try:
            if self.is_sqlite_db(filename):
                # Attach SQLite database using sqlite_scanner
                self._ensure_sqlite_scanner()
                self.conn.execute(f"ATTACH '{abs_path}' AS db (TYPE SQLITE, READ_ONLY)")
                db_type = 'sqlite'
            else:
                # Attach DuckDB database in read-only mode
                self.conn.execute(f"ATTACH '{abs_path}' AS db (READ_ONLY)")
                db_type = 'duckdb'

            # Store the database path for display
            self.database_path = abs_path

            # Track this attached database
            self.attached_databases['db'] = {
                'path': abs_path,
                'type': db_type,
                'tables': []
            }

            # Load tables from the database if requested
            if load_all_tables:
                self._load_attached_database_tables('db')

            return True

        except Exception as e:
            raise Exception(f"Failed to open database: {str(e)}")

    def _load_attached_database_tables(self, alias):
        """
        Load all tables from an attached database.

        Args:
            alias: The alias of the attached database

        Returns:
            A list of table names loaded
        """
        if alias not in self.attached_databases:
            return []

        try:
            table_names = []

            # Query for tables in the attached database using duckdb_tables()
            # This works for attached databases unlike information_schema.tables
            query = f"SELECT table_name FROM duckdb_tables() WHERE database_name='{alias}'"
            result = self.conn.execute(query).fetchdf()

            for table_name in result['table_name']:
                # Store with 'database:alias' as source
                self.loaded_tables[table_name] = f'database:{alias}'
                table_names.append(table_name)

                # Get column names for each table using duckdb_columns()
                try:
                    column_query = f"SELECT column_name FROM duckdb_columns() WHERE database_name='{alias}' AND table_name='{table_name}'"
                    columns = self.conn.execute(column_query).fetchdf()
                    self.table_columns[table_name] = columns['column_name'].tolist()
                except Exception:
                    self.table_columns[table_name] = []

            # Track which tables came from this database
            self.attached_databases[alias]['tables'] = table_names

            return table_names

        except Exception as e:
            raise Exception(f'Error loading tables from {alias}: {str(e)}')

    def detach_database(self, alias):
        """
        Detach a database and remove its tables from tracking.

        Args:
            alias: The alias of the database to detach
        """
        if alias not in self.attached_databases:
            return

        # Remove all tables that came from this database
        tables_to_remove = self.attached_databases[alias].get('tables', [])
        for table_name in tables_to_remove:
            if table_name in self.loaded_tables:
                del self.loaded_tables[table_name]
            if table_name in self.table_columns:
                del self.table_columns[table_name]

        # Detach the database
        try:
            self.conn.execute(f"DETACH {alias}")
        except Exception:
            pass

        # Remove from tracking
        del self.attached_databases[alias]

        # Clear database_path if this was the main database
        if alias == 'db':
            self.database_path = None

    def create_memory_connection(self):
        """Create/reset the in-memory DuckDB connection, preserving nothing."""
        self.close_connection()
        self._init_connection()
        self.loaded_tables = {}
        self.table_columns = {}
        return "Connected to: in-memory DuckDB"

    def is_sqlite_db(self, filename):
        """
        Check if the file is a SQLite database by examining its header.

        Args:
            filename: Path to the database file

        Returns:
            Boolean indicating if the file is a SQLite database
        """
        try:
            with open(filename, 'rb') as f:
                header = f.read(16)
                return header[:16] == b'SQLite format 3\x00'
        except:
            return False

    def load_database_tables(self):
        """
        Load all tables from the attached database (alias 'db').
        This is a convenience method that calls _load_attached_database_tables.

        Returns:
            A list of table names loaded
        """
        if 'db' in self.attached_databases:
            return self._load_attached_database_tables('db')
        return []

    def execute_query(self, query):
        """
        Execute a SQL query against the current database connection.
        Tables from attached databases are automatically qualified with their alias.

        Args:
            query: SQL query string to execute

        Returns:
            Pandas DataFrame with the query results

        Raises:
            Exception: If there's an error executing the query
        """
        if not query.strip():
            raise ValueError("Empty query")

        if not self.is_connected():
            self._init_connection()

        try:
            # Preprocess query to qualify table names from attached databases
            processed_query = self._qualify_table_names(query)
            result = self.conn.execute(processed_query).fetchdf()
            return result

        except duckdb.Error as e:
            error_msg = str(e).lower()
            if "syntax error" in error_msg:
                raise SyntaxError(f"SQL syntax error: {str(e)}")
            elif "does not exist" in error_msg or "not found" in error_msg:
                # Extract the table name from the error message when possible
                import re
                table_match = re.search(r"Table[^']*'([^']+)'|\"([^\"]+)\"", str(e), re.IGNORECASE)
                table_name = (table_match.group(1) or table_match.group(2)) if table_match else "unknown"

                # Check if this table is in our loaded_tables dict but came from a database
                source = self.loaded_tables.get(table_name, '')
                if source.startswith('database:'):
                    raise ValueError(f"Table '{table_name}' was part of a database but is not accessible. "
                                     f"Please reconnect to the original database using the 'Open Database' button.")
                else:
                    raise ValueError(f"Table not found: {str(e)}")
            elif "no such column" in error_msg or "column" in error_msg and "not found" in error_msg:
                raise ValueError(f"Column not found: {str(e)}")
            else:
                raise Exception(f"Database error: {str(e)}")

    def _qualify_table_names(self, query):
        """
        Qualify unqualified table names in the query with their database alias.
        This allows users to write 'SELECT * FROM customers' instead of 'SELECT * FROM db.customers'.

        Args:
            query: The SQL query to process

        Returns:
            The processed query with qualified table names
        """
        import re

        # Build a mapping of table names to their qualified names
        table_qualifications = {}
        for table_name, source in self.loaded_tables.items():
            if source.startswith('database:'):
                alias = source.split(':')[1]
                table_qualifications[table_name.lower()] = f"{alias}.{table_name}"

        if not table_qualifications:
            return query

        # Pattern to match table names in common SQL contexts
        # This is a simplified approach - handles most common cases
        # Look for: FROM table, JOIN table, INTO table, UPDATE table
        def replace_table(match):
            keyword = match.group(1)
            table = match.group(2)
            rest = match.group(3) if match.lastindex >= 3 else ''

            # Don't replace if already qualified (contains a dot)
            if '.' in table:
                return match.group(0)

            # Check if this table needs qualification
            qualified = table_qualifications.get(table.lower())
            if qualified:
                return f"{keyword}{qualified}{rest}"
            return match.group(0)

        # Pattern for FROM, JOIN, INTO, UPDATE followed by table name
        pattern = r'(FROM\s+|JOIN\s+|INTO\s+|UPDATE\s+)([a-zA-Z_][a-zA-Z0-9_]*)(\s|$|,|\))'
        processed = re.sub(pattern, replace_table, query, flags=re.IGNORECASE)

        return processed

    def load_file(self, file_path, table_prefix=""):
        """
        Load data from a file into the database.

        Args:
            file_path: Path to the data file (Excel, CSV, TXT, Parquet, Delta)
            table_prefix: Optional prefix to prepend to the table name (e.g., "prod_")

        Returns:
            Tuple of (table_name, DataFrame) for the loaded data

        Raises:
            ValueError: If the file format is unsupported or there's an error
        """
        try:
            # Check if this is a Delta table (folder with _delta_log)
            delta_path = Path(file_path)
            is_delta_table = (delta_path.is_dir() and
                              (delta_path / '_delta_log').exists()) or file_path.endswith('.delta')

            # Read the file into a DataFrame, using optimized loading strategies
            if is_delta_table:
                # Read as Delta table using deltalake library
                try:
                    # Load the Delta table
                    import deltalake
                    delta_table = deltalake.DeltaTable(file_path)

                    # Get the schema to identify decimal columns
                    schema = delta_table.schema()
                    decimal_columns = []

                    # Identify decimal columns from schema
                    for field in schema.fields:
                        # Use string representation to check for decimal
                        if 'decimal' in str(field.type).lower():
                            decimal_columns.append(field.name)

                    # Read the data
                    df = delta_table.to_pandas()

                    # Try to convert decimal columns to float64, warn if not possible
                    for col in decimal_columns:
                        if col in df.columns:
                            try:
                                df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
                                if df[col].isna().any():
                                    print(f"Warning: Some values in column '{col}' could not be converted to float64 and are set as NaN.")
                            except Exception as e:
                                print(f"Warning: Could not convert column '{col}' to float64: {e}")
                except Exception as e:
                    raise ValueError(f"Error loading Delta table: {str(e)}")
            elif file_path.endswith(('.xlsx', '.xls')):
                # Try to use a streaming approach for Excel files
                try:
                    # For Excel files, we first check if it's a large file
                    # If it's large, we may want to show only a subset
                    excel_file = pd.ExcelFile(file_path)
                    sheet_name = excel_file.sheet_names[0]  # Default to first sheet

                    # Read the first row to get column names
                    df_preview = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=5)

                    # If the file is very large, use chunksize
                    file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB

                    if file_size > 50:  # If file is larger than 50MB
                        # Use a limited subset for large files to avoid memory issues
                        df = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=100000)  # Cap at 100k rows
                    else:
                        # For smaller files, read everything
                        df = pd.read_excel(excel_file, sheet_name=sheet_name)
                except Exception:
                    # Fallback to standard reading method
                    df = pd.read_excel(file_path)
            elif file_path.endswith(('.csv', '.txt')):
                # For CSV and TXT files, detect separator and use chunking for large files
                try:
                    # Check if it's a large file
                    file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB

                    # Try multiple encodings if needed
                    encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']

                    # Detect the separator automatically
                    def detect_separator(sample_data):
                        # Common separators to check
                        separators = [',', ';', '\t']
                        separator_scores = {}

                        # Split into lines and analyze
                        lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
                        if not lines:
                            return ','  # Default if no content

                        # Check for quoted content with separators
                        has_quotes = '"' in sample_data or "'" in sample_data

                        # If we have quoted content, use a different approach
                        if has_quotes:
                            for sep in separators:
                                # Look for patterns like "value";
                                pattern_count = 0
                                for line in lines:
                                    # Count occurrences of quote + separator
                                    double_quote_pattern = f'"{sep}'
                                    single_quote_pattern = f"'{sep}"
                                    pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)

                                # If we found clear quote+separator patterns, this is likely our separator
                                if pattern_count > 0:
                                    separator_scores[sep] = pattern_count

                        # Standard approach based on consistent column counts
                        if not separator_scores:
                            for sep in separators:
                                # Count consistent occurrences across lines
                                counts = [line.count(sep) for line in lines]
                                if counts and all(c > 0 for c in counts):
                                    # Calculate consistency score: higher if all counts are the same
                                    consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
                                    # Score is average count * consistency
                                    separator_scores[sep] = sum(counts) / len(counts) * consistency

                        # Choose the separator with the highest score
                        if separator_scores:
                            return max(separator_scores.items(), key=lambda x: x[1])[0]

                        # Default to comma if we couldn't determine
                        return ','

                    # First, sample the file to detect separator
                    with open(file_path, 'rb') as f:
                        # Read first few KB to detect encoding and separator
                        raw_sample = f.read(4096)

                    # Try to decode with various encodings
                    sample_text = None
                    detected_encoding = None

                    for encoding in encodings_to_try:
                        try:
                            sample_text = raw_sample.decode(encoding)
                            detected_encoding = encoding
                            break
                        except UnicodeDecodeError:
                            continue

                    if not sample_text:
                        raise ValueError("Could not decode file with any of the attempted encodings")

                    # Detect separator from the sample
                    separator = detect_separator(sample_text)

                    # Determine quote character (default to double quote)
                    quotechar = '"'
                    if sample_text.count("'") > sample_text.count('"'):
                        quotechar = "'"

                    if file_size > 50:  # If file is larger than 50MB
                        # Read the first chunk to get column types
                        try:
                            df_preview = pd.read_csv(
                                file_path,
                                sep=separator,
                                nrows=1000,
                                encoding=detected_encoding,
                                engine='python' if separator != ',' else 'c',
                                quotechar=quotechar,
                                doublequote=True
                            )

                            # Use optimized dtypes for better memory usage
                            dtypes = {col: df_preview[col].dtype for col in df_preview.columns}

                            # Read again with chunk processing, combining up to 100k rows
                            chunks = []
                            for chunk in pd.read_csv(
                                file_path,
                                sep=separator,
                                dtype=dtypes,
                                chunksize=10000,
                                encoding=detected_encoding,
                                engine='python' if separator != ',' else 'c',
                                quotechar=quotechar,
                                doublequote=True
                            ):
                                chunks.append(chunk)
                                if len(chunks) * 10000 >= 100000:  # Cap at 100k rows
                                    break

                            df = pd.concat(chunks, ignore_index=True)
                        except pd.errors.ParserError as e:
                            # If parsing fails, try again with error recovery options
                            print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")

                            # Try with Python engine which is more flexible
                            try:
                                # First try with pandas >= 1.3 parameters
                                df = pd.read_csv(
                                    file_path,
                                    sep=separator,
                                    encoding=detected_encoding,
                                    engine='python',  # Always use python engine for error recovery
                                    quotechar=quotechar,
                                    doublequote=True,
                                    on_bad_lines='warn',  # New parameter in pandas >= 1.3
                                    na_values=[''],
                                    keep_default_na=True
                                )
                            except TypeError:
                                # Fall back to pandas < 1.3 parameters
                                df = pd.read_csv(
                                    file_path,
                                    sep=separator,
                                    encoding=detected_encoding,
                                    engine='python',
                                    quotechar=quotechar,
                                    doublequote=True,
                                    error_bad_lines=False,  # Old parameter
                                    warn_bad_lines=True,  # Old parameter
                                    na_values=[''],
                                    keep_default_na=True
                                )
                    else:
                        # For smaller files, read everything at once
                        try:
                            df = pd.read_csv(
                                file_path,
                                sep=separator,
                                encoding=detected_encoding,
                                engine='python' if separator != ',' else 'c',
                                quotechar=quotechar,
                                doublequote=True
                            )
                        except pd.errors.ParserError as e:
                            # If parsing fails, try again with error recovery options
                            print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")

                            # Try with Python engine which is more flexible
                            try:
                                # First try with pandas >= 1.3 parameters
                                df = pd.read_csv(
                                    file_path,
                                    sep=separator,
                                    encoding=detected_encoding,
                                    engine='python',  # Always use python engine for error recovery
                                    quotechar=quotechar,
                                    doublequote=True,
                                    on_bad_lines='warn',  # New parameter in pandas >= 1.3
                                    na_values=[''],
                                    keep_default_na=True
                                )
                            except TypeError:
                                # Fall back to pandas < 1.3 parameters
                                df = pd.read_csv(
                                    file_path,
                                    sep=separator,
                                    encoding=detected_encoding,
                                    engine='python',
                                    quotechar=quotechar,
                                    doublequote=True,
                                    error_bad_lines=False,  # Old parameter
                                    warn_bad_lines=True,  # Old parameter
                                    na_values=[''],
                                    keep_default_na=True
                                )
                except Exception as e:
                    # Log the error for debugging
                    import traceback
                    print(f"Error loading CSV/TXT file: {str(e)}")
                    print(traceback.format_exc())
                    raise ValueError(f"Error loading CSV/TXT file: {str(e)}")
            elif file_path.endswith('.parquet'):
                # Use fastparquet engine (lighter than pyarrow - saves 147MB in builds)
                df = pd.read_parquet(file_path, engine='fastparquet')
            else:
                raise ValueError("Unsupported file format. Supported formats: .xlsx, .xls, .csv, .txt, .parquet, and Delta tables.")

            # Generate table name from file name
            base_name = os.path.splitext(os.path.basename(file_path))[0]

            # For directories like Delta tables, use the directory name
            if os.path.isdir(file_path):
                base_name = os.path.basename(file_path)

            # Apply prefix if provided
            if table_prefix:
                base_name = f"{table_prefix}{base_name}"

            table_name = self.sanitize_table_name(base_name)

            # Ensure unique table name
            original_name = table_name
            counter = 1
            while table_name in self.loaded_tables:
                table_name = f"{original_name}_{counter}"
                counter += 1

            # Ensure we have a connection (always in-memory DuckDB)
            if not self.is_connected():
                self._init_connection()

            # Register the DataFrame as a view in DuckDB
            # This preserves any attached databases and their tables
            self.conn.register(table_name, df)

            # Store information about the table
            self.loaded_tables[table_name] = file_path
            self.table_columns[table_name] = [str(col) for col in df.columns.tolist()]

            return table_name, df

        except MemoryError:
            raise ValueError("Not enough memory to load this file. Try using a smaller file or increasing available memory.")
        except Exception as e:
            raise ValueError(f"Error loading file: {str(e)}")

    def remove_table(self, table_name):
        """
        Remove a table from the database.

        Args:
            table_name: Name of the table to remove

        Returns:
            Boolean indicating success
        """
        if not table_name in self.loaded_tables:
            return False

        try:
            source = self.loaded_tables[table_name]

            # For file-based tables (registered DataFrames), drop the view
            if not source.startswith('database:'):
                self.conn.execute(f'DROP VIEW IF EXISTS {table_name}')
            else:
                # For database tables, we just remove from tracking
                # The actual table remains in the attached database
                # Also remove from the attached database's table list
                alias = source.split(':')[1]
                if alias in self.attached_databases:
                    tables = self.attached_databases[alias].get('tables', [])
                    if table_name in tables:
                        tables.remove(table_name)

            # Remove from tracking
            del self.loaded_tables[table_name]
            if table_name in self.table_columns:
                del self.table_columns[table_name]

            return True
        except Exception:
            return False

    def remove_multiple_tables(self, table_names):
        """
        Remove multiple tables from the database.

        Args:
            table_names: List of table names to remove

        Returns:
            Tuple of (successful_removals, failed_removals) as lists of table names
        """
        successful_removals = []
        failed_removals = []

        for table_name in table_names:
            if self.remove_table(table_name):
                successful_removals.append(table_name)
            else:
                failed_removals.append(table_name)

        return successful_removals, failed_removals

    def get_table_preview(self, table_name, limit=5):
        """
        Get a preview of the data in a table.

        Args:
            table_name: Name of the table to preview
            limit: Number of rows to preview

        Returns:
            Pandas DataFrame with the preview data
        """
        if not table_name in self.loaded_tables:
            raise ValueError(f"Table '{table_name}' not found")

        try:
            source = self.loaded_tables[table_name]

            # For database tables, use the qualified name
            if source.startswith('database:'):
                alias = source.split(':')[1]
                return self.conn.execute(f'SELECT * FROM {alias}.{table_name} LIMIT {limit}').fetchdf()
            else:
                # For file-based tables (registered views)
                return self.conn.execute(f'SELECT * FROM {table_name} LIMIT {limit}').fetchdf()
        except Exception as e:
            raise Exception(f"Error previewing table: {str(e)}")

    def get_full_table(self, table_name):
        """
        Get all data from a table (no row limit).

        Args:
            table_name: Name of the table to retrieve

        Returns:
            Pandas DataFrame with all the table data
        """
        if not table_name in self.loaded_tables:
            raise ValueError(f"Table '{table_name}' not found")

        try:
            source = self.loaded_tables[table_name]

            # For database tables, use the qualified name
            if source.startswith('database:'):
                alias = source.split(':')[1]
                return self.conn.execute(f'SELECT * FROM {alias}.{table_name}').fetchdf()
            else:
                # For file-based tables (registered views)
                return self.conn.execute(f'SELECT * FROM {table_name}').fetchdf()
        except Exception as e:
            raise Exception(f"Error getting table data: {str(e)}")

    def reload_table(self, table_name):
        """
        Reload a table's data from its source file.

        Args:
            table_name: Name of the table to reload

        Returns:
            Tuple of (bool, message) indicating success/failure and a message

        Raises:
            ValueError: If the table cannot be reloaded
        """
        if not table_name in self.loaded_tables:
            return False, f"Table '{table_name}' not found"

        file_path = self.loaded_tables[table_name]

        # Check if this is a file-based table
        if file_path in ['database', 'query_result']:
            return False, f"Cannot reload '{table_name}' because it's not a file-based table"

        try:
            # Check if the file still exists
            if not os.path.exists(file_path):
                return False, f"Source file '{file_path}' no longer exists"

            # Store the original table name
            original_name = table_name

            # Remove the existing table
            self.remove_table(table_name)

            # Check if this is a Delta table
            delta_path = Path(file_path)
            is_delta_table = (delta_path.is_dir() and
                              (delta_path / '_delta_log').exists()) or file_path.endswith('.delta')

            # Load the file with the original table name
            df = None
            if is_delta_table:
                # Read as Delta table
                import deltalake
                delta_table = deltalake.DeltaTable(file_path)
                df = delta_table.to_pandas()
            elif file_path.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file_path)
            elif file_path.endswith(('.csv', '.txt')):
                # Try multiple encodings for CSV/TXT files
                encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']

                # Detect the separator automatically
                def detect_separator(sample_data):
                    # Common separators to check
                    separators = [',', ';', '\t']
                    separator_scores = {}

                    # Split into lines and analyze
                    lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
                    if not lines:
                        return ','  # Default if no content

                    # Check for quoted content with separators
                    has_quotes = '"' in sample_data or "'" in sample_data

                    # If we have quoted content, use a different approach
                    if has_quotes:
                        for sep in separators:
                            # Look for patterns like "value";
                            pattern_count = 0
                            for line in lines:
                                # Count occurrences of quote + separator
                                double_quote_pattern = f'"{sep}'
                                single_quote_pattern = f"'{sep}"
                                pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)

                            # If we found clear quote+separator patterns, this is likely our separator
                            if pattern_count > 0:
                                separator_scores[sep] = pattern_count

                    # Standard approach based on consistent column counts
                    if not separator_scores:
                        for sep in separators:
                            # Count consistent occurrences across lines
                            counts = [line.count(sep) for line in lines]
                            if counts and all(c > 0 for c in counts):
                                # Calculate consistency score: higher if all counts are the same
                                consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
                                # Score is average count * consistency
                                separator_scores[sep] = sum(counts) / len(counts) * consistency

                    # Choose the separator with the highest score
                    if separator_scores:
                        return max(separator_scores.items(), key=lambda x: x[1])[0]

                    # Default to comma if we couldn't determine
                    return ','

                # First, sample the file to detect separator and encoding
                with open(file_path, 'rb') as f:
                    # Read first few KB to detect encoding and separator
                    raw_sample = f.read(4096)

                # Try to decode with various encodings
                sample_text = None
                detected_encoding = None

                for encoding in encodings_to_try:
                    try:
                        sample_text = raw_sample.decode(encoding)
                        detected_encoding = encoding
                        break
                    except UnicodeDecodeError:
                        # If this encoding fails, try the next one
                        continue

                if not sample_text:
                    raise ValueError("Could not decode file with any of the attempted encodings")

                # Detect separator from the sample
                separator = detect_separator(sample_text)

                # Determine quote character (default to double quote)
                quotechar = '"'
                if sample_text.count("'") > sample_text.count('"'):
                    quotechar = "'"

                # Read with detected parameters
                try:
                    df = pd.read_csv(
                        file_path,
                        sep=separator,
                        encoding=detected_encoding,
                        engine='python' if separator != ',' else 'c',
                        quotechar=quotechar,
                        doublequote=True
                    )
                except pd.errors.ParserError as e:
                    # If parsing fails, try again with error recovery options
                    print(f"Initial parsing failed on reload: {str(e)}. Trying with error recovery options...")

                    # Try with Python engine which is more flexible
                    try:
                        # First try with pandas >= 1.3 parameters
                        df = pd.read_csv(
                            file_path,
                            sep=separator,
                            encoding=detected_encoding,
                            engine='python',  # Always use python engine for error recovery
                            quotechar=quotechar,
                            doublequote=True,
                            on_bad_lines='warn',  # New parameter in pandas >= 1.3
                            na_values=[''],
                            keep_default_na=True
                        )
                    except TypeError:
                        # Fall back to pandas < 1.3 parameters
                        df = pd.read_csv(
                            file_path,
                            sep=separator,
                            encoding=detected_encoding,
                            engine='python',
                            quotechar=quotechar,
                            doublequote=True,
                            error_bad_lines=False,  # Old parameter
                            warn_bad_lines=True,  # Old parameter
                            na_values=[''],
                            keep_default_na=True
                        )
            elif file_path.endswith('.parquet'):
                # Use fastparquet engine (lighter than pyarrow - saves 147MB in builds)
                df = pd.read_parquet(file_path, engine='fastparquet')
            else:
                return False, "Unsupported file format"

            # Register the dataframe with the original name
            self.register_dataframe(df, original_name, file_path)

            return True, f"Table '{table_name}' reloaded successfully"

        except Exception as e:
            return False, f"Error reloading table: {str(e)}"

    def rename_table(self, old_name, new_name):
        """
        Rename a table in the database.
        Only file-based tables can be renamed; database tables are read-only.

        Args:
            old_name: Current name of the table
            new_name: New name for the table

        Returns:
            Boolean indicating success
        """
        if not old_name in self.loaded_tables:
            return False

        source = self.loaded_tables[old_name]

        # Database tables cannot be renamed (read-only)
        if source.startswith('database:'):
            raise ValueError(f"Cannot rename table '{old_name}' because it's from an attached database (read-only)")

        try:
            # Sanitize the new name
            new_name = self.sanitize_table_name(new_name)

            # Check if new name already exists
            if new_name in self.loaded_tables and new_name != old_name:
                raise ValueError(f"Table '{new_name}' already exists")

            # For file-based tables (registered views in DuckDB):
            # 1. Get the data from the old view
            df = self.conn.execute(f'SELECT * FROM {old_name}').fetchdf()
            # 2. Drop the old view
            self.conn.execute(f'DROP VIEW IF EXISTS {old_name}')
            # 3. Register the data under the new name
            self.conn.register(new_name, df)

            # Update tracking
            self.loaded_tables[new_name] = self.loaded_tables.pop(old_name)
            self.table_columns[new_name] = self.table_columns.pop(old_name)

            return True

        except Exception as e:
            raise Exception(f"Failed to rename table: {str(e)}")

    def sanitize_table_name(self, name):
        """
        Sanitize a table name to be valid in SQL.

        Args:
            name: The proposed table name

        Returns:
            A sanitized table name
        """
        import re
        name = re.sub(r'[^a-zA-Z0-9_]', '_', name)
        # Ensure it starts with a letter
        if not name or not name[0].isalpha():
            name = 'table_' + name
        return name.lower()

    def register_dataframe(self, df, table_name, source='query_result'):
        """
        Register a DataFrame as a table in the database.

        Args:
            df: Pandas DataFrame to register
            table_name: Name for the table
            source: Source of the data (for tracking)

        Returns:
            The table name used (may be different if there was a conflict)
        """
        # Ensure we have a connection
        if not self.is_connected():
            self._init_connection()

        # Sanitize and ensure unique name
        table_name = self.sanitize_table_name(table_name)
        original_name = table_name
        counter = 1
        while table_name in self.loaded_tables:
            table_name = f"{original_name}_{counter}"
            counter += 1

        # Register the DataFrame directly in DuckDB
        self.conn.register(table_name, df)

        # Track the table
        self.loaded_tables[table_name] = source
        self.table_columns[table_name] = [str(col) for col in df.columns.tolist()]

        return table_name

    def get_all_table_columns(self):
        """
        Get all table and column names for autocompletion.

        Returns:
            List of completion words (table names and column names)
        """
        # Start with table names
        completion_words = set(self.loaded_tables.keys())

        # Track column data types for smarter autocompletion
        column_data_types = {}  # {table.column: data_type}

        # Detect potential table relationships for JOIN suggestions
        potential_relationships = []  # [(table1, column1, table2, column2)]

        # Add column names with and without table prefixes, up to reasonable limits
        MAX_COLUMNS_PER_TABLE = 100  # Limit columns to prevent memory issues
        MAX_TABLES_WITH_COLUMNS = 20  # Limit the number of tables to process

        # Sort tables by name to ensure consistent behavior
        table_items = sorted(list(self.table_columns.items()))

        # Process only a limited number of tables
        for table, columns in table_items[:MAX_TABLES_WITH_COLUMNS]:
            # Add each column name by itself
            for col in columns[:MAX_COLUMNS_PER_TABLE]:
                completion_words.add(col)

            # Add qualified column names (table.column)
            for col in columns[:MAX_COLUMNS_PER_TABLE]:
                completion_words.add(f"{table}.{col}")

            # Try to infer table relationships based on column naming
            self._detect_relationships(table, columns, potential_relationships)

            # Try to infer column data types when possible
            if self.is_connected():
                try:
                    self._detect_column_types(table, column_data_types)
                except Exception:
                    pass

        # Add common SQL functions and aggregations with context-aware completions
        sql_functions = [
            # Aggregation functions with completed parentheses
            "COUNT(*)", "COUNT(DISTINCT ", "SUM(", "AVG(", "MIN(", "MAX(",

            # String functions
            "CONCAT(", "SUBSTR(", "LOWER(", "UPPER(", "TRIM(", "REPLACE(", "LENGTH(",
            "REGEXP_REPLACE(", "REGEXP_EXTRACT(", "REGEXP_MATCH(",

            # Date/time functions
            "CURRENT_DATE", "CURRENT_TIME", "CURRENT_TIMESTAMP", "NOW()",
            "EXTRACT(", "DATE_TRUNC(", "DATE_PART(", "DATEADD(", "DATEDIFF(",

            # Type conversion
            "CAST( AS ", "CONVERT(", "TRY_CAST( AS ", "FORMAT(",

            # Conditional functions
            "COALESCE(", "NULLIF(", "GREATEST(", "LEAST(", "IFF(", "IFNULL(",

            # Window functions
            "ROW_NUMBER() OVER (", "RANK() OVER (", "DENSE_RANK() OVER (",
            "LEAD( OVER (", "LAG( OVER (", "FIRST_VALUE( OVER (", "LAST_VALUE( OVER ("
        ]

        # Add common SQL patterns with context awareness
        sql_patterns = [
            # Basic query patterns
            "SELECT * FROM ", "SELECT COUNT(*) FROM ",
            "SELECT DISTINCT ", "GROUP BY ", "ORDER BY ", "HAVING ",
            "LIMIT ", "OFFSET ", "WHERE ",

            # JOIN patterns - complete with ON and common join points
            "INNER JOIN ", "LEFT JOIN ", "RIGHT JOIN ", "FULL OUTER JOIN ",
            "LEFT OUTER JOIN ", "RIGHT OUTER JOIN ", "CROSS JOIN ",

            # Advanced patterns
            "WITH _ AS (", "CASE WHEN _ THEN _ ELSE _ END",
            "OVER (PARTITION BY _ ORDER BY _)",
            "EXISTS (SELECT 1 FROM _ WHERE _)",
            "NOT EXISTS (SELECT 1 FROM _ WHERE _)",

            # Common operator patterns
            "BETWEEN _ AND _", "IN (", "NOT IN (", "IS NULL", "IS NOT NULL",
            "LIKE '%_%'", "NOT LIKE ", "ILIKE ",

            # Data manipulation patterns
            "INSERT INTO _ VALUES (", "INSERT INTO _ (_) VALUES (_)",
            "UPDATE _ SET _ = _ WHERE _", "DELETE FROM _ WHERE _"
        ]

        # Add table relationships as suggested JOIN patterns
        for table1, col1, table2, col2 in potential_relationships:
            join_pattern = f"JOIN {table2} ON {table1}.{col1} = {table2}.{col2}"
            completion_words.add(join_pattern)

            # Also add the reverse relationship
            join_pattern_rev = f"JOIN {table1} ON {table2}.{col2} = {table1}.{col1}"
            completion_words.add(join_pattern_rev)

        # Add all SQL extras to the completion words
        completion_words.update(sql_functions)
        completion_words.update(sql_patterns)

        # Add common data-specific comparison patterns based on column types
        for col_name, data_type in column_data_types.items():
            if 'INT' in data_type.upper() or 'NUM' in data_type.upper() or 'FLOAT' in data_type.upper():
                # Numeric columns
                completion_words.add(f"{col_name} > ")
                completion_words.add(f"{col_name} < ")
                completion_words.add(f"{col_name} >= ")
                completion_words.add(f"{col_name} <= ")
                completion_words.add(f"{col_name} BETWEEN ")
            elif 'DATE' in data_type.upper() or 'TIME' in data_type.upper():
                # Date/time columns
                completion_words.add(f"{col_name} > CURRENT_DATE")
                completion_words.add(f"{col_name} < CURRENT_DATE")
                completion_words.add(f"{col_name} BETWEEN CURRENT_DATE - INTERVAL ")
                completion_words.add(f"EXTRACT(YEAR FROM {col_name})")
                completion_words.add(f"DATE_TRUNC('month', {col_name})")
            elif 'CHAR' in data_type.upper() or 'TEXT' in data_type.upper() or 'VARCHAR' in data_type.upper():
                # String columns
                completion_words.add(f"{col_name} LIKE '%")
                completion_words.add(f"{col_name} ILIKE '%")
                completion_words.add(f"LOWER({col_name}) = ")
                completion_words.add(f"UPPER({col_name}) = ")

        # Convert set back to list and sort for better usability
        completion_list = list(completion_words)
        completion_list.sort(key=lambda x: (not x.isupper(), x))  # Prioritize SQL keywords

        return completion_list

    def _detect_relationships(self, table, columns, potential_relationships):
        """
        Detect potential relationships between tables based on column naming patterns.

        Args:
            table: Current table name
            columns: List of column names in this table
            potential_relationships: List to populate with detected relationships
        """
        # Look for columns that might be foreign keys (common patterns)
        for col in columns:
            # Common ID patterns: table_id, tableId, TableID, etc.
            if col.lower().endswith('_id') or col.lower().endswith('id'):
                # Extract potential table name from column name
                if col.lower().endswith('_id'):
                    potential_table = col[:-3]  # Remove '_id'
                else:
                    # Try to extract tablename from camelCase or PascalCase
                    potential_table = col[:-2]  # Remove 'Id'

                # Normalize to lowercase for comparison
                potential_table = potential_table.lower()

                # Check if this potential table exists in our loaded tables
                for existing_table in self.loaded_tables.keys():
                    # Normalize for comparison
                    existing_lower = existing_table.lower()

                    # If we find a matching table, it's likely a relationship
                    if existing_lower == potential_table or existing_lower.endswith(f"_{potential_table}"):
                        # Add this relationship
                        # We assume the target column in the referenced table is 'id'
                        potential_relationships.append((table, col, existing_table, 'id'))
                        break

            # Also detect columns with same name across tables (potential join points)
            for other_table, other_columns in self.table_columns.items():
                if other_table != table and col in other_columns:
                    # Same column name in different tables - potential join point
                    potential_relationships.append((table, col, other_table, col))

    def _detect_column_types(self, table, column_data_types):
        """
        Detect column data types for a table to enable smarter autocompletion.

        Args:
            table: Table name to analyze
            column_data_types: Dictionary to populate with column data types
        """
        if not self.is_connected():
            return

        try:
            # Determine the database to query
            source = self.loaded_tables.get(table, '')
            if source.startswith('database:'):
                db_name = source.split(':')[1]
                # Use duckdb_columns() for attached databases
                query = f"""
                    SELECT column_name, data_type
                    FROM duckdb_columns()
                    WHERE database_name='{db_name}' AND table_name='{table}'
                """
            else:
                # For in-memory tables, use information_schema
                query = f"""
                    SELECT column_name, data_type
                    FROM information_schema.columns
                    WHERE table_name='{table}' AND table_schema='main'
                """

            result = self.conn.execute(query).fetchdf()

            for _, row in result.iterrows():
                col_name = row['column_name']
                data_type = row['data_type']

                # Store as table.column: data_type for qualified lookups
                column_data_types[f"{table}.{col_name}"] = data_type
                # Also store just column: data_type for unqualified lookups
                column_data_types[col_name] = data_type
        except Exception:
            # Ignore errors in type detection - this is just for enhancement
            pass

    def load_specific_table(self, table_name, database_alias='db'):
        """
        Load metadata for a specific table from an attached database.
        This is used when we know which tables we want to load rather than loading all tables.

        Args:
            table_name: Name of the table to load
            database_alias: The alias of the attached database (default: 'db')

        Returns:
            Boolean indicating if the table was found and loaded
        """
        if not self.is_connected():
            return False

        if database_alias not in self.attached_databases:
            return False

        try:
            # Check if the table exists in the attached database using duckdb_tables()
            query = f"SELECT table_name FROM duckdb_tables() WHERE table_name='{table_name}' AND database_name='{database_alias}'"
            result = self.conn.execute(query).fetchdf()

            if not result.empty:
                # Get column names for the table using duckdb_columns()
                try:
                    column_query = f"SELECT column_name FROM duckdb_columns() WHERE table_name='{table_name}' AND database_name='{database_alias}'"
                    columns = self.conn.execute(column_query).fetchdf()
                    self.table_columns[table_name] = columns['column_name'].tolist()
                except Exception:
                    self.table_columns[table_name] = []

                # Register the table
                self.loaded_tables[table_name] = f'database:{database_alias}'

                # Add to the database's table list
                if 'tables' not in self.attached_databases[database_alias]:
                    self.attached_databases[database_alias]['tables'] = []
                if table_name not in self.attached_databases[database_alias]['tables']:
                    self.attached_databases[database_alias]['tables'].append(table_name)

                return True

            return False

        except Exception:
            return False
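
A minimal usage sketch of the DatabaseManager defined above (illustrative only, not part of the packaged wheel; the file paths 'sales.csv' and 'warehouse.duckdb' and the query are hypothetical, and the calls mirror the public methods shown in this diff):

from sqlshell.db.database_manager import DatabaseManager

# Hypothetical inputs for illustration; any CSV plus a DuckDB or SQLite file would do.
manager = DatabaseManager()                       # opens an in-memory DuckDB connection
table_name, df = manager.load_file('sales.csv')   # registers the CSV as a DuckDB view named after the file
manager.open_database('warehouse.duckdb')         # attaches an external database under the alias 'db'

# execute_query() rewrites unqualified names of attached-database tables to 'db.<table>'
# via _qualify_table_names() and returns a pandas DataFrame.
result = manager.execute_query(f"SELECT COUNT(*) AS n FROM {table_name}")
print(manager.get_connection_info())              # e.g. "In-memory DuckDB | Attached: db (DUCKDB)"
print(result)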