sqlshell-0.2.2-py3-none-any.whl → sqlshell-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of sqlshell might be problematic.
- sqlshell/README.md +5 -1
- sqlshell/__init__.py +35 -5
- sqlshell/create_test_data.py +29 -0
- sqlshell/db/__init__.py +2 -1
- sqlshell/db/database_manager.py +336 -23
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/main.py +784 -143
- sqlshell/query_tab.py +592 -7
- sqlshell/table_list.py +90 -1
- sqlshell/ui/filter_header.py +36 -1
- sqlshell/utils/profile_column.py +2515 -0
- sqlshell/utils/profile_distributions.py +613 -0
- sqlshell/utils/profile_foreign_keys.py +547 -0
- sqlshell/utils/profile_ohe.py +631 -0
- sqlshell-0.3.0.dist-info/METADATA +400 -0
- {sqlshell-0.2.2.dist-info → sqlshell-0.3.0.dist-info}/RECORD +21 -14
- {sqlshell-0.2.2.dist-info → sqlshell-0.3.0.dist-info}/WHEEL +1 -1
- sqlshell-0.2.2.dist-info/METADATA +0 -198
- {sqlshell-0.2.2.dist-info → sqlshell-0.3.0.dist-info}/entry_points.txt +0 -0
- {sqlshell-0.2.2.dist-info → sqlshell-0.3.0.dist-info}/top_level.txt +0 -0
sqlshell/README.md
CHANGED
@@ -1,4 +1,6 @@
-# SQLShell
+# SQLShell - DEPRECATED README
+
+**NOTE: This README is deprecated. Please refer to the main README.md file in the root directory of the repository for the most up-to-date information.**
 
 A powerful SQL shell with GUI interface for data analysis. SQLShell provides an intuitive interface for working with various data formats (CSV, Excel, Parquet) using SQL queries powered by DuckDB.
 
@@ -12,6 +14,7 @@ A powerful SQL shell with GUI interface for data analysis. SQLShell provides an
 - Table preview functionality
 - Built-in test data generation
 - Support for multiple concurrent table views
+- "Explain Column" feature for analyzing relationships between data columns
 
 ## Installation
 
@@ -45,6 +48,7 @@ This will open the GUI interface where you can:
 3. Execute queries using the "Execute" button or Ctrl+Enter
 4. View results in the table view below
 5. Load sample test data using the "Test" button
+6. Right-click on column headers in the results to access features like sorting, filtering, and the "Explain Column" analysis tool
 
 ## Requirements
 
sqlshell/__init__.py
CHANGED
@@ -2,13 +2,43 @@
 SQLShell - A powerful SQL shell with GUI interface for data analysis
 """
 
-__version__ = "0.2.2"
+__version__ = "0.2.3"
 __author__ = "SQLShell Team"
 
-from sqlshell.main import main
+from sqlshell.main import main, SQLShell
+from PyQt6.QtWidgets import QApplication
+import sys
 
-def start():
-    """Start the SQLShell application.
-
+def start(database_path=None):
+    """Start the SQLShell application.
+
+    Args:
+        database_path (str, optional): Path to a database file to open. If provided,
+            SQLShell will automatically open this database on startup.
+    """
+    app = QApplication(sys.argv)
+    window = SQLShell()
+
+    if database_path:
+        try:
+            # Open the database
+            window.db_manager.open_database(database_path, load_all_tables=True)
+
+            # Update UI with tables from the database
+            for table_name, source in window.db_manager.loaded_tables.items():
+                if source == 'database':
+                    window.tables_list.add_table_item(table_name, "database")
+
+            # Update the completer with table and column names
+            window.update_completer()
+
+            # Update status bar
+            window.statusBar().showMessage(f"Connected to database: {database_path}")
+            window.db_info_label.setText(window.db_manager.get_connection_info())
+        except Exception as e:
+            print(f"Error opening database: {e}")
+
+    window.show()
+    sys.exit(app.exec())
 
 # SQLShell package initialization
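The reworked `start()` entry point means a database can now be opened straight from Python. A minimal usage sketch based on the signature above (the database path is illustrative, not a file shipped with the package):

```python
# Hypothetical usage of the new start() entry point added in this release.
# "analytics.duckdb" is an illustrative path.
import sqlshell

sqlshell.start("analytics.duckdb")  # open the GUI with this database pre-loaded
# sqlshell.start()                  # or launch with no database attached
```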
sqlshell/create_test_data.py
CHANGED
@@ -10,6 +10,35 @@ np.random.seed(42)
 OUTPUT_DIR = 'test_data'
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
+def create_california_housing_data(output_file='california_housing_data.parquet'):
+    """Use the real world california housing dataset"""
+    # Load the dataset
+    df = pd.read_csv('https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv')
+
+    # Save to Parquet
+    df.to_parquet(output_file)
+    return df
+
+def create_large_customer_data(num_customers=1_000_000, chunk_size=100_000, output_file='large_customer_data.parquet'):
+    """Create a large customer dataset"""
+    # Generate customer data
+    data = {
+        'CustomerID': range(1, num_customers + 1),
+        'FirstName': [f'Customer{i}' for i in range(1, num_customers + 1)],
+        'LastName': [f'Lastname{i}' for i in range(1, num_customers + 1)],
+        'Email': [f'customer{i}@example.com' for i in range(1, num_customers + 1)],
+        'JoinDate': [datetime.now() - timedelta(days=np.random.randint(1, 1000))
+                     for _ in range(num_customers)],
+        'CustomerType': np.random.choice(['Regular', 'Premium', 'VIP'], num_customers),
+        'CreditScore': np.random.randint(300, 851, num_customers)
+    }
+
+    # Create DataFrame
+    df = pd.DataFrame(data)
+
+    return df
+
+
 def create_sales_data(num_records=1000):
     # Generate dates for the last 365 days
     end_date = datetime.now()
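For orientation, a small usage sketch of the two new generators (not part of the package; `num_customers` is reduced so the example runs quickly, and note that `create_large_customer_data` only returns the DataFrame despite its `output_file` parameter):

```python
# Usage sketch for the new test-data generators shown above.
from sqlshell.create_test_data import (
    create_california_housing_data,
    create_large_customer_data,
)

housing = create_california_housing_data()            # downloads the CSV, writes Parquet
customers = create_large_customer_data(num_customers=10_000)
customers.to_parquet('large_customer_data.parquet')   # saving is left to the caller
print(housing.shape, customers.shape)
```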
sqlshell/db/__init__.py
CHANGED
sqlshell/db/database_manager.py
CHANGED
@@ -219,7 +219,7 @@ class DatabaseManager:
         Load data from a file into the database.
 
         Args:
-            file_path: Path to the data file (Excel, CSV, Parquet, Delta)
+            file_path: Path to the data file (Excel, CSV, TXT, Parquet, Delta)
 
         Returns:
             Tuple of (table_name, DataFrame) for the loaded data
@@ -240,8 +240,29 @@
                 # Load the Delta table
                 import deltalake
                 delta_table = deltalake.DeltaTable(file_path)
-
+
+                # Get the schema to identify decimal columns
+                schema = delta_table.schema()
+                decimal_columns = []
+
+                # Identify decimal columns from schema
+                for field in schema.fields:
+                    # Use string representation to check for decimal
+                    if 'decimal' in str(field.type).lower():
+                        decimal_columns.append(field.name)
+
+                # Read the data
                 df = delta_table.to_pandas()
+
+                # Try to convert decimal columns to float64, warn if not possible
+                for col in decimal_columns:
+                    if col in df.columns:
+                        try:
+                            df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
+                            if df[col].isna().any():
+                                print(f"Warning: Some values in column '{col}' could not be converted to float64 and are set as NaN.")
+                        except Exception as e:
+                            print(f"Warning: Could not convert column '{col}' to float64: {e}")
             except Exception as e:
                 raise ValueError(f"Error loading Delta table: {str(e)}")
         elif file_path.endswith(('.xlsx', '.xls')):
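DECIMAL columns in a Delta table typically materialize as Python decimal.Decimal objects in pandas, which is what the conversion pass above works around. A minimal standalone sketch of that conversion, using illustrative in-memory data rather than a real Delta table:

```python
# Sketch of the decimal -> float64 conversion performed in the hunk above.
from decimal import Decimal
import pandas as pd

df = pd.DataFrame({"price": [Decimal("19.99"), Decimal("5.25"), None]})
df["price"] = pd.to_numeric(df["price"], errors="coerce").astype("float64")
print(df["price"].dtype)  # float64; unconvertible values become NaN
```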
@@ -267,37 +288,208 @@
             except Exception:
                 # Fallback to standard reading method
                 df = pd.read_excel(file_path)
-        elif file_path.endswith('.csv'):
-            # For CSV files,
+        elif file_path.endswith(('.csv', '.txt')):
+            # For CSV and TXT files, detect separator and use chunking for large files
             try:
                 # Check if it's a large file
                 file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
 
-
-
-
+                # Try multiple encodings if needed
+                encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+                # Detect the separator automatically
+                def detect_separator(sample_data):
+                    # Common separators to check
+                    separators = [',', ';', '\t']
+                    separator_scores = {}
+
+                    # Split into lines and analyze
+                    lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                    if not lines:
+                        return ','  # Default if no content
+
+                    # Check for quoted content with separators
+                    has_quotes = '"' in sample_data or "'" in sample_data
 
-            #
-
+                    # If we have quoted content, use a different approach
+                    if has_quotes:
+                        for sep in separators:
+                            # Look for patterns like "value";
+                            pattern_count = 0
+                            for line in lines:
+                                # Count occurrences of quote + separator
+                                double_quote_pattern = f'"{sep}'
+                                single_quote_pattern = f"'{sep}"
+                                pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                            # If we found clear quote+separator patterns, this is likely our separator
+                            if pattern_count > 0:
+                                separator_scores[sep] = pattern_count
 
-            #
-
-
-
-
-
+                    # Standard approach based on consistent column counts
+                    if not separator_scores:
+                        for sep in separators:
+                            # Count consistent occurrences across lines
+                            counts = [line.count(sep) for line in lines]
+                            if counts and all(c > 0 for c in counts):
+                                # Calculate consistency score: higher if all counts are the same
+                                consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                                # Score is average count * consistency
+                                separator_scores[sep] = sum(counts) / len(counts) * consistency
 
-
+                    # Choose the separator with the highest score
+                    if separator_scores:
+                        return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                    # Default to comma if we couldn't determine
+                    return ','
+
+                # First, sample the file to detect separator
+                with open(file_path, 'rb') as f:
+                    # Read first few KB to detect encoding and separator
+                    raw_sample = f.read(4096)
+
+                # Try to decode with various encodings
+                sample_text = None
+                detected_encoding = None
+
+                for encoding in encodings_to_try:
+                    try:
+                        sample_text = raw_sample.decode(encoding)
+                        detected_encoding = encoding
+                        break
+                    except UnicodeDecodeError:
+                        continue
+
+                if not sample_text:
+                    raise ValueError("Could not decode file with any of the attempted encodings")
+
+                # Detect separator from the sample
+                separator = detect_separator(sample_text)
+
+                # Determine quote character (default to double quote)
+                quotechar = '"'
+                if sample_text.count("'") > sample_text.count('"'):
+                    quotechar = "'"
+
+                if file_size > 50:  # If file is larger than 50MB
+                    # Read the first chunk to get column types
+                    try:
+                        df_preview = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            nrows=1000,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        )
+
+                        # Use optimized dtypes for better memory usage
+                        dtypes = {col: df_preview[col].dtype for col in df_preview.columns}
+
+                        # Read again with chunk processing, combining up to 100k rows
+                        chunks = []
+                        for chunk in pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            dtype=dtypes,
+                            chunksize=10000,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        ):
+                            chunks.append(chunk)
+                            if len(chunks) * 10000 >= 100000:  # Cap at 100k rows
+                                break
+
+                        df = pd.concat(chunks, ignore_index=True)
+                    except pd.errors.ParserError as e:
+                        # If parsing fails, try again with error recovery options
+                        print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                        # Try with Python engine which is more flexible
+                        try:
+                            # First try with pandas >= 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',  # Always use python engine for error recovery
+                                quotechar=quotechar,
+                                doublequote=True,
+                                on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+                        except TypeError:
+                            # Fall back to pandas < 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',
+                                quotechar=quotechar,
+                                doublequote=True,
+                                error_bad_lines=False,  # Old parameter
+                                warn_bad_lines=True,  # Old parameter
+                                na_values=[''],
+                                keep_default_na=True
+                            )
                 else:
                     # For smaller files, read everything at once
-
-
-
-
+                    try:
+                        df = pd.read_csv(
+                            file_path,
+                            sep=separator,
+                            encoding=detected_encoding,
+                            engine='python' if separator != ',' else 'c',
+                            quotechar=quotechar,
+                            doublequote=True
+                        )
+                    except pd.errors.ParserError as e:
+                        # If parsing fails, try again with error recovery options
+                        print(f"Initial parsing failed: {str(e)}. Trying with error recovery options...")
+
+                        # Try with Python engine which is more flexible
+                        try:
+                            # First try with pandas >= 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',  # Always use python engine for error recovery
+                                quotechar=quotechar,
+                                doublequote=True,
+                                on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+                        except TypeError:
+                            # Fall back to pandas < 1.3 parameters
+                            df = pd.read_csv(
+                                file_path,
+                                sep=separator,
+                                encoding=detected_encoding,
+                                engine='python',
+                                quotechar=quotechar,
+                                doublequote=True,
+                                error_bad_lines=False,  # Old parameter
+                                warn_bad_lines=True,  # Old parameter
+                                na_values=[''],
+                                keep_default_na=True
+                            )
+            except Exception as e:
+                # Log the error for debugging
+                import traceback
+                print(f"Error loading CSV/TXT file: {str(e)}")
+                print(traceback.format_exc())
+                raise ValueError(f"Error loading CSV/TXT file: {str(e)}")
         elif file_path.endswith('.parquet'):
             df = pd.read_parquet(file_path)
         else:
-            raise ValueError("Unsupported file format")
+            raise ValueError("Unsupported file format. Supported formats: .xlsx, .xls, .csv, .txt, .parquet, and Delta tables.")
 
         # Generate table name from file name
         base_name = os.path.splitext(os.path.basename(file_path))[0]
@@ -448,8 +640,128 @@ class DatabaseManager:
             df = delta_table.to_pandas()
         elif file_path.endswith(('.xlsx', '.xls')):
             df = pd.read_excel(file_path)
-        elif file_path.endswith('.csv'):
-
+        elif file_path.endswith(('.csv', '.txt')):
+            # Try multiple encodings for CSV/TXT files
+            encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
+
+            # Detect the separator automatically
+            def detect_separator(sample_data):
+                # Common separators to check
+                separators = [',', ';', '\t']
+                separator_scores = {}
+
+                # Split into lines and analyze
+                lines = [line.strip() for line in sample_data.split('\n') if line.strip()]
+                if not lines:
+                    return ','  # Default if no content
+
+                # Check for quoted content with separators
+                has_quotes = '"' in sample_data or "'" in sample_data
+
+                # If we have quoted content, use a different approach
+                if has_quotes:
+                    for sep in separators:
+                        # Look for patterns like "value";
+                        pattern_count = 0
+                        for line in lines:
+                            # Count occurrences of quote + separator
+                            double_quote_pattern = f'"{sep}'
+                            single_quote_pattern = f"'{sep}"
+                            pattern_count += line.count(double_quote_pattern) + line.count(single_quote_pattern)
+
+                        # If we found clear quote+separator patterns, this is likely our separator
+                        if pattern_count > 0:
+                            separator_scores[sep] = pattern_count
+
+                # Standard approach based on consistent column counts
+                if not separator_scores:
+                    for sep in separators:
+                        # Count consistent occurrences across lines
+                        counts = [line.count(sep) for line in lines]
+                        if counts and all(c > 0 for c in counts):
+                            # Calculate consistency score: higher if all counts are the same
+                            consistency = 1.0 if all(c == counts[0] for c in counts) else 0.5
+                            # Score is average count * consistency
+                            separator_scores[sep] = sum(counts) / len(counts) * consistency
+
+                # Choose the separator with the highest score
+                if separator_scores:
+                    return max(separator_scores.items(), key=lambda x: x[1])[0]
+
+                # Default to comma if we couldn't determine
+                return ','
+
+            # First, sample the file to detect separator and encoding
+            with open(file_path, 'rb') as f:
+                # Read first few KB to detect encoding and separator
+                raw_sample = f.read(4096)
+
+            # Try to decode with various encodings
+            sample_text = None
+            detected_encoding = None
+
+            for encoding in encodings_to_try:
+                try:
+                    sample_text = raw_sample.decode(encoding)
+                    detected_encoding = encoding
+                    break
+                except UnicodeDecodeError:
+                    # If this encoding fails, try the next one
+                    continue
+
+            if not sample_text:
+                raise ValueError("Could not decode file with any of the attempted encodings")
+
+            # Detect separator from the sample
+            separator = detect_separator(sample_text)
+
+            # Determine quote character (default to double quote)
+            quotechar = '"'
+            if sample_text.count("'") > sample_text.count('"'):
+                quotechar = "'"
+
+            # Read with detected parameters
+            try:
+                df = pd.read_csv(
+                    file_path,
+                    sep=separator,
+                    encoding=detected_encoding,
+                    engine='python' if separator != ',' else 'c',
+                    quotechar=quotechar,
+                    doublequote=True
+                )
+            except pd.errors.ParserError as e:
+                # If parsing fails, try again with error recovery options
+                print(f"Initial parsing failed on reload: {str(e)}. Trying with error recovery options...")
+
+                # Try with Python engine which is more flexible
+                try:
+                    # First try with pandas >= 1.3 parameters
+                    df = pd.read_csv(
+                        file_path,
+                        sep=separator,
+                        encoding=detected_encoding,
+                        engine='python',  # Always use python engine for error recovery
+                        quotechar=quotechar,
+                        doublequote=True,
+                        on_bad_lines='warn',  # New parameter in pandas >= 1.3
+                        na_values=[''],
+                        keep_default_na=True
+                    )
+                except TypeError:
+                    # Fall back to pandas < 1.3 parameters
+                    df = pd.read_csv(
+                        file_path,
+                        sep=separator,
+                        encoding=detected_encoding,
+                        engine='python',
+                        quotechar=quotechar,
+                        doublequote=True,
+                        error_bad_lines=False,  # Old parameter
+                        warn_bad_lines=True,  # Old parameter
+                        na_values=[''],
+                        keep_default_na=True
+                    )
         elif file_path.endswith('.parquet'):
             df = pd.read_parquet(file_path)
         else:
@@ -547,6 +859,7 @@ class DatabaseManager:
         if self.connection_type == 'sqlite':
            df.to_sql(table_name, self.conn, index=False, if_exists='replace')
         else:  # duckdb
+            # Register the DataFrame directly
            self.conn.register(table_name, df)
 
         # Track the table
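One pattern repeated throughout these CSV/TXT hunks is probing for the pandas error-handling API at call time: pandas >= 1.3 accepts `on_bad_lines`, while older releases expect `error_bad_lines`/`warn_bad_lines`. A condensed standalone sketch of that fallback (the helper name is illustrative, not part of sqlshell):

```python
# Condensed sketch of the version fallback used above: try the pandas >= 1.3
# keyword first, and fall back to the pre-1.3 keywords if pandas rejects it.
import pandas as pd

def read_csv_tolerant(path, sep=',', encoding='utf-8', quotechar='"'):
    common = dict(sep=sep, encoding=encoding, engine='python',
                  quotechar=quotechar, doublequote=True,
                  na_values=[''], keep_default_na=True)
    try:
        return pd.read_csv(path, on_bad_lines='warn', **common)   # pandas >= 1.3
    except TypeError:
        return pd.read_csv(path, error_bad_lines=False,           # pandas < 1.3
                           warn_bad_lines=True, **common)
```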