sqlshell 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. sqlshell/__init__.py +84 -0
  2. sqlshell/__main__.py +4926 -0
  3. sqlshell/ai_autocomplete.py +392 -0
  4. sqlshell/ai_settings_dialog.py +337 -0
  5. sqlshell/context_suggester.py +768 -0
  6. sqlshell/create_test_data.py +152 -0
  7. sqlshell/data/create_test_data.py +137 -0
  8. sqlshell/db/__init__.py +6 -0
  9. sqlshell/db/database_manager.py +1318 -0
  10. sqlshell/db/export_manager.py +188 -0
  11. sqlshell/editor.py +1166 -0
  12. sqlshell/editor_integration.py +127 -0
  13. sqlshell/execution_handler.py +421 -0
  14. sqlshell/menus.py +262 -0
  15. sqlshell/notification_manager.py +370 -0
  16. sqlshell/query_tab.py +904 -0
  17. sqlshell/resources/__init__.py +1 -0
  18. sqlshell/resources/icon.png +0 -0
  19. sqlshell/resources/logo_large.png +0 -0
  20. sqlshell/resources/logo_medium.png +0 -0
  21. sqlshell/resources/logo_small.png +0 -0
  22. sqlshell/resources/splash_screen.gif +0 -0
  23. sqlshell/space_invaders.py +501 -0
  24. sqlshell/splash_screen.py +405 -0
  25. sqlshell/sqlshell/__init__.py +5 -0
  26. sqlshell/sqlshell/create_test_data.py +118 -0
  27. sqlshell/sqlshell/create_test_databases.py +96 -0
  28. sqlshell/sqlshell_demo.png +0 -0
  29. sqlshell/styles.py +257 -0
  30. sqlshell/suggester_integration.py +330 -0
  31. sqlshell/syntax_highlighter.py +124 -0
  32. sqlshell/table_list.py +996 -0
  33. sqlshell/ui/__init__.py +6 -0
  34. sqlshell/ui/bar_chart_delegate.py +49 -0
  35. sqlshell/ui/filter_header.py +469 -0
  36. sqlshell/utils/__init__.py +16 -0
  37. sqlshell/utils/profile_cn2.py +1661 -0
  38. sqlshell/utils/profile_column.py +2635 -0
  39. sqlshell/utils/profile_distributions.py +616 -0
  40. sqlshell/utils/profile_entropy.py +347 -0
  41. sqlshell/utils/profile_foreign_keys.py +779 -0
  42. sqlshell/utils/profile_keys.py +2834 -0
  43. sqlshell/utils/profile_ohe.py +934 -0
  44. sqlshell/utils/profile_ohe_advanced.py +754 -0
  45. sqlshell/utils/profile_ohe_comparison.py +237 -0
  46. sqlshell/utils/profile_prediction.py +926 -0
  47. sqlshell/utils/profile_similarity.py +876 -0
  48. sqlshell/utils/search_in_df.py +90 -0
  49. sqlshell/widgets.py +400 -0
  50. sqlshell-0.4.4.dist-info/METADATA +441 -0
  51. sqlshell-0.4.4.dist-info/RECORD +54 -0
  52. sqlshell-0.4.4.dist-info/WHEEL +5 -0
  53. sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
  54. sqlshell-0.4.4.dist-info/top_level.txt +1 -0
sqlshell/utils/profile_prediction.py
@@ -0,0 +1,926 @@
1
+ """
2
+ Column Prediction Module
3
+
4
+ This module provides prediction functionality for columns using modern machine learning techniques.
5
+ It creates a new "Predict <column_name>" column with predictions based on other columns in the dataframe.
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
12
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
13
+ from sklearn.linear_model import LinearRegression, LogisticRegression
14
+ from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
15
+ import warnings
16
+ import joblib
17
+ import os
18
+ import json
19
+ from datetime import datetime
20
+ warnings.filterwarnings('ignore')
21
+
22
+ from PyQt6.QtWidgets import (QMainWindow, QVBoxLayout, QHBoxLayout, QWidget, QLabel,
23
+ QTableView, QPushButton, QProgressBar, QComboBox, QCheckBox,
24
+ QTextEdit, QSplitter, QHeaderView, QMessageBox, QGroupBox,
25
+ QFormLayout, QSpinBox, QDoubleSpinBox, QFileDialog)
26
+ from PyQt6.QtCore import Qt, QAbstractTableModel, QModelIndex, QThread, pyqtSignal, QTimer
27
+ from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QPalette, QBrush
28
+
29
+
30
+ class PredictionThread(QThread):
31
+ """Worker thread for background prediction model training and evaluation"""
32
+
33
+ progress = pyqtSignal(int, str)
34
+ result = pyqtSignal(object)
35
+ error = pyqtSignal(str)
36
+
37
+ def __init__(self, df, target_column, prediction_type='auto', test_size=0.2, random_state=42):
38
+ super().__init__()
39
+ self.df = df.copy()
40
+ self.target_column = target_column
41
+ self.prediction_type = prediction_type
42
+ self.test_size = test_size
43
+ self.random_state = random_state
44
+ self._is_canceled = False
45
+
46
+ def cancel(self):
47
+ """Mark the thread as canceled"""
48
+ self._is_canceled = True
49
+
50
+ def detect_prediction_type(self, target_series):
51
+ """Automatically detect whether to use regression or classification"""
52
+ if pd.api.types.is_numeric_dtype(target_series):
53
+ # Check if it looks like a categorical variable (few unique values)
54
+ unique_count = target_series.nunique()
55
+ total_count = len(target_series.dropna())
56
+
57
+ if unique_count <= 10 or (unique_count / total_count) < 0.05:
58
+ return 'classification'
59
+ else:
60
+ return 'regression'
61
+ else:
62
+ return 'classification'
63
+
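The auto-detection above is a pure cardinality heuristic: a numeric target with at most 10 distinct values, or with fewer than 5% distinct values overall, is treated as categorical, and any non-numeric target is treated as classification. A standalone sketch of the same rule (illustrative only, not part of the packaged module):

    import pandas as pd

    def guess_task(target: pd.Series) -> str:
        # Mirrors the thresholds used in PredictionThread.detect_prediction_type.
        if not pd.api.types.is_numeric_dtype(target):
            return 'classification'
        unique_count = target.nunique()
        total_count = len(target.dropna())
        if unique_count <= 10 or (unique_count / total_count) < 0.05:
            return 'classification'
        return 'regression'

    # guess_task(pd.Series([0, 1, 1, 0]))       -> 'classification'
    # guess_task(pd.Series(range(1000)) * 1.7)  -> 'regression'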
64
+ def prepare_features(self, df, target_column):
65
+ """Prepare features for machine learning"""
66
+ # Separate features and target
67
+ X_all = df.drop(columns=[target_column])
68
+ y_all = df[target_column]
69
+
70
+ # Identify rows with non-null targets (for training/testing)
71
+ # and rows with null targets (for prediction)
72
+ non_null_mask = ~pd.isna(y_all)
73
+ null_mask = pd.isna(y_all)
74
+
75
+ # Get training data (non-null targets only)
76
+ X_train_data = X_all[non_null_mask]
77
+ y_train_data = y_all[non_null_mask]
78
+
79
+ if len(X_train_data) == 0:
80
+ raise ValueError("No valid data with non-missing target values for training")
81
+
82
+ # Handle categorical features for ALL data (including null targets)
83
+ categorical_cols = X_all.select_dtypes(include=['object', 'category']).columns
84
+ numerical_cols = X_all.select_dtypes(include=[np.number]).columns
85
+
86
+ # Process features consistently across all data
87
+ X_processed = X_all.copy()
88
+ label_encoders = {}
89
+
90
+ # Encode categorical variables
91
+ for col in categorical_cols:
92
+ # Fill missing values with 'missing'
93
+ X_processed[col] = X_processed[col].fillna('missing')
94
+
95
+ # Only encode if column has reasonable cardinality (based on training data)
96
+ if X_train_data[col].fillna('missing').nunique() < len(X_train_data) * 0.5:
97
+ le = LabelEncoder()
98
+ # Fit encoder on all data (including null targets) to handle unseen categories
99
+ le.fit(X_processed[col].astype(str))
100
+ X_processed[col] = le.transform(X_processed[col].astype(str))
101
+ label_encoders[col] = le
102
+ else:
103
+ # Drop high cardinality categorical columns
104
+ X_processed = X_processed.drop(columns=[col])
105
+
106
+ # Handle numerical features
107
+ for col in numerical_cols:
108
+ if col in X_processed.columns: # Column might have been dropped
109
+ # Fill missing values with median (computed from training data)
110
+ median_val = X_train_data[col].median()
111
+ X_processed[col] = X_processed[col].fillna(median_val)
112
+
113
+ # Return processed features for training and the target values
114
+ return X_processed[non_null_mask], y_train_data, label_encoders, X_processed, null_mask
115
+
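prepare_features applies three rules: object columns whose cardinality on the training rows is below 50% of the training size are label-encoded (the encoder is fitted on all rows so null-target rows encode too), higher-cardinality object columns are dropped, and numeric gaps are filled with the median of the training rows. A toy illustration of the encode/drop/fill steps (standalone sketch, not calling the class):

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    toy = pd.DataFrame({
        'city':    ['NY', 'LA', None, 'NY'],   # low cardinality  -> label encoded
        'user_id': ['u1', 'u2', 'u3', 'u4'],   # unique per row   -> dropped
        'age':     [30.0, None, 25.0, 40.0],   # numeric with gap -> median filled
    })
    toy['city'] = LabelEncoder().fit_transform(toy['city'].fillna('missing').astype(str))
    toy = toy.drop(columns=['user_id'])
    toy['age'] = toy['age'].fillna(toy['age'].median())
    print(toy)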
116
+ def run(self):
117
+ try:
118
+ if self._is_canceled:
119
+ return
120
+
121
+ self.progress.emit(10, "Preparing data...")
122
+
123
+ # Check if target column exists
124
+ if self.target_column not in self.df.columns:
125
+ raise ValueError(f"Target column '{self.target_column}' not found")
126
+
127
+ # Prepare features
128
+ X, y, label_encoders, X_all, null_mask = self.prepare_features(self.df, self.target_column)
129
+
130
+ if self._is_canceled:
131
+ return
132
+
133
+ self.progress.emit(25, "Determining prediction type...")
134
+
135
+ # Determine prediction type if auto
136
+ if self.prediction_type == 'auto':
137
+ prediction_type = self.detect_prediction_type(y)
138
+ else:
139
+ prediction_type = self.prediction_type
140
+
141
+ self.progress.emit(35, f"Using {prediction_type} approach...")
142
+
143
+ # Encode target variable if classification
144
+ target_encoder = None
145
+ if prediction_type == 'classification' and not pd.api.types.is_numeric_dtype(y):
146
+ target_encoder = LabelEncoder()
147
+ y = target_encoder.fit_transform(y.astype(str))
148
+
149
+ if self._is_canceled:
150
+ return
151
+
152
+ self.progress.emit(50, "Training models...")
153
+
154
+ # Split data
155
+ X_train, X_test, y_train, y_test = train_test_split(
156
+ X, y, test_size=self.test_size, random_state=self.random_state
157
+ )
158
+
159
+ # Scale features for linear models
160
+ scaler = StandardScaler()
161
+ X_train_scaled = scaler.fit_transform(X_train)
162
+ X_test_scaled = scaler.transform(X_test)
163
+
164
+ models = {}
165
+ scores = {}
166
+ predictions = {}
167
+
168
+ if prediction_type == 'regression':
169
+ # Train regression models
170
+ models['Random Forest'] = RandomForestRegressor(
171
+ n_estimators=100, random_state=self.random_state, n_jobs=-1
172
+ )
173
+ models['Linear Regression'] = LinearRegression()
174
+
175
+ for name, model in models.items():
176
+ if self._is_canceled:
177
+ return
178
+
179
+ # Use scaled features for linear models
180
+ if 'Linear' in name:
181
+ model.fit(X_train_scaled, y_train)
182
+ pred = model.predict(X_test_scaled)
183
+ # Make predictions on test set + null target rows
184
+ # Test set predictions for validation
185
+ test_pred = pred
186
+ # Null target predictions (the main goal)
187
+ null_pred = model.predict(scaler.transform(X_all[null_mask])) if null_mask.any() else []
188
+ else:
189
+ model.fit(X_train, y_train)
190
+ pred = model.predict(X_test)
191
+ # Make predictions on test set + null target rows
192
+ # Test set predictions for validation
193
+ test_pred = pred
194
+ # Null target predictions (the main goal)
195
+ null_pred = model.predict(X_all[null_mask]) if null_mask.any() else []
196
+
197
+ scores[name] = {
198
+ 'mse': mean_squared_error(y_test, pred),
199
+ 'r2': r2_score(y_test, pred)
200
+ }
201
+ # Combine test predictions and null predictions
202
+ all_pred = {
203
+ 'test_predictions': test_pred,
204
+ 'test_indices': X_test.index.tolist(),
205
+ 'null_predictions': null_pred,
206
+ 'null_indices': X_all[null_mask].index.tolist()
207
+ }
208
+ predictions[name] = all_pred
209
+
210
+ else: # classification
211
+ # Train classification models
212
+ models['Random Forest'] = RandomForestClassifier(
213
+ n_estimators=100, random_state=self.random_state, n_jobs=-1
214
+ )
215
+ models['Logistic Regression'] = LogisticRegression(
216
+ random_state=self.random_state, max_iter=1000
217
+ )
218
+
219
+ for name, model in models.items():
220
+ if self._is_canceled:
221
+ return
222
+
223
+ # Use scaled features for linear models
224
+ if 'Logistic' in name:
225
+ model.fit(X_train_scaled, y_train)
226
+ pred = model.predict(X_test_scaled)
227
+ # Make predictions on test set + null target rows
228
+ # Test set predictions for validation
229
+ test_pred = pred
230
+ # Null target predictions (the main goal)
231
+ null_pred = model.predict(scaler.transform(X_all[null_mask])) if null_mask.any() else []
232
+ else:
233
+ model.fit(X_train, y_train)
234
+ pred = model.predict(X_test)
235
+ # Make predictions on test set + null target rows
236
+ # Test set predictions for validation
237
+ test_pred = pred
238
+ # Null target predictions (the main goal)
239
+ null_pred = model.predict(X_all[null_mask]) if null_mask.any() else []
240
+
241
+ scores[name] = {
242
+ 'accuracy': accuracy_score(y_test, pred)
243
+ }
244
+ # Combine test predictions and null predictions
245
+ all_pred = {
246
+ 'test_predictions': test_pred,
247
+ 'test_indices': X_test.index.tolist(),
248
+ 'null_predictions': null_pred,
249
+ 'null_indices': X_all[null_mask].index.tolist()
250
+ }
251
+ predictions[name] = all_pred
252
+
253
+ if self._is_canceled:
254
+ return
255
+
256
+ self.progress.emit(90, "Finalizing results...")
257
+
258
+ # Select best model
259
+ if prediction_type == 'regression':
260
+ best_model = max(scores.keys(), key=lambda k: scores[k]['r2'])
261
+ else:
262
+ best_model = max(scores.keys(), key=lambda k: scores[k]['accuracy'])
263
+
264
+ # Get best predictions
265
+ best_pred_dict = predictions[best_model]
266
+
267
+ # Combine test and null predictions into single arrays
268
+ combined_predictions = []
269
+ combined_indices = []
270
+
271
+ # Add test predictions
272
+ test_preds = best_pred_dict['test_predictions']
273
+ test_indices = best_pred_dict['test_indices']
274
+ combined_predictions.extend(test_preds)
275
+ combined_indices.extend(test_indices)
276
+
277
+ # Add null predictions
278
+ null_preds = best_pred_dict['null_predictions']
279
+ null_indices = best_pred_dict['null_indices']
280
+ combined_predictions.extend(null_preds)
281
+ combined_indices.extend(null_indices)
282
+
283
+ # Decode predictions if needed
284
+ if target_encoder is not None and len(combined_predictions) > 0:
285
+ # Convert to original labels
286
+ try:
287
+ combined_predictions = target_encoder.inverse_transform(np.array(combined_predictions).astype(int))
288
+ except Exception:
289
+ # If conversion fails, use numeric predictions
290
+ pass
291
+
292
+ # Create results dictionary with detailed breakdown
293
+ results = {
294
+ 'prediction_type': prediction_type,
295
+ 'target_column': self.target_column,
296
+ 'best_model': best_model,
297
+ 'predictions': combined_predictions,
298
+ 'scores': scores,
299
+ 'feature_columns': list(X.columns),
300
+ 'target_encoder': target_encoder,
301
+ 'original_indices': combined_indices, # Both test and null indices
302
+ # Store trained models and preprocessing objects for saving
303
+ 'trained_models': models,
304
+ 'scaler': scaler,
305
+ 'label_encoders': label_encoders,
306
+ # Additional info to help detect data leakage
307
+ 'data_breakdown': {
308
+ 'total_rows': len(self.df),
309
+ 'training_rows': len(X_train),
310
+ 'test_rows': len(X_test),
311
+ 'null_target_rows': null_mask.sum(),
312
+ 'predicted_rows': len(combined_indices),
313
+ 'test_size_percentage': self.test_size * 100
314
+ }
315
+ }
316
+
317
+ self.progress.emit(100, "Complete!")
318
+ self.result.emit(results)
319
+
320
+ except Exception as e:
321
+ self.error.emit(f"Prediction error: {str(e)}")
322
+
323
+
324
+ class PredictionResultsModel(QAbstractTableModel):
325
+ """Table model for displaying prediction results"""
326
+
327
+ def __init__(self, results_data):
328
+ super().__init__()
329
+ self.results_data = results_data
330
+ self.headers = ['Model', 'Performance Metric', 'Score']
331
+
332
+ def rowCount(self, parent=QModelIndex()):
333
+ return len(self.results_data)
334
+
335
+ def columnCount(self, parent=QModelIndex()):
336
+ return len(self.headers)
337
+
338
+ def data(self, index, role=Qt.ItemDataRole.DisplayRole):
339
+ if not index.isValid():
340
+ return None
341
+
342
+ row = index.row()
343
+ col = index.column()
344
+
345
+ if role == Qt.ItemDataRole.DisplayRole:
346
+ return str(self.results_data[row][col])
347
+ elif role == Qt.ItemDataRole.BackgroundRole and row == 0:
348
+ # Highlight best model
349
+ return QBrush(QColor(200, 255, 200))
350
+
351
+ return None
352
+
353
+ def headerData(self, section, orientation, role=Qt.ItemDataRole.DisplayRole):
354
+ if orientation == Qt.Orientation.Horizontal and role == Qt.ItemDataRole.DisplayRole:
355
+ return self.headers[section]
356
+ return None
357
+
358
+
359
+ class PredictionDialog(QMainWindow):
360
+ """Main dialog for displaying prediction results and applying predictions"""
361
+
362
+ predictionApplied = pyqtSignal(object) # Signal emitted when predictions are applied
363
+
364
+ def __init__(self, df, target_column, parent=None):
365
+ super().__init__(parent)
366
+ self.df = df
367
+ self.target_column = target_column
368
+ self.prediction_results = None
369
+ self.worker_thread = None
370
+
371
+ self.setWindowTitle(f"Predict {target_column}")
372
+ self.setGeometry(100, 100, 900, 700)
373
+
374
+ # Make window stay on top and be modal
375
+ self.setWindowModality(Qt.WindowModality.ApplicationModal)
376
+ self.setWindowFlags(Qt.WindowType.Window | Qt.WindowType.WindowStaysOnTopHint)
377
+
378
+ # Create central widget and layout
379
+ central_widget = QWidget()
380
+ self.setCentralWidget(central_widget)
381
+ layout = QVBoxLayout(central_widget)
382
+
383
+ # Create header
384
+ header_label = QLabel(f"<h2>Predict Column: {target_column}</h2>")
385
+ header_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
386
+ layout.addWidget(header_label)
387
+
388
+ # Create info panel
389
+ info_group = QGroupBox("Prediction Settings")
390
+ info_layout = QFormLayout(info_group)
391
+
392
+ # Test size spinner
393
+ self.test_size_spin = QDoubleSpinBox()
394
+ self.test_size_spin.setRange(0.1, 0.5)
395
+ self.test_size_spin.setValue(0.2)
396
+ self.test_size_spin.setSingleStep(0.05)
397
+ info_layout.addRow("Test Size:", self.test_size_spin)
398
+
399
+ # Prediction type combo
400
+ self.prediction_type_combo = QComboBox()
401
+ self.prediction_type_combo.addItems(['auto', 'regression', 'classification'])
402
+ info_layout.addRow("Prediction Type:", self.prediction_type_combo)
403
+
404
+ layout.addWidget(info_group)
405
+
406
+ # Create splitter for results
407
+ splitter = QSplitter(Qt.Orientation.Vertical)
408
+ layout.addWidget(splitter)
409
+
410
+ # Progress bar
411
+ self.progress_bar = QProgressBar()
412
+ self.progress_label = QLabel("Ready to start prediction...")
413
+ progress_widget = QWidget()
414
+ progress_layout = QVBoxLayout(progress_widget)
415
+ progress_layout.addWidget(self.progress_label)
416
+ progress_layout.addWidget(self.progress_bar)
417
+
418
+ # Results table
419
+ self.results_table = QTableView()
420
+ self.results_table.setMinimumHeight(200)
421
+
422
+ # Results text area
423
+ self.results_text = QTextEdit()
424
+ self.results_text.setMaximumHeight(150)
425
+ self.results_text.setPlainText("Click 'Start Prediction' to begin analysis...")
426
+
427
+ splitter.addWidget(progress_widget)
428
+ splitter.addWidget(self.results_table)
429
+ splitter.addWidget(self.results_text)
430
+
431
+ # Buttons
432
+ button_layout = QHBoxLayout()
433
+
434
+ self.start_button = QPushButton("Start Prediction")
435
+ self.start_button.clicked.connect(self.start_prediction)
436
+
437
+ self.cancel_button = QPushButton("Cancel")
438
+ self.cancel_button.clicked.connect(self.cancel_prediction)
439
+ self.cancel_button.hide()
440
+
441
+ self.apply_button = QPushButton(f"Apply Predictions (Add 'Predict {target_column}' Column)")
442
+ self.apply_button.clicked.connect(self.apply_predictions)
443
+ self.apply_button.setEnabled(False)
444
+
445
+ self.save_model_button = QPushButton("Save Model")
446
+ self.save_model_button.clicked.connect(self.save_model)
447
+ self.save_model_button.setEnabled(False)
448
+
449
+ self.close_button = QPushButton("Close")
450
+ self.close_button.clicked.connect(self.close)
451
+
452
+ button_layout.addWidget(self.start_button)
453
+ button_layout.addWidget(self.cancel_button)
454
+ button_layout.addWidget(self.apply_button)
455
+ button_layout.addWidget(self.save_model_button)
456
+ button_layout.addStretch()
457
+ button_layout.addWidget(self.close_button)
458
+
459
+ layout.addLayout(button_layout)
460
+
461
+ # Show the window and bring it to front
462
+ print("DEBUG: About to show prediction dialog window")
463
+ self.show()
464
+ self.raise_() # Bring to front
465
+ self.activateWindow() # Make it the active window
466
+
467
+ # Additional window focus methods
468
+ self.setWindowState(self.windowState() & ~Qt.WindowState.WindowMinimized | Qt.WindowState.WindowActive)
469
+
470
+ print(f"DEBUG: Dialog window shown. Visible: {self.isVisible()}, Position: {self.geometry()}")
471
+ print(f"DEBUG: Window title: {self.windowTitle()}")
472
+
473
+ def start_prediction(self):
474
+ """Start the prediction analysis"""
475
+ try:
476
+ self.start_button.setEnabled(False)
477
+ self.apply_button.setEnabled(False)
478
+ self.cancel_button.show()
479
+
480
+ # Get settings
481
+ test_size = self.test_size_spin.value()
482
+ prediction_type = self.prediction_type_combo.currentText()
483
+
484
+ self.progress_label.setText("Starting prediction analysis...")
485
+ self.progress_bar.setValue(0)
486
+
487
+ # Create and start worker thread
488
+ self.worker_thread = PredictionThread(
489
+ self.df, self.target_column,
490
+ prediction_type=prediction_type,
491
+ test_size=test_size
492
+ )
493
+ self.worker_thread.progress.connect(self.update_progress)
494
+ self.worker_thread.result.connect(self.handle_results)
495
+ self.worker_thread.error.connect(self.handle_error)
496
+ self.worker_thread.finished.connect(self.on_analysis_finished)
497
+ self.worker_thread.start()
498
+
499
+ except Exception as e:
500
+ self.handle_error(f"Failed to start prediction: {str(e)}")
501
+
502
+ def cancel_prediction(self):
503
+ """Cancel the current prediction"""
504
+ if self.worker_thread:
505
+ self.worker_thread.cancel()
506
+ self.worker_thread.quit()
507
+ self.worker_thread.wait()
508
+ self.on_analysis_finished()
509
+
510
+ def update_progress(self, value, message):
511
+ """Update progress bar and label"""
512
+ self.progress_bar.setValue(value)
513
+ self.progress_label.setText(message)
514
+
515
+ def handle_results(self, results):
516
+ """Handle prediction results"""
517
+ self.prediction_results = results
518
+
519
+ # Create table data for model comparison
520
+ table_data = []
521
+ for model_name, scores in results['scores'].items():
522
+ if results['prediction_type'] == 'regression':
523
+ table_data.append([
524
+ model_name,
525
+ 'R² Score',
526
+ f"{scores['r2']:.4f}"
527
+ ])
528
+ table_data.append([
529
+ model_name,
530
+ 'MSE',
531
+ f"{scores['mse']:.4f}"
532
+ ])
533
+ else:
534
+ table_data.append([
535
+ model_name,
536
+ 'Accuracy',
537
+ f"{scores['accuracy']:.4f}"
538
+ ])
539
+
540
+ # Sort by best performing model first
541
+ if results['prediction_type'] == 'regression':
542
+ table_data.sort(key=lambda x: float(x[2]) if x[1] == 'R² Score' else -float(x[2]), reverse=True)
543
+ else:
544
+ table_data.sort(key=lambda x: float(x[2]), reverse=True)
545
+
546
+ # Set up table model
547
+ model = PredictionResultsModel(table_data)
548
+ self.results_table.setModel(model)
549
+ self.results_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
550
+
551
+ # Update results text
552
+ breakdown = results['data_breakdown']
553
+ summary = f"""Prediction Analysis Complete!
554
+
555
+ Target Column: {results['target_column']}
556
+ Prediction Type: {results['prediction_type']}
557
+ Best Model: {results['best_model']}
558
+ Features Used: {len(results['feature_columns'])} columns
559
+
560
+ 📊 DATA BREAKDOWN (for leak detection):
561
+ Total Rows: {breakdown['total_rows']}
562
+ Training Rows: {breakdown['training_rows']} ({breakdown['training_rows']/breakdown['total_rows']*100:.1f}%)
563
+ Test Rows: {breakdown['test_rows']} ({breakdown['test_size_percentage']:.1f}%)
564
+ NULL Target Rows: {breakdown['null_target_rows']}
565
+ Predicted Rows: {breakdown['predicted_rows']} (test + null targets)
566
+
567
+ ⚠️ MODEL PERFORMANCE (on test set only):
568
+ Note: Scores below are ONLY on {breakdown['test_rows']} unseen test rows.
569
+ High scores are good, but should be realistic for your data.
570
+ """
571
+ for model_name, scores in results['scores'].items():
572
+ summary += f"\n{model_name}:\n"
573
+ for metric, score in scores.items():
574
+ summary += f" {metric}: {score:.4f}\n"
575
+
576
+ summary += f"""
577
+ 🔍 LEAK DETECTION GUIDE:
578
+ ✅ GOOD: R² between 0.3-0.9 (depending on your data)
579
+ ❌ SUSPICIOUS: R² > 0.95 (likely overfit or leakage)
580
+ ✅ TRAINING SIZE: {breakdown['training_rows']} rows ({breakdown['training_rows']/breakdown['total_rows']*100:.1f}%)
581
+ ✅ TEST SIZE: {breakdown['test_rows']} rows ({breakdown['test_size_percentage']:.1f}%)
582
+ ✅ PREDICTIONS: Made for {breakdown['null_target_rows']} missing values + {breakdown['test_rows']} test rows"""
583
+
584
+ self.results_text.setPlainText(summary)
585
+ self.apply_button.setEnabled(True)
586
+ self.save_model_button.setEnabled(True)
587
+
588
+ def handle_error(self, error_message):
589
+ """Handle prediction errors"""
590
+ self.results_text.setPlainText(f"Error: {error_message}")
591
+ QMessageBox.critical(self, "Prediction Error", error_message)
592
+
593
+ def on_analysis_finished(self):
594
+ """Handle cleanup when analysis is finished"""
595
+ self.start_button.setEnabled(True)
596
+ self.cancel_button.hide()
597
+ self.progress_label.setText("Analysis complete")
598
+
599
+ def apply_predictions(self):
600
+ """Apply predictions to the dataframe"""
601
+ if not self.prediction_results:
602
+ return
603
+
604
+ try:
605
+ # Create a copy of the original dataframe
606
+ result_df = self.df.copy()
607
+
608
+ # Create prediction column name
609
+ predict_column_name = f"Predict_{self.target_column}"
610
+
611
+ # Prepare prediction values
612
+ predictions = self.prediction_results['predictions']
613
+ original_indices = self.prediction_results['original_indices']
614
+
615
+ # Create prediction series with NaN values for all rows
616
+ prediction_series = pd.Series([np.nan] * len(result_df), index=result_df.index, name=predict_column_name)
617
+
618
+ # Fill predictions for rows that were in the test set AND rows with null targets
619
+ for i, idx in enumerate(original_indices):
620
+ if i < len(predictions) and idx in prediction_series.index:
621
+ prediction_series.loc[idx] = predictions[i]
622
+
623
+ # Find the position of the target column
624
+ target_column_index = result_df.columns.get_loc(self.target_column)
625
+
626
+ # Insert the prediction column right after the target column
627
+ # Split the dataframe into before and after the target column
628
+ cols_before = result_df.columns[:target_column_index + 1].tolist()
629
+ cols_after = result_df.columns[target_column_index + 1:].tolist()
630
+
631
+ # Create new column order with prediction column inserted
632
+ new_columns = cols_before + [predict_column_name] + cols_after
633
+
634
+ # Add the prediction column to the dataframe
635
+ result_df[predict_column_name] = prediction_series
636
+
637
+ # Reorder columns to place prediction column next to target column
638
+ result_df = result_df[new_columns]
639
+
640
+ # Emit signal with the updated dataframe
641
+ self.predictionApplied.emit(result_df)
642
+
643
+ # Show success message
644
+ QMessageBox.information(
645
+ self,
646
+ "Predictions Applied",
647
+ f"Successfully added '{predict_column_name}' column with predictions from {self.prediction_results['best_model']} model."
648
+ )
649
+
650
+ self.close()
651
+
652
+ except Exception as e:
653
+ QMessageBox.critical(self, "Apply Error", f"Failed to apply predictions: {str(e)}")
654
+
655
+ def save_model(self):
656
+ """Save the trained prediction model and preprocessing objects"""
657
+ if not self.prediction_results:
658
+ QMessageBox.warning(self, "No Model", "No model available to save. Please run prediction first.")
659
+ return
660
+
661
+ try:
662
+ # Get the best model and preprocessing objects
663
+ best_model_name = self.prediction_results['best_model']
664
+ best_model = self.prediction_results['trained_models'][best_model_name]
665
+
666
+ # Open file dialog to choose save location
667
+ file_path, _ = QFileDialog.getSaveFileName(
668
+ self,
669
+ "Save Prediction Model",
670
+ f"{self.target_column}_prediction_model.pkl",
671
+ "Pickle Files (*.pkl);;All Files (*)"
672
+ )
673
+
674
+ if not file_path:
675
+ return # User cancelled
676
+
677
+ # Create model package with all necessary components
678
+ model_package = {
679
+ 'model': best_model,
680
+ 'model_name': best_model_name,
681
+ 'prediction_type': self.prediction_results['prediction_type'],
682
+ 'target_column': self.prediction_results['target_column'],
683
+ 'feature_columns': self.prediction_results['feature_columns'],
684
+ 'scaler': self.prediction_results['scaler'],
685
+ 'label_encoders': self.prediction_results['label_encoders'],
686
+ 'target_encoder': self.prediction_results['target_encoder'],
687
+ 'scores': self.prediction_results['scores'][best_model_name],
688
+ 'created_date': datetime.now().isoformat(),
689
+ 'sqlshell_version': '1.0', # Could be made dynamic
690
+ 'sklearn_version': None  # Not filled automatically; record sklearn.__version__ here if needed
691
+ }
692
+
693
+ # Save the model package
694
+ joblib.dump(model_package, file_path)
695
+
696
+ # Show success message
697
+ QMessageBox.information(
698
+ self,
699
+ "Model Saved",
700
+ f"Model saved successfully to:\n{file_path}\n\n"
701
+ f"Model: {best_model_name}\n"
702
+ f"Target: {self.target_column}\n"
703
+ f"Features: {len(self.prediction_results['feature_columns'])} columns"
704
+ )
705
+
706
+ except Exception as e:
707
+ QMessageBox.critical(self, "Save Error", f"Failed to save model: {str(e)}")
708
+
709
+
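The pickle written by save_model is a plain dict, so it can be inspected outside the GUI with joblib alone. A minimal sketch, assuming a model was previously saved from the dialog (the file name is illustrative):

    import joblib

    package = joblib.load('total_amount_prediction_model.pkl')   # illustrative path
    print(package['model_name'], package['prediction_type'], package['target_column'])
    print(package['feature_columns'])   # columns the model expects, in training order
    print(package['scores'])            # best model's metrics on the held-out test split
    model = package['model']            # the fitted estimator itself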
710
+ def create_prediction_dialog(df, target_column, parent=None):
711
+ """
712
+ Main function to create and show the prediction dialog.
713
+
714
+ Args:
715
+ df (pd.DataFrame): The dataframe to analyze
716
+ target_column (str): The column to predict
717
+ parent: Parent window for the dialog
718
+
719
+ Returns:
720
+ PredictionDialog: The prediction dialog window
721
+ """
722
+ if df is None or df.empty:
723
+ raise ValueError("DataFrame is empty or None")
724
+
725
+ if target_column not in df.columns:
726
+ raise ValueError(f"Column '{target_column}' not found in DataFrame")
727
+
728
+ # Check if there are enough features for prediction
729
+ if len(df.columns) < 2:
730
+ raise ValueError("Need at least 2 columns for prediction (target + features)")
731
+
732
+ # Create and return the dialog
733
+ dialog = PredictionDialog(df, target_column, parent)
734
+ return dialog
735
+
736
+
737
+ def load_and_apply_model(df, parent=None):
738
+ """
739
+ Load a saved prediction model and apply it to a new dataset.
740
+
741
+ Args:
742
+ df (pd.DataFrame): The dataframe to apply predictions to
743
+ parent: Parent window for dialogs
744
+
745
+ Returns:
746
+ pd.DataFrame: DataFrame with predictions added, or None if cancelled/error
747
+ """
748
+ try:
749
+ # Open file dialog to select model file
750
+ file_path, _ = QFileDialog.getOpenFileName(
751
+ parent,
752
+ "Load Prediction Model",
753
+ "",
754
+ "Pickle Files (*.pkl);;All Files (*)"
755
+ )
756
+
757
+ if not file_path:
758
+ return None # User cancelled
759
+
760
+ # Load the model package
761
+ try:
762
+ model_package = joblib.load(file_path)
763
+ except Exception as e:
764
+ QMessageBox.critical(parent, "Load Error", f"Failed to load model file:\n{str(e)}")
765
+ return None
766
+
767
+ # Validate model package structure
768
+ required_keys = ['model', 'model_name', 'prediction_type', 'target_column',
769
+ 'feature_columns', 'scaler', 'label_encoders']
770
+ missing_keys = [key for key in required_keys if key not in model_package]
771
+ if missing_keys:
772
+ QMessageBox.critical(
773
+ parent,
774
+ "Invalid Model",
775
+ f"Model file is missing required components: {missing_keys}"
776
+ )
777
+ return None
778
+
779
+ # Check feature compatibility
780
+ model_features = set(model_package['feature_columns'])
781
+ df_columns = set(df.columns)
782
+ missing_features = model_features - df_columns
783
+
784
+ if missing_features:
785
+ QMessageBox.critical(
786
+ parent,
787
+ "Feature Mismatch",
788
+ f"The current dataset is missing required features:\n{list(missing_features)}\n\n"
789
+ f"Model requires: {model_package['feature_columns']}\n"
790
+ f"Dataset has: {list(df.columns)}"
791
+ )
792
+ return None
793
+
794
+ # Show model info and ask for confirmation
795
+ model_info = f"""Model Information:
796
+
797
+ Model Name: {model_package['model_name']}
798
+ Original Target: {model_package['target_column']}
799
+ Prediction Type: {model_package['prediction_type']}
800
+ Features Required: {len(model_package['feature_columns'])} columns
801
+ Created: {model_package.get('created_date', 'Unknown')}
802
+
803
+ This will add a new 'Predict_{model_package['target_column']}' column to your dataset.
804
+
805
+ Continue with prediction?"""
806
+
807
+ reply = QMessageBox.question(
808
+ parent,
809
+ "Apply Model",
810
+ model_info,
811
+ QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No
812
+ )
813
+
814
+ if reply != QMessageBox.StandardButton.Yes:
815
+ return None
816
+
817
+ # Apply the model
818
+ result_df = apply_loaded_model_to_dataframe(df, model_package)
819
+
820
+ if result_df is not None:
821
+ QMessageBox.information(
822
+ parent,
823
+ "Predictions Applied",
824
+ f"Successfully applied '{model_package['model_name']}' model.\n"
825
+ f"Added 'Predict_{model_package['target_column']}' column with predictions."
826
+ )
827
+
828
+ return result_df
829
+
830
+ except Exception as e:
831
+ QMessageBox.critical(parent, "Error", f"Failed to apply model: {str(e)}")
832
+ return None
833
+
834
+
835
+ def apply_loaded_model_to_dataframe(df, model_package):
836
+ """
837
+ Apply a loaded model package to a dataframe.
838
+
839
+ Args:
840
+ df (pd.DataFrame): The dataframe to apply predictions to
841
+ model_package (dict): The loaded model package
842
+
843
+ Returns:
844
+ pd.DataFrame: DataFrame with predictions added
845
+ """
846
+ try:
847
+ # Extract components from model package
848
+ model = model_package['model']
849
+ scaler = model_package['scaler']
850
+ label_encoders = model_package['label_encoders']
851
+ target_encoder = model_package.get('target_encoder')
852
+ feature_columns = model_package['feature_columns']
853
+ model_name = model_package['model_name']
854
+ target_column = model_package['target_column']
855
+
856
+ # Prepare features using the same preprocessing as during training
857
+ X = df[feature_columns].copy()
858
+
859
+ # Apply label encoders to categorical features
860
+ for col, encoder in label_encoders.items():
861
+ if col in X.columns:
862
+ # Handle unseen categories by replacing with 'unknown'
863
+ unique_train_values = set(encoder.classes_)
864
+ X[col] = X[col].astype(str)
865
+ X[col] = X[col].apply(lambda x: x if x in unique_train_values else 'unknown')
866
+
867
+ # Add 'unknown' to encoder if not present
868
+ if 'unknown' not in encoder.classes_:
869
+ # Create a new encoder with 'unknown' category
870
+ new_classes = list(encoder.classes_) + ['unknown']
871
+ encoder.classes_ = np.array(new_classes)
872
+
873
+ X[col] = encoder.transform(X[col])
874
+
875
+ # Handle missing values for numerical features
876
+ numerical_cols = X.select_dtypes(include=[np.number]).columns
877
+ for col in numerical_cols:
878
+ if X[col].isnull().any():
879
+ # Fall back to the median of the incoming data (training medians are not stored in the package)
880
+ median_val = X[col].median()
881
+ X[col] = X[col].fillna(median_val)
882
+
883
+ # Apply scaling
884
+ if 'Linear' in model_name or 'Logistic' in model_name:
885
+ X_processed = scaler.transform(X)
886
+ else:
887
+ X_processed = X.values
888
+
889
+ # Make predictions
890
+ predictions = model.predict(X_processed)
891
+
892
+ # Decode predictions if target encoder exists
893
+ if target_encoder is not None and len(predictions) > 0:
894
+ try:
895
+ predictions = target_encoder.inverse_transform(predictions.astype(int))
896
+ except Exception:
897
+ # If decoding fails, keep numeric predictions
898
+ pass
899
+
900
+ # Create result dataframe
901
+ result_df = df.copy()
902
+ predict_column_name = f"Predict_{target_column}"
903
+ result_df[predict_column_name] = predictions
904
+
905
+ return result_df
906
+
907
+ except Exception as e:
908
+ raise Exception(f"Failed to apply model: {str(e)}")
909
+
910
+
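Because apply_loaded_model_to_dataframe only needs the dataframe and the loaded package, a saved model can also be applied without the file-picker flow in load_and_apply_model. A minimal sketch (paths are illustrative; the dataframe must contain every column in package['feature_columns'], and PyQt6 must still be installed since this module imports it at the top level):

    import pandas as pd
    import joblib
    from sqlshell.utils.profile_prediction import apply_loaded_model_to_dataframe

    new_rows = pd.read_parquet('new_rows.parquet')          # illustrative input
    package = joblib.load('price_prediction_model.pkl')     # illustrative path
    scored = apply_loaded_model_to_dataframe(new_rows, package)
    print(scored['Predict_' + package['target_column']].head())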
911
+ def show_load_model_dialog(df, parent=None):
912
+ """
913
+ Convenience function to show load model dialog and apply predictions.
914
+
915
+ Args:
916
+ df (pd.DataFrame): The dataframe to apply predictions to
917
+ parent: Parent window for dialogs
918
+
919
+ Returns:
920
+ pd.DataFrame: DataFrame with predictions added, or None if cancelled/error
921
+ """
922
+ if df is None or df.empty:
923
+ QMessageBox.warning(parent, "No Data", "No data available. Please load some data first.")
924
+ return None
925
+
926
+ return load_and_apply_model(df, parent)
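For completeness, a minimal sketch of driving the interactive entry point from a PyQt6 application (the data file and column name are illustrative; the dialog shows itself on construction and emits predictionApplied when the user applies the predictions):

    import sys
    import pandas as pd
    from PyQt6.QtWidgets import QApplication
    from sqlshell.utils.profile_prediction import create_prediction_dialog

    app = QApplication(sys.argv)
    df = pd.read_csv('orders.csv')                      # illustrative data
    dialog = create_prediction_dialog(df, 'revenue')    # validates inputs, shows the window
    dialog.predictionApplied.connect(
        lambda new_df: new_df.to_csv('orders_with_predictions.csv', index=False)
    )
    sys.exit(app.exec())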