sqlshell 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,926 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Column Prediction Module
|
|
3
|
+
|
|
4
|
+
This module provides prediction functionality for columns using modern machine learning techniques.
|
|
5
|
+
It creates a new "Predict_<column_name>" column with predictions based on other columns in the dataframe.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import numpy as np
|
|
10
|
+
from sklearn.model_selection import train_test_split
|
|
11
|
+
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
|
12
|
+
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
|
13
|
+
from sklearn.linear_model import LinearRegression, LogisticRegression
|
|
14
|
+
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
|
|
15
|
+
import warnings
|
|
16
|
+
import joblib
|
|
17
|
+
import os
|
|
18
|
+
import json
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
warnings.filterwarnings('ignore')
|
|
21
|
+
|
|
22
|
+
from PyQt6.QtWidgets import (QMainWindow, QVBoxLayout, QHBoxLayout, QWidget, QLabel,
|
|
23
|
+
QTableView, QPushButton, QProgressBar, QComboBox, QCheckBox,
|
|
24
|
+
QTextEdit, QSplitter, QHeaderView, QMessageBox, QGroupBox,
|
|
25
|
+
QFormLayout, QSpinBox, QDoubleSpinBox, QFileDialog)
|
|
26
|
+
from PyQt6.QtCore import Qt, QAbstractTableModel, QModelIndex, QThread, pyqtSignal, QTimer
|
|
27
|
+
from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QPalette, QBrush
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PredictionThread(QThread):
    """Background worker that trains, scores and applies prediction models.

    Two candidate models are fitted on the rows whose target is present,
    scored on a held-out test split, and then used to predict both the test
    rows and the rows whose target is missing.

    Signals:
        progress (int, str): Percentage complete and a status message.
        result (object): Results dictionary emitted on success.
        error (str): Message emitted when any step raises.
    """

    progress = pyqtSignal(int, str)
    result = pyqtSignal(object)
    error = pyqtSignal(str)

    # Models listed here are fitted on standardized features.
    _SCALED_MODELS = frozenset({'Linear Regression', 'Logistic Regression'})

    def __init__(self, df, target_column, prediction_type='auto', test_size=0.2, random_state=42):
        """Store the training configuration.

        Args:
            df: DataFrame holding the feature columns and the target column.
                A copy is kept, so the caller's frame is never modified.
            target_column: Name of the column to predict.
            prediction_type: 'auto', 'regression' or 'classification'.
            test_size: Test share handed to ``train_test_split``.
            random_state: Seed handed to the split and to the models.
        """
        super().__init__()
        self.df = df.copy()
        self.target_column = target_column
        self.prediction_type = prediction_type
        self.test_size = test_size
        self.random_state = random_state
        self._is_canceled = False

    def cancel(self):
        """Request cancellation; ``run`` checks the flag between stages."""
        self._is_canceled = True

    def detect_prediction_type(self, target_series):
        """Guess whether the target calls for regression or classification.

        Non-numeric targets are classified. Float targets containing
        fractional values are regressed, because scikit-learn classifiers
        reject continuous labels. Any other numeric target is treated as
        categorical when it has at most 10 distinct values or when fewer than
        5% of its non-missing values are distinct.

        Returns:
            'classification' or 'regression'.
        """
        if not pd.api.types.is_numeric_dtype(target_series):
            return 'classification'

        observed = target_series.dropna()
        if pd.api.types.is_float_dtype(observed) and ((observed % 1) != 0).any():
            return 'regression'

        unique_count = target_series.nunique()
        total_count = len(observed)

        # unique_count is 0 whenever total_count is 0, so the first test
        # short-circuits before the division can hit zero.
        if unique_count <= 10 or (unique_count / total_count) < 0.05:
            return 'classification'
        return 'regression'

    def prepare_features(self, df, target_column):
        """Separate the target and turn the other columns into model inputs.

        Object/category columns are label-encoded (missing values become the
        category 'missing') unless their cardinality in the training rows is
        at least half the number of training rows, in which case they are
        dropped. Numeric columns have missing values replaced by the training
        median. Columns that are still non-numeric afterwards (for example
        datetimes) are dropped because the models cannot consume them.

        Returns:
            Tuple ``(X_train, y_train, label_encoders, X_all, null_mask)``:
            processed features and targets for rows with a target, the fitted
            per-column encoders, processed features for every row, and a
            boolean mask of rows whose target is missing.

        Raises:
            ValueError: If no row has a non-missing target.
        """
        X_all = df.drop(columns=[target_column])
        y_all = df[target_column]

        # Rows with a target are used for training/testing; the rest are the
        # rows we ultimately want predictions for.
        null_mask = pd.isna(y_all)
        non_null_mask = ~null_mask

        X_train_data = X_all[non_null_mask]
        y_train_data = y_all[non_null_mask]

        if len(X_train_data) == 0:
            raise ValueError("No valid data with non-missing target values for training")

        categorical_cols = X_all.select_dtypes(include=['object', 'category']).columns
        numerical_cols = X_all.select_dtypes(include=[np.number]).columns

        X_processed = X_all.copy()
        label_encoders = {}

        for col in categorical_cols:
            # Cast to object first: filling a 'category' column with a value
            # that is not one of its categories raises.
            values = X_processed[col].astype(object).fillna('missing')

            # Cardinality is judged on the training rows only.
            if values[non_null_mask].nunique() < len(X_train_data) * 0.5:
                encoder = LabelEncoder()
                # Fit on every row so categories that only occur in
                # missing-target rows are known to the encoder.
                X_processed[col] = encoder.fit_transform(values.astype(str))
                label_encoders[col] = encoder
            else:
                X_processed = X_processed.drop(columns=[col])

        for col in numerical_cols:
            if col not in X_processed.columns:
                continue
            median_val = X_train_data[col].median()
            if pd.isna(median_val):
                # Column is entirely missing in the training rows.
                median_val = 0
            X_processed[col] = X_processed[col].fillna(median_val)

        unsupported = [
            col for col, dtype in X_processed.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dtype)
        ]
        if unsupported:
            X_processed = X_processed.drop(columns=unsupported)

        return X_processed[non_null_mask], y_train_data, label_encoders, X_processed, null_mask

    def _build_models(self, prediction_type):
        """Return the untrained candidate models for *prediction_type*, keyed by display name."""
        if prediction_type == 'regression':
            return {
                'Random Forest': RandomForestRegressor(
                    n_estimators=100, random_state=self.random_state, n_jobs=-1
                ),
                'Linear Regression': LinearRegression(),
            }
        return {
            'Random Forest': RandomForestClassifier(
                n_estimators=100, random_state=self.random_state, n_jobs=-1
            ),
            'Logistic Regression': LogisticRegression(
                random_state=self.random_state, max_iter=1000
            ),
        }

    @staticmethod
    def _score_predictions(prediction_type, y_true, y_pred):
        """Return the test-set metrics reported for *prediction_type*."""
        if prediction_type == 'regression':
            return {
                'mse': mean_squared_error(y_true, y_pred),
                'r2': r2_score(y_true, y_pred),
            }
        return {'accuracy': accuracy_score(y_true, y_pred)}

    def run(self):
        """Train the candidate models and emit ``result`` (or ``error``).

        Returns silently, without emitting ``result``, when cancellation is
        requested between stages.
        """
        try:
            if self._is_canceled:
                return

            self.progress.emit(10, "Preparing data...")

            if self.target_column not in self.df.columns:
                raise ValueError(f"Target column '{self.target_column}' not found")

            X, y, label_encoders, X_all, null_mask = self.prepare_features(self.df, self.target_column)

            if self._is_canceled:
                return

            self.progress.emit(25, "Determining prediction type...")

            if self.prediction_type == 'auto':
                prediction_type = self.detect_prediction_type(y)
            else:
                prediction_type = self.prediction_type

            self.progress.emit(35, f"Using {prediction_type} approach...")

            # Non-numeric class labels must be mapped to integers for training.
            target_encoder = None
            if prediction_type == 'classification' and not pd.api.types.is_numeric_dtype(y):
                target_encoder = LabelEncoder()
                y = target_encoder.fit_transform(y.astype(str))

            if self._is_canceled:
                return

            self.progress.emit(50, "Training models...")

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=self.test_size, random_state=self.random_state
            )

            # The scaler is fitted on the training split only.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Rows with a missing target: the predictions the user is after.
            X_null = X_all[null_mask]
            has_null_rows = bool(null_mask.any())
            X_null_scaled = scaler.transform(X_null) if has_null_rows else None
            test_indices = X_test.index.tolist()
            null_indices = X_null.index.tolist()

            models = self._build_models(prediction_type)
            scores = {}
            predictions = {}

            for name, model in models.items():
                if self._is_canceled:
                    return

                if name in self._SCALED_MODELS:
                    fit_X, test_X, null_X = X_train_scaled, X_test_scaled, X_null_scaled
                else:
                    fit_X, test_X, null_X = X_train, X_test, X_null

                model.fit(fit_X, y_train)
                test_pred = model.predict(test_X)
                null_pred = model.predict(null_X) if has_null_rows else []

                scores[name] = self._score_predictions(prediction_type, y_test, test_pred)
                predictions[name] = {
                    'test_predictions': test_pred,
                    'test_indices': list(test_indices),
                    'null_predictions': null_pred,
                    'null_indices': list(null_indices),
                }

            if self._is_canceled:
                return

            self.progress.emit(90, "Finalizing results...")

            ranking_metric = 'r2' if prediction_type == 'regression' else 'accuracy'
            best_model = max(scores, key=lambda k: scores[k][ranking_metric])

            # Test predictions first, then missing-target predictions; the
            # indices are concatenated in the same order.
            best_pred_dict = predictions[best_model]
            combined_predictions = (
                list(best_pred_dict['test_predictions'])
                + list(best_pred_dict['null_predictions'])
            )
            combined_indices = best_pred_dict['test_indices'] + best_pred_dict['null_indices']

            if target_encoder is not None and len(combined_predictions) > 0:
                try:
                    combined_predictions = target_encoder.inverse_transform(
                        np.array(combined_predictions).astype(int)
                    )
                except Exception:
                    # Best effort: keep the encoded predictions if decoding fails.
                    pass

            results = {
                'prediction_type': prediction_type,
                'target_column': self.target_column,
                'best_model': best_model,
                'predictions': combined_predictions,
                'scores': scores,
                'feature_columns': list(X.columns),
                'target_encoder': target_encoder,
                'original_indices': combined_indices,  # Both test and null indices
                # Trained models and preprocessing objects, kept for saving
                'trained_models': models,
                'scaler': scaler,
                'label_encoders': label_encoders,
                # Row counts shown to the user to help spot data leakage
                'data_breakdown': {
                    'total_rows': len(self.df),
                    'training_rows': len(X_train),
                    'test_rows': len(X_test),
                    'null_target_rows': int(null_mask.sum()),
                    'predicted_rows': len(combined_indices),
                    'test_size_percentage': self.test_size * 100
                }
            }

            self.progress.emit(100, "Complete!")
            self.result.emit(results)

        except Exception as e:
            self.error.emit(f"Prediction error: {str(e)}")
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
class PredictionResultsModel(QAbstractTableModel):
    """Read-only table model listing one metric per row for each trained model.

    ``results_data`` is a sequence of ``[model, metric, score]`` rows; the
    first row is drawn with a green background.
    """

    def __init__(self, results_data):
        """Keep a reference to the rows to display and define the column titles."""
        super().__init__()
        self.results_data = results_data
        self.headers = ['Model', 'Performance Metric', 'Score']

    def rowCount(self, parent=QModelIndex()):
        """Return the number of result rows (0 below any item: the table is flat)."""
        if parent.isValid():
            return 0
        return len(self.results_data)

    def columnCount(self, parent=QModelIndex()):
        """Return the number of columns (0 below any item: the table is flat)."""
        if parent.isValid():
            return 0
        return len(self.headers)

    def data(self, index, role=Qt.ItemDataRole.DisplayRole):
        """Return the cell text, or the highlight brush for the first row."""
        if not index.isValid():
            return None

        row = index.row()
        col = index.column()

        # Guard against stale indexes so a lookup can never raise inside Qt.
        if not (0 <= row < len(self.results_data)):
            return None

        if role == Qt.ItemDataRole.DisplayRole:
            cells = self.results_data[row]
            if not (0 <= col < len(cells)):
                return None
            return str(cells[col])

        if role == Qt.ItemDataRole.BackgroundRole and row == 0:
            # Highlight best model
            return QBrush(QColor(200, 255, 200))

        return None

    def headerData(self, section, orientation, role=Qt.ItemDataRole.DisplayRole):
        """Return the column title for horizontal display-role requests."""
        if (
            orientation == Qt.Orientation.Horizontal
            and role == Qt.ItemDataRole.DisplayRole
            and 0 <= section < len(self.headers)
        ):
            return self.headers[section]
        return None
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
class PredictionDialog(QMainWindow):
|
|
360
|
+
"""Main dialog for displaying prediction results and applying predictions"""
|
|
361
|
+
|
|
362
|
+
predictionApplied = pyqtSignal(object) # Signal emitted when predictions are applied
|
|
363
|
+
|
|
364
|
+
def __init__(self, df, target_column, parent=None):
    """Build the prediction window for *target_column* and show it.

    Args:
        df: DataFrame handed to the worker thread and copied when
            predictions are applied.
        target_column: Name of the column to predict.
        parent: Optional parent widget.
    """
    import html
    import logging

    super().__init__(parent)
    self.df = df
    self.target_column = target_column
    self.prediction_results = None
    self.worker_thread = None

    self.setWindowTitle(f"Predict {target_column}")
    self.setGeometry(100, 100, 900, 700)

    # Make window stay on top and be modal
    self.setWindowModality(Qt.WindowModality.ApplicationModal)
    self.setWindowFlags(Qt.WindowType.Window | Qt.WindowType.WindowStaysOnTopHint)

    central_widget = QWidget()
    self.setCentralWidget(central_widget)
    layout = QVBoxLayout(central_widget)

    # Header. The label is rich text, so the column name is escaped to keep
    # characters such as '<' or '&' from being interpreted as markup.
    header_label = QLabel(f"<h2>Predict Column: {html.escape(str(target_column))}</h2>")
    header_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
    layout.addWidget(header_label)

    # Settings panel
    info_group = QGroupBox("Prediction Settings")
    info_layout = QFormLayout(info_group)

    self.test_size_spin = QDoubleSpinBox()
    self.test_size_spin.setRange(0.1, 0.5)
    self.test_size_spin.setValue(0.2)
    self.test_size_spin.setSingleStep(0.05)
    info_layout.addRow("Test Size:", self.test_size_spin)

    self.prediction_type_combo = QComboBox()
    self.prediction_type_combo.addItems(['auto', 'regression', 'classification'])
    info_layout.addRow("Prediction Type:", self.prediction_type_combo)

    layout.addWidget(info_group)

    # Vertical splitter holding progress, score table and summary text
    splitter = QSplitter(Qt.Orientation.Vertical)
    layout.addWidget(splitter)

    self.progress_bar = QProgressBar()
    self.progress_label = QLabel("Ready to start prediction...")
    progress_widget = QWidget()
    progress_layout = QVBoxLayout(progress_widget)
    progress_layout.addWidget(self.progress_label)
    progress_layout.addWidget(self.progress_bar)

    self.results_table = QTableView()
    self.results_table.setMinimumHeight(200)

    self.results_text = QTextEdit()
    self.results_text.setMaximumHeight(150)
    self.results_text.setPlainText("Click 'Start Prediction' to begin analysis...")

    splitter.addWidget(progress_widget)
    splitter.addWidget(self.results_table)
    splitter.addWidget(self.results_text)

    # Buttons
    button_layout = QHBoxLayout()

    self.start_button = QPushButton("Start Prediction")
    self.start_button.clicked.connect(self.start_prediction)

    self.cancel_button = QPushButton("Cancel")
    self.cancel_button.clicked.connect(self.cancel_prediction)
    self.cancel_button.hide()

    # The label names the column exactly as apply_predictions creates it.
    self.apply_button = QPushButton(f"Apply Predictions (Add 'Predict_{target_column}' Column)")
    self.apply_button.clicked.connect(self.apply_predictions)
    self.apply_button.setEnabled(False)

    self.save_model_button = QPushButton("Save Model")
    self.save_model_button.clicked.connect(self.save_model)
    self.save_model_button.setEnabled(False)

    self.close_button = QPushButton("Close")
    self.close_button.clicked.connect(self.close)

    button_layout.addWidget(self.start_button)
    button_layout.addWidget(self.cancel_button)
    button_layout.addWidget(self.apply_button)
    button_layout.addWidget(self.save_model_button)
    button_layout.addStretch()
    button_layout.addWidget(self.close_button)

    layout.addLayout(button_layout)

    # Show the window and bring it to front
    self.show()
    self.raise_()
    self.activateWindow()

    # Clear the minimized flag and mark the window active
    self.setWindowState(
        (self.windowState() & ~Qt.WindowState.WindowMinimized) | Qt.WindowState.WindowActive
    )

    # Diagnostics go to the logging system instead of stdout.
    logging.getLogger(__name__).debug(
        "Prediction dialog shown. Visible: %s, Position: %s, Title: %s",
        self.isVisible(), self.geometry(), self.windowTitle(),
    )
|
|
472
|
+
|
|
473
|
+
def start_prediction(self):
    """Start training in a background ``PredictionThread``.

    Reads the test size and prediction type from the settings widgets, wires
    the worker's signals to this dialog, and starts it. If the worker cannot
    be created or started, the controls are restored and the error is shown.
    """
    # Never replace a worker that is still running.
    if self.worker_thread is not None and self.worker_thread.isRunning():
        return

    try:
        self.start_button.setEnabled(False)
        self.apply_button.setEnabled(False)
        self.cancel_button.show()

        test_size = self.test_size_spin.value()
        prediction_type = self.prediction_type_combo.currentText()

        self.progress_label.setText("Starting prediction analysis...")
        self.progress_bar.setValue(0)

        self.worker_thread = PredictionThread(
            self.df, self.target_column,
            prediction_type=prediction_type,
            test_size=test_size
        )
        self.worker_thread.progress.connect(self.update_progress)
        self.worker_thread.result.connect(self.handle_results)
        self.worker_thread.error.connect(self.handle_error)
        self.worker_thread.finished.connect(self.on_analysis_finished)
        self.worker_thread.start()

    except Exception as e:
        # No worker is running, so no 'finished' signal will arrive to reset
        # the controls; restore them here.
        self.start_button.setEnabled(True)
        self.cancel_button.hide()
        self.progress_label.setText("Failed to start prediction")
        self.handle_error(f"Failed to start prediction: {str(e)}")
|
|
501
|
+
|
|
502
|
+
def cancel_prediction(self):
    """Ask the worker to stop without blocking the GUI thread.

    The worker only checks its cancel flag between training stages, so this
    method just sets the flag and returns. While the worker is still running,
    the controls are reset later by its ``finished`` signal, which
    ``start_prediction`` connects to ``on_analysis_finished``. If the worker
    has already stopped, the controls are reset immediately.
    """
    worker = self.worker_thread
    if worker is None:
        return

    worker.cancel()

    if worker.isRunning():
        self.progress_label.setText("Canceling prediction...")
    else:
        self.on_analysis_finished()
|
|
509
|
+
|
|
510
|
+
def update_progress(self, value, message):
    """Show the worker's latest percentage and status text."""
    progress_bar = self.progress_bar
    progress_bar.setValue(value)

    progress_label = self.progress_label
    progress_label.setText(message)
|
|
514
|
+
|
|
515
|
+
def handle_results(self, results):
    """Display a finished analysis.

    Stores *results*, fills the score table (best-scoring row first),
    writes the textual summary, and enables the apply and save buttons.
    """
    self.prediction_results = results
    is_regression = results['prediction_type'] == 'regression'

    # One table row per (model, metric) pair, scores pre-formatted as text.
    table_data = []
    for model_name, model_scores in results['scores'].items():
        if is_regression:
            table_data.append([model_name, 'R² Score', f"{model_scores['r2']:.4f}"])
            table_data.append([model_name, 'MSE', f"{model_scores['mse']:.4f}"])
        else:
            table_data.append([model_name, 'Accuracy', f"{model_scores['accuracy']:.4f}"])

    if is_regression:
        def sort_key(row):
            # Higher R² is better, lower MSE is better: negate MSE so one
            # descending sort ranks both kinds of row.
            score = float(row[2])
            return score if row[1] == 'R² Score' else -score
    else:
        def sort_key(row):
            return float(row[2])

    table_data.sort(key=sort_key, reverse=True)

    table_model = PredictionResultsModel(table_data)
    self.results_table.setModel(table_model)
    self.results_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)

    breakdown = results['data_breakdown']
    total_rows = breakdown['total_rows']
    training_rows = breakdown['training_rows']
    test_rows = breakdown['test_rows']
    null_rows = breakdown['null_target_rows']
    test_pct = breakdown['test_size_percentage']
    training_pct = training_rows / total_rows * 100

    # The summary is assembled from pieces and joined once at the end.
    pieces = [f"""Prediction Analysis Complete!

Target Column: {results['target_column']}
Prediction Type: {results['prediction_type']}
Best Model: {results['best_model']}
Features Used: {len(results['feature_columns'])} columns

📊 DATA BREAKDOWN (for leak detection):
Total Rows: {total_rows}
Training Rows: {training_rows} ({training_pct:.1f}%)
Test Rows: {test_rows} ({test_pct:.1f}%)
NULL Target Rows: {null_rows}
Predicted Rows: {breakdown['predicted_rows']} (test + null targets)

⚠️ MODEL PERFORMANCE (on test set only):
Note: Scores below are ONLY on {test_rows} unseen test rows.
High scores are good, but should be realistic for your data.
"""]

    for model_name, model_scores in results['scores'].items():
        pieces.append(f"\n{model_name}:\n")
        pieces.extend(
            f" {metric}: {score:.4f}\n" for metric, score in model_scores.items()
        )

    pieces.append(f"""
🔍 LEAK DETECTION GUIDE:
✅ GOOD: R² between 0.3-0.9 (depending on your data)
❌ SUSPICIOUS: R² > 0.95 (likely overfit or leakage)
✅ TRAINING SIZE: {training_rows} rows ({training_pct:.1f}%)
✅ TEST SIZE: {test_rows} rows ({test_pct:.1f}%)
✅ PREDICTIONS: Made for {null_rows} missing values + {test_rows} test rows""")

    self.results_text.setPlainText("".join(pieces))
    self.apply_button.setEnabled(True)
    self.save_model_button.setEnabled(True)
|
|
587
|
+
|
|
588
|
+
def handle_error(self, error_message):
    """Report a failure in the summary pane and in a modal error box."""
    report = f"Error: {error_message}"
    self.results_text.setPlainText(report)
    QMessageBox.critical(self, "Prediction Error", error_message)
|
|
592
|
+
|
|
593
|
+
def on_analysis_finished(self):
    """Restore the controls once the worker has stopped and report how it ended.

    The status label reads "Prediction canceled" when the worker's cancel
    flag is set, and "Analysis complete" otherwise.
    """
    self.start_button.setEnabled(True)
    self.cancel_button.hide()

    worker = self.worker_thread
    # PredictionThread.cancel() sets ``_is_canceled``.
    was_canceled = worker is not None and getattr(worker, '_is_canceled', False)
    self.progress_label.setText("Prediction canceled" if was_canceled else "Analysis complete")
|
|
598
|
+
|
|
599
|
+
def apply_predictions(self):
    """Add the predictions as a ``Predict_<target>`` column and emit the result.

    A copy of the dataframe gets a new column placed directly after the
    target column. Rows that were neither in the test split nor missing a
    target stay NaN. The copy is emitted through ``predictionApplied`` and
    the dialog is closed. Does nothing when no results are available; any
    failure is shown in an error box.
    """
    if not self.prediction_results:
        return

    try:
        result_df = self.df.copy()
        predict_column_name = f"Predict_{self.target_column}"

        predictions = self.prediction_results['predictions']
        original_indices = self.prediction_results['original_indices']

        # Start as object dtype so class-label (string) predictions can be
        # stored; writing strings into a float column is rejected or
        # deprecated by recent pandas versions.
        prediction_series = pd.Series(
            np.nan, index=result_df.index, name=predict_column_name, dtype=object
        )

        # zip() stops at the shorter sequence, so an index without a
        # matching prediction is skipped.
        known_labels = prediction_series.index
        for idx, value in zip(original_indices, predictions):
            if idx in known_labels:
                prediction_series.loc[idx] = value

        # Purely numeric predictions become a numeric column again.
        prediction_series = prediction_series.infer_objects()

        # Leave out any prediction column from an earlier run so the name
        # appears only once in the new column order.
        ordered_columns = [col for col in result_df.columns if col != predict_column_name]
        insert_at = ordered_columns.index(self.target_column) + 1
        ordered_columns.insert(insert_at, predict_column_name)

        result_df[predict_column_name] = prediction_series
        result_df = result_df[ordered_columns]

        self.predictionApplied.emit(result_df)

        QMessageBox.information(
            self,
            "Predictions Applied",
            f"Successfully added '{predict_column_name}' column with predictions from {self.prediction_results['best_model']} model."
        )

        self.close()

    except Exception as e:
        QMessageBox.critical(self, "Apply Error", f"Failed to apply predictions: {str(e)}")
|
|
654
|
+
|
|
655
|
+
def save_model(self):
    """Save the best trained model and its preprocessing objects to disk.

    Prompts for a destination file, bundles the best model together with
    the scaler, encoders, feature list and scores stored in
    ``self.prediction_results``, and writes the bundle with ``joblib.dump``.
    Shows a warning if no prediction has been run, an information box on
    success, and an error box if anything fails. Returns ``None`` in all
    cases.
    """
    if not self.prediction_results:
        QMessageBox.warning(self, "No Model", "No model available to save. Please run prediction first.")
        return

    try:
        # Local import: only this method needs it.
        import sys

        # Get the best model and preprocessing objects
        best_model_name = self.prediction_results['best_model']
        best_model = self.prediction_results['trained_models'][best_model_name]

        # Open file dialog to choose save location
        file_path, _ = QFileDialog.getSaveFileName(
            self,
            "Save Prediction Model",
            f"{self.target_column}_prediction_model.pkl",
            "Pickle Files (*.pkl);;All Files (*)"
        )

        if not file_path:
            return  # User cancelled

        # joblib does not record library versions by itself, so read them
        # from the modules that are already loaded. The previous hard-coded
        # values are kept as fallbacks when a module (or its __version__
        # attribute) is not available.
        sklearn_module = sys.modules.get("sklearn")
        sqlshell_module = sys.modules.get("sqlshell")

        # Create model package with all necessary components
        model_package = {
            'model': best_model,
            'model_name': best_model_name,
            'prediction_type': self.prediction_results['prediction_type'],
            'target_column': self.prediction_results['target_column'],
            'feature_columns': self.prediction_results['feature_columns'],
            'scaler': self.prediction_results['scaler'],
            'label_encoders': self.prediction_results['label_encoders'],
            'target_encoder': self.prediction_results['target_encoder'],
            'scores': self.prediction_results['scores'][best_model_name],
            'created_date': datetime.now().isoformat(),
            'sqlshell_version': getattr(sqlshell_module, "__version__", '1.0'),
            'sklearn_version': getattr(sklearn_module, "__version__", None),
        }

        # Save the model package
        joblib.dump(model_package, file_path)

        # Show success message
        QMessageBox.information(
            self,
            "Model Saved",
            f"Model saved successfully to:\n{file_path}\n\n"
            f"Model: {best_model_name}\n"
            f"Target: {self.target_column}\n"
            f"Features: {len(self.prediction_results['feature_columns'])} columns"
        )

    except Exception as e:
        # UI boundary: report any failure to the user instead of crashing.
        QMessageBox.critical(self, "Save Error", f"Failed to save model: {str(e)}")
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
def create_prediction_dialog(df, target_column, parent=None):
    """
    Validate the inputs and build the prediction dialog.

    Args:
        df (pd.DataFrame): The dataframe to analyze
        target_column (str): The column to predict
        parent: Parent window for the dialog

    Returns:
        PredictionDialog: The prediction dialog window

    Raises:
        ValueError: If the dataframe is None or empty, the target column is
            missing, or the dataframe has fewer than two columns.
    """
    # Checks run in order; the first failing one determines the message.
    if df is None or df.empty:
        problem = "DataFrame is empty or None"
    elif target_column not in df.columns:
        problem = f"Column '{target_column}' not found in DataFrame"
    elif len(df.columns) < 2:
        # A target alone leaves no feature columns to learn from.
        problem = "Need at least 2 columns for prediction (target + features)"
    else:
        problem = None

    if problem is not None:
        raise ValueError(problem)

    return PredictionDialog(df, target_column, parent)
|
|
735
|
+
|
|
736
|
+
|
|
737
|
+
def load_and_apply_model(df, parent=None):
    """
    Load a saved prediction model and apply it to a new dataset.

    Asks the user for a model file, loads it with ``joblib``, checks that it
    is a model package with the required keys and that ``df`` contains every
    feature column the model needs, asks for confirmation, and then
    delegates to ``apply_loaded_model_to_dataframe``. Every failure is
    reported through a message box rather than raised.

    Args:
        df (pd.DataFrame): The dataframe to apply predictions to
        parent: Parent window for dialogs

    Returns:
        pd.DataFrame: DataFrame with predictions added, or None if cancelled/error
    """
    try:
        # Open file dialog to select model file
        file_path, _ = QFileDialog.getOpenFileName(
            parent,
            "Load Prediction Model",
            "",
            "Pickle Files (*.pkl);;All Files (*)"
        )

        if not file_path:
            return None  # User cancelled

        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only model files from trusted sources should be
        # opened here.
        try:
            model_package = joblib.load(file_path)
        except Exception as e:
            QMessageBox.critical(parent, "Load Error", f"Failed to load model file:\n{str(e)}")
            return None

        # A pickle holding something other than a dict (for example a bare
        # estimator) would otherwise fail the key checks below with a
        # confusing TypeError.
        if not isinstance(model_package, dict):
            QMessageBox.critical(
                parent,
                "Invalid Model",
                f"Model file does not contain a model package (found {type(model_package).__name__})."
            )
            return None

        # Validate model package structure
        required_keys = ['model', 'model_name', 'prediction_type', 'target_column',
                         'feature_columns', 'scaler', 'label_encoders']
        missing_keys = [key for key in required_keys if key not in model_package]
        if missing_keys:
            QMessageBox.critical(
                parent,
                "Invalid Model",
                f"Model file is missing required components: {missing_keys}"
            )
            return None

        # Check feature compatibility
        missing_features = set(model_package['feature_columns']) - set(df.columns)

        if missing_features:
            QMessageBox.critical(
                parent,
                "Feature Mismatch",
                f"The current dataset is missing required features:\n{list(missing_features)}\n\n"
                f"Model requires: {model_package['feature_columns']}\n"
                f"Dataset has: {list(df.columns)}"
            )
            return None

        # Show model info and ask for confirmation
        model_info = (
            "Model Information:\n"
            "\n"
            f"Model Name: {model_package['model_name']}\n"
            f"Original Target: {model_package['target_column']}\n"
            f"Prediction Type: {model_package['prediction_type']}\n"
            f"Features Required: {len(model_package['feature_columns'])} columns\n"
            f"Created: {model_package.get('created_date', 'Unknown')}\n"
            "\n"
            f"This will add a new 'Predict_{model_package['target_column']}' column to your dataset.\n"
            "\n"
            "Continue with prediction?"
        )

        reply = QMessageBox.question(
            parent,
            "Apply Model",
            model_info,
            QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No
        )

        if reply != QMessageBox.StandardButton.Yes:
            return None

        # Apply the model
        result_df = apply_loaded_model_to_dataframe(df, model_package)

        if result_df is not None:
            QMessageBox.information(
                parent,
                "Predictions Applied",
                f"Successfully applied '{model_package['model_name']}' model.\n"
                f"Added 'Predict_{model_package['target_column']}' column with predictions."
            )

        return result_df

    except Exception as e:
        # UI boundary: surface any unexpected failure to the user.
        QMessageBox.critical(parent, "Error", f"Failed to apply model: {str(e)}")
        return None
|
|
833
|
+
|
|
834
|
+
|
|
835
|
+
def apply_loaded_model_to_dataframe(df, model_package):
    """
    Apply a loaded model package to a dataframe.

    Selects the model's feature columns from ``df``, label-encodes the
    columns that have an encoder (values not seen by the encoder become
    ``'unknown'``), fills missing numeric values with the column median of
    ``df``, scales the features when the model name contains ``'Linear'`` or
    ``'Logistic'``, and predicts. Neither ``df`` nor the objects inside
    ``model_package`` are modified.

    Args:
        df (pd.DataFrame): The dataframe to apply predictions to
        model_package (dict): The loaded model package

    Returns:
        pd.DataFrame: Copy of ``df`` with a ``Predict_<target>`` column added

    Raises:
        Exception: If any step fails; the original error is chained as the
            cause.
    """
    import copy

    try:
        # Extract components from model package
        model = model_package['model']
        scaler = model_package['scaler']
        label_encoders = model_package['label_encoders']
        target_encoder = model_package.get('target_encoder')
        feature_columns = model_package['feature_columns']
        model_name = model_package['model_name']
        target_column = model_package['target_column']

        # Prepare features using the same preprocessing as during training
        X = df[feature_columns].copy()

        # Apply label encoders to categorical features
        for col, encoder in label_encoders.items():
            if col not in X.columns:
                continue

            # Handle unseen categories by replacing with 'unknown'
            unique_train_values = set(encoder.classes_)
            X[col] = X[col].astype(str)
            X[col] = X[col].apply(lambda x: x if x in unique_train_values else 'unknown')

            # Add 'unknown' to the encoder if not present. Work on a shallow
            # copy so the caller's encoder inside model_package is left
            # untouched; only the copy's classes_ attribute is rebound.
            if 'unknown' not in encoder.classes_:
                encoder = copy.copy(encoder)
                encoder.classes_ = np.array(list(encoder.classes_) + ['unknown'])

            X[col] = encoder.transform(X[col])

        # Handle missing values for numerical features. The median is taken
        # from the data being scored, not from the training data.
        numerical_cols = X.select_dtypes(include=[np.number]).columns
        for col in numerical_cols:
            if X[col].isnull().any():
                X[col] = X[col].fillna(X[col].median())

        # Apply scaling only for linear/logistic models.
        # NOTE(review): presumably mirrors how training chose scaled input;
        # confirm against the training code.
        if 'Linear' in model_name or 'Logistic' in model_name:
            X_processed = scaler.transform(X)
        else:
            X_processed = X.values

        # Make predictions
        predictions = model.predict(X_processed)

        # Decode predictions if target encoder exists
        if target_encoder is not None and len(predictions) > 0:
            try:
                predictions = target_encoder.inverse_transform(predictions.astype(int))
            except Exception:
                # Best effort: if decoding fails, keep numeric predictions
                pass

        # Create result dataframe
        result_df = df.copy()
        predict_column_name = f"Predict_{target_column}"
        result_df[predict_column_name] = predictions

        return result_df

    except Exception as e:
        raise Exception(f"Failed to apply model: {str(e)}") from e
|
|
909
|
+
|
|
910
|
+
|
|
911
|
+
def show_load_model_dialog(df, parent=None):
    """
    Entry point for loading a saved model and applying it to ``df``.

    Warns the user and stops when there is no data; otherwise hands over to
    ``load_and_apply_model``.

    Args:
        df (pd.DataFrame): The dataframe to apply predictions to
        parent: Parent window for dialogs

    Returns:
        pd.DataFrame: DataFrame with predictions added, or None if cancelled/error
    """
    has_data = df is not None and not df.empty
    if has_data:
        return load_and_apply_model(df, parent)

    QMessageBox.warning(parent, "No Data", "No data available. Please load some data first.")
    return None
|