sqlshell 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2635 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import sys
import time
import hashlib
import os
import pickle
import gc
from pathlib import Path
from PyQt6.QtWidgets import (QApplication, QMainWindow, QTableWidget, QTableWidgetItem,
                             QVBoxLayout, QHBoxLayout, QLabel, QWidget, QComboBox,
                             QPushButton, QSplitter, QHeaderView, QFrame, QProgressBar,
                             QMessageBox, QDialog)

# Import notification manager (with fallback for cases where it's not available)
try:
    from sqlshell.notification_manager import show_error_notification, show_warning_notification
except ImportError:
    # Fallback functions for when notification manager is not available
    def show_error_notification(message):
        print(f"Error: {message}")
    def show_warning_notification(message):
        print(f"Warning: {message}")
from PyQt6.QtCore import Qt, QAbstractTableModel, QModelIndex, QThread, pyqtSignal, QTimer
from PyQt6.QtGui import QPalette, QColor, QBrush, QPainter, QPen
from scipy.stats import chi2_contingency, pearsonr

# Import matplotlib at the top level
import matplotlib
try:
    matplotlib.use('QtAgg')
except ImportError:
    matplotlib.use('Agg')  # Fall back to headless backend for CI/testing
from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg
from matplotlib.figure import Figure
import seaborn as sns
import matplotlib.pyplot as plt

# Create a cache directory in user's home directory
CACHE_DIR = os.path.join(Path.home(), '.sqlshell_cache')
os.makedirs(CACHE_DIR, exist_ok=True)

def get_cache_key(df, column):
    """Generate a cache key based on dataframe content and column"""
    # Get DataFrame characteristics that make it unique
    columns = ','.join(df.columns)
    shapes = f"{df.shape[0]}x{df.shape[1]}"
    col_types = ','.join(str(dtype) for dtype in df.dtypes)

    # Sample some values as fingerprint without loading entire dataframe
    sample_rows = min(50, len(df))
    values_sample = df.head(sample_rows).values.tobytes()

    # Create hash
    hash_input = f"{columns}|{shapes}|{col_types}|{column}|{len(df)}"
    m = hashlib.md5()
    m.update(hash_input.encode())
    m.update(values_sample)  # Add sample data to hash
    return m.hexdigest()

def cache_results(df, column, results):
    """Save results to disk cache"""
    try:
        cache_key = get_cache_key(df, column)
        cache_file = os.path.join(CACHE_DIR, f"{cache_key}.pkl")
        with open(cache_file, 'wb') as f:
            pickle.dump(results, f)
        return True
    except Exception as e:
        print(f"Cache write error: {e}")
        return False

def get_cached_results(df, column):
    """Try to get results from disk cache"""
    try:
        cache_key = get_cache_key(df, column)
        cache_file = os.path.join(CACHE_DIR, f"{cache_key}.pkl")
        if os.path.exists(cache_file):
            # Check if cache file is recent (less than 1 day old)
            mod_time = os.path.getmtime(cache_file)
            if time.time() - mod_time < 86400:  # 24 hours in seconds
                with open(cache_file, 'rb') as f:
                    return pickle.load(f)
        return None
    except Exception as e:
        print(f"Cache read error: {e}")
        return None

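For reference, a minimal sketch of how the three cache helpers above compose; the toy DataFrame and payload are hypothetical and not part of the published file:

    import pandas as pd

    demo = pd.DataFrame({'age': [25, 32, 47], 'score': [0.5, 0.7, 0.9]})
    key = get_cache_key(demo, 'age')                 # md5 over columns, dtypes, shape, and a row sample
    cache_results(demo, 'age', {'note': 'payload'})  # writes ~/.sqlshell_cache/<key>.pkl
    print(get_cached_results(demo, 'age'))           # {'note': 'payload'} while the file is under 24 hours old
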
# Worker thread for background processing
class ExplainerThread(QThread):
    # Signals for progress updates and results
    progress = pyqtSignal(int, str)
    result = pyqtSignal(object)
    error = pyqtSignal(str)

    def __init__(self, df, column):
        super().__init__()
        # Make a copy of the dataframe to avoid reference issues
        self.df = df.copy()
        self.column = column
        self._is_canceled = False

    def cancel(self):
        """Mark the thread as canceled"""
        self._is_canceled = True

    def calculate_correlation(self, x, y):
        """Calculate correlation between two variables, handling different data types.
        Returns absolute correlation value between 0 and 1."""
        try:
            # Handle missing values
            mask = ~(pd.isna(x) | pd.isna(y))
            x_clean = x[mask]
            y_clean = y[mask]

            # If too few data points, return default
            if len(x_clean) < 5:
                return 0.0

            # Check data types
            x_is_numeric = pd.api.types.is_numeric_dtype(x_clean)
            y_is_numeric = pd.api.types.is_numeric_dtype(y_clean)

            # Case 1: Both numeric - use Pearson correlation
            if x_is_numeric and y_is_numeric:
                corr, _ = pearsonr(x_clean, y_clean)
                return abs(corr)

            # Case 2: Categorical vs categorical - use Cramer's V
            elif not x_is_numeric and not y_is_numeric:
                # Convert to categorical codes
                x_cat = pd.Categorical(x_clean).codes
                y_cat = pd.Categorical(y_clean).codes

                # Create contingency table
                contingency = pd.crosstab(x_cat, y_cat)

                # Calculate Cramer's V
                chi2, _, _, _ = chi2_contingency(contingency)
                n = contingency.sum().sum()
                phi2 = chi2 / n

                # Get dimensions
                r, k = contingency.shape

                # Calculate Cramer's V with correction for dimensions
                cramers_v = np.sqrt(phi2 / min(k - 1, r - 1)) if min(k - 1, r - 1) > 0 else 0.0
                return min(cramers_v, 1.0)  # Cap at 1.0

            # Case 3: Mixed types - encode the categorical side as integer codes
            else:
                if x_is_numeric and not y_is_numeric:
                    # Convert categorical y to codes
                    y_encoded = pd.Categorical(y_clean).codes

                    # Calculate correlation between x and encoded y
                    # using point-biserial correlation (special case of Pearson)
                    corr, _ = pearsonr(x_clean, y_encoded)
                    return abs(corr)
                else:  # y is numeric, x is categorical
                    # Convert categorical x to codes
                    x_encoded = pd.Categorical(x_clean).codes

                    # Calculate correlation
                    corr, _ = pearsonr(x_encoded, y_clean)
                    return abs(corr)

        except Exception as e:
            print(f"Error calculating correlation: {e}")
            return 0.0  # Return zero if correlation calculation fails

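The Cramer's V branch above reduces to V = sqrt((chi2/n) / min(k-1, r-1)). A small self-contained check on toy, perfectly associated series (hypothetical data; with three categories chi2_contingency applies no Yates correction, so V comes out at 1.0):

    import numpy as np
    import pandas as pd
    from scipy.stats import chi2_contingency

    x = pd.Series(list('aabbcc'))
    y = pd.Series(list('uuvvww'))  # tracks x exactly
    contingency = pd.crosstab(pd.Categorical(x).codes, pd.Categorical(y).codes)
    chi2, _, _, _ = chi2_contingency(contingency)
    phi2 = chi2 / contingency.sum().sum()
    r, k = contingency.shape
    print(np.sqrt(phi2 / min(k - 1, r - 1)))  # ~1.0 for a perfect association
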
    def run(self):
        try:
            # Check if canceled
            if self._is_canceled:
                return

            # Check disk cache first
            self.progress.emit(0, "Checking for cached results...")
            cached_results = get_cached_results(self.df, self.column)
            if cached_results is not None:
                # Check if canceled
                if self._is_canceled:
                    return

                self.progress.emit(95, "Found cached results, loading...")
                time.sleep(0.5)  # Brief pause to show the user we found a cache

                # Check if canceled
                if self._is_canceled:
                    return

                self.progress.emit(100, "Loaded from cache")
                self.result.emit(cached_results)
                return

            # Clean up memory before intensive computation
            gc.collect()

            # Check if canceled
            if self._is_canceled:
                return

            # Early check for empty dataframe or no columns
            if self.df.empty or len(self.df.columns) == 0:
                raise ValueError("The dataframe is empty or has no columns for analysis")

            # No cache found, proceed with computation
            self.progress.emit(5, "Computing new analysis...")

            # Validate that the target column exists in the dataframe
            if self.column not in self.df.columns:
                raise ValueError(f"Target column '{self.column}' not found in the dataframe")

            # Create a copy to avoid modifying the original dataframe
            df = self.df.copy()

            # Verify we have data to work with
            if len(df) == 0:
                raise ValueError("No data available for analysis (empty dataframe)")

            # Sample up to 500 rows for better statistical significance while maintaining speed
            if len(df) > 500:
                sample_size = 500  # Increased sample size for better analysis
                self.progress.emit(10, f"Sampling dataset (using {sample_size} rows from {len(df)} total)...")
                df = df.sample(n=sample_size, random_state=42)
                # Force garbage collection after sampling
                gc.collect()

            # Check if canceled
            if self._is_canceled:
                return

            # Drop columns with too many unique values (likely IDs) or excessive NaNs
            self.progress.emit(15, "Analyzing columns for preprocessing...")
            cols_to_drop = []
            for col in df.columns:
                if col == self.column:  # Don't drop target column
                    continue
                try:
                    # Only drop columns with extremely high uniqueness (99% instead of 95%)
                    # This ensures we keep more features for analysis
                    if df[col].nunique() / len(df) > 0.99 and len(df) > 100:
                        cols_to_drop.append(col)
                    # Only drop columns with very high missing values (80% instead of 50%)
                    elif df[col].isna().mean() > 0.8:
                        cols_to_drop.append(col)
                except Exception:
                    # If we can't analyze the column, drop it
                    cols_to_drop.append(col)

            # Drop identified columns, but ensure we keep at least some features
            remaining_cols = [col for col in df.columns if col != self.column and col not in cols_to_drop]

            # If dropping would leave us with no features, keep at least 3 columns (or all if less than 3)
            if len(remaining_cols) == 0 and len(cols_to_drop) > 0:
                # Sort dropped columns by uniqueness (keep those with lower uniqueness)
                col_uniqueness = {}
                for col in cols_to_drop:
                    try:
                        col_uniqueness[col] = df[col].nunique() / len(df)
                    except Exception:
                        col_uniqueness[col] = 1.0  # Assume high uniqueness for problematic columns

                # Sort by uniqueness and keep the least unique columns
                cols_to_keep = sorted(col_uniqueness.items(), key=lambda x: x[1])[:min(3, len(cols_to_drop))]
                cols_to_drop = [col for col in cols_to_drop if col not in [c[0] for c in cols_to_keep]]
                print(f"Keeping {len(cols_to_keep)} columns to ensure analysis can proceed")

            if cols_to_drop:
                self.progress.emit(20, f"Removing {len(cols_to_drop)} low-information columns...")
                df = df.drop(columns=cols_to_drop)

            # Ensure target column is still in the dataframe
            if self.column not in df.columns:
                raise ValueError(f"Target column '{self.column}' not found in dataframe after preprocessing")

            # Calculate correlation coefficients first
            self.progress.emit(25, "Calculating correlation measures...")
            correlations = {}

            # Get all feature columns (excluding target)
            feature_cols = [col for col in df.columns if col != self.column]

            # Calculate correlation for each feature
            for col in feature_cols:
                try:
                    # Calculate correlation between each feature and target
                    cor_val = self.calculate_correlation(df[col], df[self.column])
                    correlations[col] = cor_val
                except Exception as e:
                    print(f"Error calculating correlation for {col}: {e}")
                    correlations[col] = 0.0

            # Separate features and target
            self.progress.emit(30, "Preparing features and target...")
            X = df.drop(columns=[self.column])
            y = df[self.column]

            # Handle high-cardinality categorical features
            self.progress.emit(35, "Encoding categorical features...")
            # Use a simple approach - label encode all categorical columns,
            # keeping (rather than dropping) high-cardinality columns for speed
            categorical_cols = X.select_dtypes(include='object').columns
            high_cardinality_threshold = 20  # Higher threshold to keep more columns

            # Keep track of how many columns we've processed
            columns_processed = 0
            columns_kept = 0

            for col in categorical_cols:
                columns_processed += 1
                unique_count = X[col].nunique()
                # Always keep the column, but use different encoding strategies based on cardinality
                if unique_count <= high_cardinality_threshold:
                    # Simple label encoding for low-cardinality features
                    X[col] = X[col].fillna('_MISSING_').astype('category').cat.codes
                    columns_kept += 1
                else:
                    # For high-cardinality features, still encode them but with a simpler approach
                    # Use label encoding instead of dropping
                    X[col] = X[col].fillna('_MISSING_').astype('category').cat.codes
                    columns_kept += 1

            # Log how many columns were kept
            if columns_processed > 0:
                self.progress.emit(40, f"Encoded {columns_kept} categorical columns out of {columns_processed}")

            # Handle target column in a simpler, faster way
            if y.dtype == 'object':
                # For categorical targets, use simple category codes
                y = y.fillna('_MISSING_').astype('category').cat.codes
            else:
                # For numeric targets, just fill NaNs with mean
                y = y.fillna(y.mean() if pd.api.types.is_numeric_dtype(y) else y.mode()[0])

            # Train/test split
            self.progress.emit(45, "Splitting data into train/test sets...")

            # Make sure we still have features to work with
            if X.shape[1] == 0:
                raise ValueError("No features remain after preprocessing. Try selecting a different target column.")

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Check if canceled
            if self._is_canceled:
                return

            # Train a tree-based model for feature importance
            self.progress.emit(50, "Training RandomForest model...")

            # Check the number of features left for analysis
            feature_count = X_train.shape[1]

            # Adjust model complexity based on feature count
            if feature_count < 3:
                max_depth = 3  # Simple trees for few features
                n_estimators = 10  # Use more trees to compensate
            else:
                max_depth = 5  # Moderate depth trees
                n_estimators = 10  # Balanced number of trees

            model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=5,  # Prevent overfitting
                min_samples_leaf=2,  # Prevent overfitting
                max_features='sqrt',  # Use subset of features per tree
                n_jobs=1,  # Single thread to avoid overhead
                random_state=42,
                verbose=0  # Suppress output
            )

            # Set simpler parameters for large feature sets
            if X_train.shape[1] > 100:  # If there are many features
                self.progress.emit(55, "Large feature set detected, using simpler model...")
                model.set_params(n_estimators=5, max_depth=3)

            # Fit model with a try/except to catch memory issues
            try:
                model.fit(X_train, y_train)
            except Exception as e:
                # Log the error for debugging
                print(f"Initial RandomForest fit failed: {str(e)}")

                # If we encounter an error, try with an even smaller and simpler model
                self.progress.emit(55, "Adjusting model parameters due to computational constraints...")
                try:
                    # Try a simpler regressor with more conservative parameters
                    model = RandomForestRegressor(
                        n_estimators=3,
                        max_depth=2,
                        max_features='sqrt',
                        n_jobs=1,
                        random_state=42,
                        verbose=0
                    )
                    model.fit(X_train, y_train)
                except Exception as inner_e:
                    # If even the simpler model fails, resort to a fallback strategy
                    print(f"Even simpler RandomForest failed: {str(inner_e)}")
                    self.progress.emit(60, "Using fallback importance calculation method...")

                    # Create a basic feature importance based on correlation with target
                    # This is a simple fallback when model training fails
                    importance = []
                    for col in X.columns:
                        try:
                            # Use pre-calculated correlations for fallback importance
                            corr_value = correlations.get(col, 0.5)
                            # Scale correlation to make a reasonable importance value
                            # Higher correlation = higher importance
                            importance.append(0.5 + corr_value / 2 if not pd.isna(corr_value) else 0.5)
                        except Exception:
                            # If correlation fails, use default
                            importance.append(0.5)

                    # Normalize to sum to 1
                    importance = np.array(importance)
                    if sum(importance) > 0:
                        importance = importance / sum(importance)
                    else:
                        # Equal importance if everything fails
                        importance = np.ones(len(X.columns)) / len(X.columns)

                    # Skip the model-based code path since we calculated importances manually
                    self.progress.emit(80, "Creating importance results...")
                    feature_importance = pd.DataFrame({
                        'feature': X.columns,
                        'importance_value': importance,
                        'correlation': [correlations.get(col, 0.0) for col in X.columns]
                    }).sort_values(by='importance_value', ascending=False)

                    # Cache the results for future use
                    self.progress.emit(95, "Caching results for future use...")
                    cache_results(self.df, self.column, feature_importance)

                    # Clean up after computation
                    del df, X, y, X_train, X_test, y_train, y_test
                    gc.collect()

                    # Check if canceled
                    if self._is_canceled:
                        return

                    # Emit the result
                    self.progress.emit(100, "Analysis complete (fallback method)")
                    self.result.emit(feature_importance)
                    return

            # Check if canceled
            if self._is_canceled:
                return

            # Get feature importance from the trained model
            self.progress.emit(80, "Calculating feature importance and correlations...")

            try:
                # Check if we have features to analyze
                if X.shape[1] == 0:
                    raise ValueError("No features available for importance analysis")

                # Get feature importance from RandomForest
                importance = model.feature_importances_

                # Verify importance values are valid
                if np.isnan(importance).any() or np.isinf(importance).any():
                    # Handle NaN or Inf values
                    print("Warning: Invalid importance values detected, using fallback method")
                    # Replace with equal importance
                    importance = np.ones(len(X.columns)) / len(X.columns)

                # Create and sort the importance dataframe with correlations
                feature_importance = pd.DataFrame({
                    'feature': X.columns,
                    'importance_value': importance,
                    'correlation': [correlations.get(col, 0.0) for col in X.columns]
                }).sort_values(by='importance_value', ascending=False)

                # Cache the results for future use
                self.progress.emit(95, "Caching results for future use...")
                cache_results(self.df, self.column, feature_importance)

                # Clean up after computation
                del df, X, y, X_train, X_test, y_train, y_test, model
                gc.collect()

                # Check if canceled
                if self._is_canceled:
                    return

                # Emit the result
                self.progress.emit(100, "Analysis complete")
                self.result.emit(feature_importance)
                return

            except Exception as e:
                print(f"Error in feature importance calculation: {e}")
                import traceback
                traceback.print_exc()

                # Create fallback importance values when the model-based approach fails
                self.progress.emit(85, "Using alternative importance calculation method...")

                try:
                    # Try correlation-based approach first
                    importance = []
                    has_valid_correlations = False

                    for col in X.columns:
                        try:
                            # Use pre-calculated correlations
                            corr = correlations.get(col, 0.1)
                            if not pd.isna(corr):
                                importance.append(corr)
                                has_valid_correlations = True
                            else:
                                importance.append(0.1)  # Default for failed correlation
                        except Exception:
                            # Default value for any error
                            importance.append(0.1)

                    # Normalize importance values
                    importance = np.array(importance)
                    if has_valid_correlations and sum(importance) > 0:
                        # If we have valid correlations, use them normalized
                        importance = importance / max(sum(importance), 0.001)
                    else:
                        # Otherwise use frequency-based heuristic
                        print("Using frequency-based feature importance as fallback")
                        # Count unique values as a proxy for importance
                        importance = []
                        total_rows = len(X)

                        for col in X.columns:
                            try:
                                # More unique values could indicate more information content,
                                # but we invert the ratio so columns with fewer unique values
                                # (more predictive) get higher importance
                                uniqueness = X[col].nunique() / total_rows
                                # Invert and scale between 0.1 and 1.0
                                val = 1.0 - (0.9 * uniqueness)
                                importance.append(max(0.1, min(1.0, val)))
                            except Exception:
                                importance.append(0.1)  # Default value

                        # Normalize
                        importance = np.array(importance)
                        importance = importance / max(sum(importance), 0.001)

                except Exception as fallback_error:
                    # Last resort: create equal importance for all features
                    print(f"Fallback error: {fallback_error}, using equal importance")
                    importance_values = np.ones(len(X.columns)) / max(len(X.columns), 1)
                    importance = importance_values

                # Create dataframe with results, including correlations
                feature_importance = pd.DataFrame({
                    'feature': X.columns,
                    'importance_value': importance,
                    'correlation': [correlations.get(col, 0.0) for col in X.columns]
                }).sort_values(by='importance_value', ascending=False)

                # Cache the results
                try:
                    cache_results(self.df, self.column, feature_importance)
                except Exception:
                    pass  # Ignore cache errors

                # Clean up
                try:
                    del df, X, y, X_train, X_test, y_train, y_test
                    gc.collect()
                except Exception:
                    pass

                # Emit the result
                self.progress.emit(100, "Analysis complete (with fallback methods)")
                self.result.emit(feature_importance)
                return

        except IndexError as e:
            # Handle index errors with more detail
            import traceback
            import inspect
            trace = traceback.format_exc()

            # Get more detailed information
            frame = inspect.trace()[-1]
            frame_info = inspect.getframeinfo(frame[0])
            filename = frame_info.filename
            lineno = frame_info.lineno
            function = frame_info.function
            code_context = frame_info.code_context[0].strip() if frame_info.code_context else "Unknown code context"

            # Format a more detailed error message
            detail_msg = f"IndexError: {str(e)}\nLocation: {filename}:{lineno} in function '{function}'\nCode: {code_context}\n\n{trace}"
            print(detail_msg)  # Print to console for debugging

            if not self._is_canceled:
                self.error.emit(f"Index error at line {lineno} in {function}:\n{str(e)}\nCode: {code_context}")

        except Exception as e:
            if not self._is_canceled:  # Only emit error if not canceled
                import traceback
                trace = traceback.format_exc()
                print(f"Error in ExplainerThread: {str(e)}")
                print(trace)  # Print full stack trace to help debug
                self.error.emit(f"{str(e)}\n\nTrace: {trace}")

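When the fit in run() succeeds, the importances come from RandomForestRegressor.feature_importances_, which scikit-learn normalizes to sum to 1 (the same normalization the fallback paths re-create by hand). A standalone illustration on synthetic data (hypothetical, not part of the package):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.default_rng(42)
    X = rng.normal(size=(300, 3))
    y = 3.0 * X[:, 0] + rng.normal(scale=0.1, size=300)  # only the first feature drives y
    model = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42).fit(X, y)
    print(model.feature_importances_)        # first value dominates
    print(model.feature_importances_.sum())  # ~1.0
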
    def analyze_column(self):
        if self.df is None or self.column_selector.currentText() == "":
            return

        # Cancel any existing worker thread
        if self.worker_thread and self.worker_thread.isRunning():
            # Signal the thread to cancel
            self.worker_thread.cancel()

            try:
                # Disconnect all signals to avoid callbacks during termination
                self.worker_thread.progress.disconnect()
                self.worker_thread.result.disconnect()
                self.worker_thread.error.disconnect()
                self.worker_thread.finished.disconnect()
            except Exception:
                pass  # Already disconnected

            # Terminate thread properly
            self.worker_thread.terminate()
            self.worker_thread.wait(1000)  # Wait up to 1 second
            self.worker_thread = None  # Clear reference

        target_column = self.column_selector.currentText()

        # Check in-memory cache first (fastest)
        if target_column in self.result_cache:
            self.handle_results(self.result_cache[target_column])
            return

        # Check global application-wide cache second (still fast)
        global_key = get_cache_key(self.df, target_column)
        if global_key in ColumnProfilerApp.global_cache:
            self.result_cache[target_column] = ColumnProfilerApp.global_cache[global_key]
            self.handle_results(self.result_cache[target_column])
            return

        # Disk cache will be checked in the worker thread

        # Disable the analyze button while processing
        self.analyze_button.setEnabled(False)

        # Show progress indicators
        self.progress_bar.setValue(0)
        self.progress_bar.show()
        self.progress_label.setText("Starting analysis...")
        self.progress_label.show()
        self.cancel_button.show()

        # Create and start the worker thread
        self.worker_thread = ExplainerThread(self.df, target_column)
        self.worker_thread.progress.connect(self.update_progress)
        self.worker_thread.result.connect(self.cache_and_display_results)
        self.worker_thread.error.connect(self.handle_error)
        self.worker_thread.finished.connect(self.on_analysis_finished)
        self.worker_thread.start()

    def update_progress(self, value, message):
        self.progress_bar.setValue(value)
        self.progress_label.setText(message)

    def cache_and_display_results(self, importance_df):
        # Cache the results
        target_column = self.column_selector.currentText()
        self.result_cache[target_column] = importance_df

        # Also cache in the global application cache
        global_key = get_cache_key(self.df, target_column)
        ColumnProfilerApp.global_cache[global_key] = importance_df

        # Display the results
        self.handle_results(importance_df)

    def on_analysis_finished(self):
        """Handle cleanup when analysis is finished (either completed or cancelled)"""
        self.analyze_button.setEnabled(True)
        self.cancel_button.hide()

    def handle_results(self, importance_df):
        # Hide progress indicators
        self.progress_bar.hide()
        self.progress_label.hide()
        self.cancel_button.hide()

        # Update importance table to include correlation column
        self.importance_table.setColumnCount(3)
        self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
        self.importance_table.setRowCount(len(importance_df))

        # Using a timer for incremental updates
        self.importance_df = importance_df  # Store for incremental rendering
        self.current_row = 0
        self.render_timer = QTimer()
        self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
        self.render_timer.start(10)  # Update every 10ms

    def render_next_batch(self, batch_size):
        try:
            if self.current_row >= len(self.importance_df):
                # All rows rendered, now render the chart and stop the timer
                self.render_chart()
                self.render_timer.stop()
                return

            # Render a batch of rows
            end_row = min(self.current_row + batch_size, len(self.importance_df))
            for row in range(self.current_row, end_row):
                try:
                    # Check if row exists in dataframe to prevent index errors
                    if row < len(self.importance_df):
                        feature = self.importance_df.iloc[row]['feature']
                        importance_value = self.importance_df.iloc[row]['importance_value']

                        # Add correlation if available
                        correlation = self.importance_df.iloc[row].get('correlation', None)
                        if correlation is not None:
                            self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
                            self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
                            self.importance_table.setItem(row, 2, QTableWidgetItem(str(round(correlation, 4))))
                        else:
                            self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
                            self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
                    else:
                        # Handle out of range index
                        print(f"Warning: Row {row} is out of range (max: {len(self.importance_df)-1})")
                        self.importance_table.setItem(row, 0, QTableWidgetItem("Error"))
                        self.importance_table.setItem(row, 1, QTableWidgetItem("Out of range"))
                        self.importance_table.setItem(row, 2, QTableWidgetItem("N/A"))
                except (IndexError, KeyError) as e:
                    # Enhanced error reporting for index and key errors
                    import traceback
                    trace = traceback.format_exc()
                    error_msg = f"Error rendering row {row}: {e.__class__.__name__}: {e}\n{trace}"
                    print(error_msg)

                    # Handle missing data in the dataframe gracefully
                    self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
                    self.importance_table.setItem(row, 1, QTableWidgetItem(f"{str(e)[:20]}"))
                    self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
                except Exception as e:
                    # Catch any other exceptions
                    print(f"Unexpected error rendering row {row}: {e.__class__.__name__}: {e}")
                    self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
                    self.importance_table.setItem(row, 1, QTableWidgetItem("See console for details"))
                    self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))

            self.current_row = end_row
            QApplication.processEvents()  # Allow UI to update
        except Exception as e:
            # Catch any exceptions in the rendering loop itself
            import traceback
            trace = traceback.format_exc()
            error_msg = f"Error in render_next_batch: {e.__class__.__name__}: {e}\n{trace}"
            print(error_msg)

            # Try to stop the timer to prevent further errors
            try:
                if self.render_timer and self.render_timer.isActive():
                    self.render_timer.stop()
            except Exception:
                pass

            # Show error
            QMessageBox.critical(self, "Rendering Error",
                                 f"Error rendering results: {e.__class__.__name__}: {e}")

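The QTimer-driven batching in render_next_batch keeps the UI responsive by handing control back to the event loop between small batches. The same pattern in a minimal, self-contained form (a hypothetical console example; QCoreApplication is used so it runs headless):

    import sys
    from PyQt6.QtCore import QCoreApplication, QTimer

    app = QCoreApplication(sys.argv)
    rows = list(range(35))
    pos = 0

    def render_next_batch(batch_size=10):
        global pos
        if pos >= len(rows):
            timer.stop()  # all rows rendered
            app.quit()
            return
        end = min(pos + batch_size, len(rows))
        print(f"rendered rows {pos}..{end - 1}")
        pos = end

    timer = QTimer()
    timer.timeout.connect(lambda: render_next_batch(10))
    timer.start(10)  # fire roughly every 10 ms; the event loop stays free between batches
    sys.exit(app.exec())
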
    def render_chart(self):
        # Create horizontal bar chart
        try:
            if self.importance_df is None or len(self.importance_df) == 0:
                # No data to render
                self.chart_view.axes.clear()
                self.chart_view.axes.text(0.5, 0.5, "No data available for chart",
                                          ha='center', va='center', fontsize=12, color='gray')
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
                return

            self.chart_view.axes.clear()

            # Get a sorted copy based on current sort key
            plot_df = self.importance_df.sort_values(by=self.current_sort, ascending=False).head(20).copy()

            # Verify we have data before proceeding
            if len(plot_df) == 0:
                self.chart_view.axes.text(0.5, 0.5, "No features found with importance values",
                                          ha='center', va='center', fontsize=12, color='gray')
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
                return

            # Check required columns exist
            required_columns = ['feature', 'importance_value']
            missing_columns = [col for col in required_columns if col not in plot_df.columns]
            if missing_columns:
                error_msg = f"Missing required columns: {', '.join(missing_columns)}"
                self.chart_view.axes.text(0.5, 0.5, error_msg,
                                          ha='center', va='center', fontsize=12, color='red')
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
                print(f"Chart rendering error: {error_msg}")
                return

            # Truncate long feature names for better display
            max_feature_length = 30
            plot_df['display_feature'] = plot_df['feature'].apply(
                lambda x: (str(x)[:max_feature_length] + '...') if len(str(x)) > max_feature_length else str(x)
            )

            # Reverse order for better display (highest at top)
            plot_df = plot_df.iloc[::-1].reset_index(drop=True)

            # Create a figure with two subplots side by side
            self.chart_view.figure.clear()
            gs = self.chart_view.figure.add_gridspec(1, 2, width_ratios=[3, 2])

            # First subplot for importance
            ax1 = self.chart_view.figure.add_subplot(gs[0, 0])

            # Create a colormap for better visualization
            cmap = plt.cm.Blues
            colors = cmap(np.linspace(0.4, 0.8, len(plot_df)))

            # Plot with custom colors
            bars = ax1.barh(
                plot_df['display_feature'],
                plot_df['importance_value'],
                color=colors,
                height=0.7,  # Thinner bars for more spacing
                alpha=0.8
            )

            # Add values at the end of bars
            for bar in bars:
                width = bar.get_width()
                ax1.text(
                    width * 1.05,
                    bar.get_y() + bar.get_height()/2,
                    f'{width:.2f}',
                    va='center',
                    fontsize=9,
                    fontweight='bold'
                )

            # Add grid for better readability
            ax1.grid(True, axis='x', linestyle='--', alpha=0.3)

            # Remove unnecessary spines
            for spine in ['top', 'right']:
                ax1.spines[spine].set_visible(False)

            # Make labels more readable
            ax1.tick_params(axis='y', labelsize=9)

            # Set title and labels
            ax1.set_title(f'Feature Importance for {self.column_selector.currentText()}')
            ax1.set_xlabel('Importance Value')

            # Add a note about the sorting order
            sort_label = "Sorted by: " + ("Importance" if self.current_sort == 'importance_value' else "Correlation")

            # Second subplot for correlation if available
            if 'correlation' in plot_df.columns:
                ax2 = self.chart_view.figure.add_subplot(gs[0, 1], sharey=ax1)

                # Create a colormap for correlation - use a different color
                cmap_corr = plt.cm.Reds
                colors_corr = cmap_corr(np.linspace(0.4, 0.8, len(plot_df)))

                # Plot correlation bars
                corr_bars = ax2.barh(
                    plot_df['display_feature'],
                    plot_df['correlation'],
                    color=colors_corr,
                    height=0.7,
                    alpha=0.8
                )

                # Add values at the end of correlation bars
                for bar in corr_bars:
                    width = bar.get_width()
                    ax2.text(
                        width * 1.05,
                        bar.get_y() + bar.get_height()/2,
                        f'{width:.2f}',
                        va='center',
                        fontsize=9,
                        fontweight='bold'
                    )

                # Add grid and styling
                ax2.grid(True, axis='x', linestyle='--', alpha=0.3)
                ax2.set_title('Absolute Correlation')
                ax2.set_xlabel('Correlation Value')

                # Hide y-axis labels since they're shared with the first plot
                ax2.set_yticklabels([])

                # Remove unnecessary spines
                for spine in ['top', 'right']:
                    ax2.spines[spine].set_visible(False)

            # Add a note about the current sort order
            self.chart_view.figure.text(0.5, 0.01, sort_label, ha='center', fontsize=9, style='italic')

            # Adjust figure size based on number of features
            feature_count = len(plot_df)
            self.chart_view.figure.set_figheight(max(5, min(4 + feature_count * 0.3, 12)))

            # Adjust layout and draw
            self.chart_view.figure.tight_layout(rect=[0, 0.03, 1, 0.97])  # Make room for sort label
            self.chart_view.draw()

        except IndexError as e:
            # Special handling for index errors with detailed information
            import traceback
            import inspect

            # Get stack trace information
            trace = traceback.format_exc()

            # Try to get line and context information
            try:
                frame = inspect.trace()[-1]
                frame_info = inspect.getframeinfo(frame[0])
                filename = frame_info.filename
                lineno = frame_info.lineno
                function = frame_info.function
                code_context = frame_info.code_context[0].strip() if frame_info.code_context else "Unknown code context"

                # Detailed error message
                detail_msg = f"IndexError at line {lineno} in {function}: {str(e)}\nCode: {code_context}"
                print(f"Chart rendering error: {detail_msg}\n{trace}")

                # Display error in chart
                self.chart_view.axes.clear()
                self.chart_view.axes.text(0.5, 0.5,
                                          f"Index Error in chart rendering:\n{str(e)}\nAt line {lineno}: {code_context}",
                                          ha='center', va='center', fontsize=12, color='red',
                                          wrap=True)
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
            except Exception as inner_e:
                # Fallback if the detailed error reporting fails
                print(f"Error getting detailed error info: {inner_e}")
                print(f"Original error: {e}\n{trace}")

                self.chart_view.axes.clear()
                self.chart_view.axes.text(0.5, 0.5, f"Index Error: {str(e)}",
                                          ha='center', va='center', fontsize=12, color='red')
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
        except Exception as e:
            # Recover gracefully from any chart rendering errors with detailed information
            import traceback
            trace = traceback.format_exc()
            error_msg = f"Error rendering chart: {e.__class__.__name__}: {str(e)}"
            print(f"{error_msg}\n{trace}")

            self.chart_view.axes.clear()
            self.chart_view.axes.text(0.5, 0.5, error_msg,
                                      ha='center', va='center', fontsize=12, color='red',
                                      wrap=True)
            self.chart_view.axes.set_axis_off()
            self.chart_view.draw()

    def handle_error(self, error_message):
        """Handle errors during analysis"""
        # Hide progress indicators
        self.progress_bar.hide()
        self.progress_label.hide()
        self.cancel_button.hide()

        # Re-enable analyze button
        self.analyze_button.setEnabled(True)

        # Print error to console for debugging
        print(f"Error in column profiler: {error_message}")

        # Show error notification
        show_error_notification(f"Analysis Error: {error_message.split(chr(10))[0] if chr(10) in error_message else error_message}")

        # Show a message in the UI as well
        self.importance_table.setRowCount(1)
        self.importance_table.setColumnCount(3)
        self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
        self.importance_table.setItem(0, 0, QTableWidgetItem(f"Error: {error_message.split(chr(10))[0]}"))
        self.importance_table.setItem(0, 1, QTableWidgetItem(""))
        self.importance_table.setItem(0, 2, QTableWidgetItem(""))
        self.importance_table.resizeColumnsToContents()

        # Update the chart to show error
        self.chart_view.axes.clear()
        self.chart_view.axes.text(0.5, 0.5, f"Error calculating importance:\n{error_message.split(chr(10))[0]}",
                                  ha='center', va='center', fontsize=12, color='red',
                                  wrap=True)
        self.chart_view.axes.set_axis_off()
        self.chart_view.draw()

    def closeEvent(self, event):
        """Clean up when the window is closed"""
        # Stop any running timer
        if self.render_timer and self.render_timer.isActive():
            self.render_timer.stop()

        # Clean up any background threads
        if self.worker_thread and self.worker_thread.isRunning():
            # Disconnect all signals to avoid callbacks during termination
            try:
                self.worker_thread.progress.disconnect()
                self.worker_thread.result.disconnect()
                self.worker_thread.error.disconnect()
                self.worker_thread.finished.disconnect()
            except Exception:
                pass  # Already disconnected

            # Terminate thread properly
            self.worker_thread.terminate()
            self.worker_thread.wait(1000)  # Wait up to 1 second

        # Clear references to prevent thread issues
        self.worker_thread = None

        # Clean up memory
        self.result_cache.clear()

        # Accept the close event
        event.accept()

        # Suggest garbage collection
        gc.collect()

    def cancel_analysis(self):
        """Cancel the current analysis"""
        if self.worker_thread and self.worker_thread.isRunning():
            # Signal the thread to cancel first
            self.worker_thread.cancel()

            # Disconnect all signals to avoid callbacks during termination
            try:
                self.worker_thread.progress.disconnect()
                self.worker_thread.result.disconnect()
                self.worker_thread.error.disconnect()
                self.worker_thread.finished.disconnect()
            except Exception:
                pass  # Already disconnected

            # Terminate thread properly
            self.worker_thread.terminate()
            self.worker_thread.wait(1000)  # Wait up to 1 second

            # Clear reference
            self.worker_thread = None

        # Update UI
        self.progress_bar.hide()
        self.progress_label.setText("Analysis cancelled")
        self.progress_label.show()
        self.cancel_button.hide()
        self.analyze_button.setEnabled(True)

        # Hide the progress label after 2 seconds
        QTimer.singleShot(2000, self.progress_label.hide)

    def show_relationship_visualization(self, row, column):
        """Show visualization of relationship between selected feature and target column"""
        if self.importance_df is None or row < 0 or row >= len(self.importance_df):
            return

        # Get the feature name and target column
        try:
            feature = self.importance_df.iloc[row]['feature']
            target = self.column_selector.currentText()

            # Verify both columns exist in the dataframe
            if feature not in self.df.columns:
                QMessageBox.warning(self, "Column Not Found",
                                    f"Feature column '{feature}' not found in the dataframe")
                return

            if target not in self.df.columns:
                QMessageBox.warning(self, "Column Not Found",
                                    f"Target column '{target}' not found in the dataframe")
                return
        except Exception as e:
            QMessageBox.critical(self, "Error", f"Error getting column data: {str(e)}")
            return

        # Create a dialog to show the visualization
        dialog = QDialog(self)
        dialog.setWindowTitle(f"Relationship: {feature} vs {target}")
        dialog.resize(900, 700)

        # Create layout
        layout = QVBoxLayout(dialog)

        # Create canvas for the plot
        canvas = MatplotlibCanvas(width=8, height=6, dpi=100)
        layout.addWidget(canvas)

        # Determine the data types
        feature_is_numeric = pd.api.types.is_numeric_dtype(self.df[feature])
        target_is_numeric = pd.api.types.is_numeric_dtype(self.df[target])

        # Get unique counts to determine if we have high cardinality
        feature_unique_count = self.df[feature].nunique()
        target_unique_count = self.df[target].nunique()

        # Define high cardinality threshold
        high_cardinality_threshold = 10

        # Clear the figure
        canvas.axes.clear()

        # Create a working copy of the dataframe
        working_df = self.df.copy()

        # Prepare data for high cardinality columns
        if not feature_is_numeric and feature_unique_count > high_cardinality_threshold:
            # Get the top N categories by frequency
            top_categories = self.df[feature].value_counts().nlargest(high_cardinality_threshold).index.tolist()
            # Create "Other" category for remaining values
            working_df[feature] = working_df[feature].apply(lambda x: x if x in top_categories else 'Other')

        if not target_is_numeric and target_unique_count > high_cardinality_threshold:
            top_categories = self.df[target].value_counts().nlargest(high_cardinality_threshold).index.tolist()
            working_df[target] = working_df[target].apply(lambda x: x if x in top_categories else 'Other')

        # Create appropriate visualization based on data types and cardinality
        if feature_is_numeric and target_is_numeric:
            # Scatter plot for numeric vs numeric
            # Use hexbin for large datasets to avoid overplotting
            if len(working_df) > 100:
                canvas.axes.hexbin(
                    working_df[feature],
                    working_df[target],
                    gridsize=25,
                    cmap='Blues',
                    mincnt=1
                )
                canvas.axes.set_title(f"Hexbin Density Plot: {feature} vs {target}")
                canvas.axes.set_xlabel(feature)
                canvas.axes.set_ylabel(target)
                # Add a colorbar
                cbar = canvas.figure.colorbar(canvas.axes.collections[0], ax=canvas.axes)
                cbar.set_label('Count')
            else:
                # For smaller datasets, use a scatter plot with transparency
                sns.scatterplot(
                    x=feature,
                    y=target,
                    data=working_df,
                    ax=canvas.axes,
                    alpha=0.6
                )
                # Add regression line
                sns.regplot(
                    x=feature,
                    y=target,
                    data=working_df,
                    ax=canvas.axes,
                    scatter=False,
                    line_kws={"color": "red"}
                )
                canvas.axes.set_title(f"Scatter Plot: {feature} vs {target}")

        elif feature_is_numeric and not target_is_numeric:
            # Box plot for numeric vs categorical
            if target_unique_count <= high_cardinality_threshold * 2:
                # Standard boxplot for reasonable number of categories
                order = working_df[target].value_counts().nlargest(high_cardinality_threshold * 2).index

                # Calculate counts for each category
                category_counts = working_df[target].value_counts()

                sns.boxplot(
                    x=target,
                    y=feature,
                    data=working_df,
                    ax=canvas.axes,
                    order=order
                )
                canvas.axes.set_title(f"Box Plot: {feature} by {target}")

                # Add count annotations below each box
                for i, category in enumerate(order):
                    if category in category_counts:
                        count = category_counts[category]
                        canvas.axes.text(
                            i,
                            canvas.axes.get_ylim()[0] - (canvas.axes.get_ylim()[1] - canvas.axes.get_ylim()[0]) * 0.05,
                            f'n={count}',
                            ha='center',
                            va='top',
                            fontsize=8,
                            fontweight='bold'
                        )

                # Rotate x-axis labels for better readability
                canvas.axes.set_xticklabels(
                    canvas.axes.get_xticklabels(),
                    rotation=45,
                    ha='right'
                )
            else:
                # For very high cardinality, use a violin plot with limited categories
                order = working_df[target].value_counts().nlargest(high_cardinality_threshold).index
                working_df_filtered = working_df[working_df[target].isin(order)]

                # Calculate counts for filtered categories
                category_counts = working_df_filtered[target].value_counts()

                sns.violinplot(
                    x=target,
                    y=feature,
                    data=working_df_filtered,
                    ax=canvas.axes,
                    inner='quartile',
                    cut=0
                )
                canvas.axes.set_title(f"Violin Plot: {feature} by Top {len(order)} {target} Categories")

                # Add count annotations below each violin
                for i, category in enumerate(order):
                    if category in category_counts:
                        count = category_counts[category]
                        canvas.axes.text(
                            i,
                            canvas.axes.get_ylim()[0] - (canvas.axes.get_ylim()[1] - canvas.axes.get_ylim()[0]) * 0.05,
                            f'n={count}',
                            ha='center',
                            va='top',
                            fontsize=8,
                            fontweight='bold'
                        )

                canvas.axes.set_xticklabels(
                    canvas.axes.get_xticklabels(),
                    rotation=45,
                    ha='right'
                )

        elif not feature_is_numeric and target_is_numeric:
            # Bar plot for categorical vs numeric
            if feature_unique_count <= high_cardinality_threshold * 2:
                # Use standard barplot for reasonable number of categories
                order = working_df[feature].value_counts().nlargest(high_cardinality_threshold * 2).index

                # Calculate counts for each category for annotations
                category_counts = working_df[feature].value_counts()

                sns.barplot(
                    x=feature,
                    y=target,
                    data=working_df,
                    ax=canvas.axes,
                    order=order,
                    estimator=np.mean,
                    errorbar=('ci', 95),
                    capsize=0.2
                )
                canvas.axes.set_title(f"Bar Plot: Average {target} by {feature}")

                # Add value labels and counts on top of bars
                for i, p in enumerate(canvas.axes.patches):
                    # Get the category name for this bar
                    if i < len(order):
                        category = order[i]
                        count = category_counts[category]

                        # Add mean value and count
                        canvas.axes.annotate(
                            f'{p.get_height():.1f}\n(n={count})',
                            (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center',
                            va='bottom',
                            fontsize=8,
                            rotation=0
                        )

                # Rotate x-axis labels if needed
                if feature_unique_count > 5:
                    canvas.axes.set_xticklabels(
                        canvas.axes.get_xticklabels(),
                        rotation=45,
                        ha='right'
                    )
            else:
                # For high cardinality, use a horizontal bar plot with top N categories
                top_n = 15  # Show top 15 categories
                # Calculate mean of target for each feature category
                grouped = working_df.groupby(feature)[target].agg(['mean', 'count', 'std']).reset_index()
                # Sort by mean and take top categories
                top_groups = grouped.nlargest(top_n, 'mean')

                # Sort by mean value for better visualization
                sns.barplot(
                    y=feature,
                    x='mean',
                    data=top_groups,
                    ax=canvas.axes,
                    orient='h'
                )
                canvas.axes.set_title(f"Top {top_n} Categories by Average {target}")
                canvas.axes.set_xlabel(f"Average {target}")

                # Add count annotations
                for i, row in enumerate(top_groups.itertuples()):
                    canvas.axes.text(
                        row.mean + 0.1,
                        i,
                        f'n={row.count}',
                        va='center',
                        fontsize=8
                    )

        else:
            # Both feature and target are categorical
            if feature_unique_count <= high_cardinality_threshold and target_unique_count <= high_cardinality_threshold:
|
|
1334
|
+
# Heatmap for categorical vs categorical with manageable cardinality
|
|
1335
|
+
crosstab = pd.crosstab(
|
|
1336
|
+
working_df[feature],
|
|
1337
|
+
working_df[target],
|
|
1338
|
+
normalize='index'
|
|
1339
|
+
)
|
|
1340
|
+
|
|
1341
|
+
# Create heatmap with improved readability
|
|
1342
|
+
sns.heatmap(
|
|
1343
|
+
crosstab,
|
|
1344
|
+
annot=True,
|
|
1345
|
+
cmap="YlGnBu",
|
|
1346
|
+
ax=canvas.axes,
|
|
1347
|
+
fmt='.2f',
|
|
1348
|
+
linewidths=0.5,
|
|
1349
|
+
annot_kws={"size": 9 if crosstab.size < 30 else 7}
|
|
1350
|
+
)
|
|
1351
|
+
canvas.axes.set_title(f"Heatmap: {feature} vs {target} (proportions)")
|
|
1352
|
+
else:
|
|
1353
|
+
# For high cardinality in both, show a count plot of top categories
|
|
1354
|
+
feature_top = working_df[feature].value_counts().nlargest(8).index
|
|
1355
|
+
target_top = working_df[target].value_counts().nlargest(5).index
|
|
1356
|
+
|
|
1357
|
+
# Filter data to only include top categories
|
|
1358
|
+
filtered_df = working_df[
|
|
1359
|
+
working_df[feature].isin(feature_top) &
|
|
1360
|
+
working_df[target].isin(target_top)
|
|
1361
|
+
]
|
|
1362
|
+
|
|
1363
|
+
# Create a grouped count plot
|
|
1364
|
+
ax_plot = sns.countplot(
|
|
1365
|
+
x=feature,
|
|
1366
|
+
hue=target,
|
|
1367
|
+
data=filtered_df,
|
|
1368
|
+
ax=canvas.axes
|
|
1369
|
+
)
|
|
1370
|
+
canvas.axes.set_title(f"Count Plot: Top {len(feature_top)} {feature} by Top {len(target_top)} {target}")
|
|
1371
|
+
|
|
1372
|
+
# Add count labels on top of bars
|
|
1373
|
+
for p in canvas.axes.patches:
|
|
1374
|
+
if p.get_height() > 0: # Only add labels for non-zero bars
|
|
1375
|
+
canvas.axes.annotate(
|
|
1376
|
+
f'{int(p.get_height())}',
|
|
1377
|
+
(p.get_x() + p.get_width() / 2., p.get_height()),
|
|
1378
|
+
ha='center',
|
|
1379
|
+
va='bottom',
|
|
1380
|
+
fontsize=8,
|
|
1381
|
+
rotation=0
|
|
1382
|
+
)
|
|
1383
|
+
|
|
1384
|
+
# Rotate x-axis labels
|
|
1385
|
+
canvas.axes.set_xticklabels(
|
|
1386
|
+
canvas.axes.get_xticklabels(),
|
|
1387
|
+
rotation=45,
|
|
1388
|
+
ha='right'
|
|
1389
|
+
)
|
|
1390
|
+
|
|
1391
|
+
# Move legend to a better position
|
|
1392
|
+
canvas.axes.legend(title=target, bbox_to_anchor=(1.05, 1), loc='upper left')
|
|
1393
|
+
|
|
1394
|
+
# Add informational text about data reduction if applicable
|
|
1395
|
+
if (not feature_is_numeric and feature_unique_count > high_cardinality_threshold) or \
|
|
1396
|
+
(not target_is_numeric and target_unique_count > high_cardinality_threshold):
|
|
1397
|
+
canvas.figure.text(
|
|
1398
|
+
0.5, 0.01,
|
|
1399
|
+
f"Note: Visualization simplified to show top categories only. Original data has {feature_unique_count} unique {feature} values and {target_unique_count} unique {target} values.",
|
|
1400
|
+
ha='center',
|
|
1401
|
+
fontsize=8,
|
|
1402
|
+
style='italic'
|
|
1403
|
+
)
|
|
1404
|
+
|
|
1405
|
+
# Adjust layout and draw
|
|
1406
|
+
canvas.figure.tight_layout()
|
|
1407
|
+
canvas.draw()
|
|
1408
|
+
|
|
1409
|
+
# Add a close button
|
|
1410
|
+
close_button = QPushButton("Close")
|
|
1411
|
+
close_button.clicked.connect(dialog.accept)
|
|
1412
|
+
layout.addWidget(close_button)
|
|
1413
|
+
|
|
1414
|
+
# Show the dialog
|
|
1415
|
+
dialog.exec()
|
|
1416
|
+
|
|
1417
|
+
    def change_sort(self, sort_key):
        """Change the sort order of the results"""
        if self.importance_df is None:
            return

        # Update button states
        if sort_key == 'importance_value':
            self.importance_sort_btn.setChecked(True)
            self.correlation_sort_btn.setChecked(False)
        else:
            self.importance_sort_btn.setChecked(False)
            self.correlation_sort_btn.setChecked(True)

        # Store the current sort key
        self.current_sort = sort_key

        # Re-sort the dataframe
        self.importance_df = self.importance_df.sort_values(by=sort_key, ascending=False)

        # Reset rendering of the table
        self.importance_table.clearContents()
        self.importance_table.setRowCount(len(self.importance_df))
        self.current_row = 0

        # Start incremental rendering with the new sort order
        if self.render_timer and self.render_timer.isActive():
            self.render_timer.stop()
        self.render_timer = QTimer()
        self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
        self.render_timer.start(10)  # Update every 10ms

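# Note: global_cache on ColumnProfilerApp below is a class attribute, so
# importance results computed in one profiler window are reused by every other
# ColumnProfilerApp window in the same process, not just the window that
# computed them.
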
# Main application class
class ColumnProfilerApp(QMainWindow):
    # Global application-wide cache to prevent redundant computations
    global_cache = {}

    def __init__(self, df):
        super().__init__()

        # Store reference to data
        self.df = df

        # Initialize cache for results
        self.result_cache = {}

        # Initialize thread variable
        self.worker_thread = None

        # Variables for incremental rendering
        self.importance_df = None
        self.current_row = 0
        self.render_timer = None

        # Current sort key
        self.current_sort = 'importance_value'

        # Set window properties
        self.setWindowTitle("Column Profiler")
        self.setMinimumSize(900, 600)

        # Create central widget and main layout
        central_widget = QWidget()
        main_layout = QVBoxLayout(central_widget)

        # Create top control panel
        control_panel = QWidget()
        control_layout = QHBoxLayout(control_panel)

        # Column selector
        self.column_selector = QComboBox()
        self.column_selector.addItems([col for col in df.columns])
        control_layout.addWidget(QLabel("Select Column to Analyze:"))
        control_layout.addWidget(self.column_selector)

        # Analyze button
        self.analyze_button = QPushButton("Analyze")
        self.analyze_button.clicked.connect(self.analyze_column)
        control_layout.addWidget(self.analyze_button)

        # Progress indicators
        self.progress_bar = QProgressBar()
        self.progress_bar.setRange(0, 100)
        self.progress_bar.hide()
        self.progress_label = QLabel()
        self.progress_label.hide()

        # Cancel button
        self.cancel_button = QPushButton("Cancel")
        self.cancel_button.clicked.connect(self.cancel_analysis)
        self.cancel_button.hide()

        control_layout.addWidget(self.progress_bar)
        control_layout.addWidget(self.progress_label)
        control_layout.addWidget(self.cancel_button)

        # Add control panel to main layout
        main_layout.addWidget(control_panel)

        # Add sorting control
        sort_panel = QWidget()
        sort_layout = QHBoxLayout(sort_panel)
        sort_layout.setContentsMargins(0, 0, 0, 0)

        # Add sort label
        sort_layout.addWidget(QLabel("Sort by:"))

        # Add sort buttons
        self.importance_sort_btn = QPushButton("Importance")
        self.importance_sort_btn.setCheckable(True)
        self.importance_sort_btn.setChecked(True)  # Default sort
        self.importance_sort_btn.clicked.connect(lambda: self.change_sort('importance_value'))

        self.correlation_sort_btn = QPushButton("Correlation")
        self.correlation_sort_btn.setCheckable(True)
        self.correlation_sort_btn.clicked.connect(lambda: self.change_sort('correlation'))

        sort_layout.addWidget(self.importance_sort_btn)
        sort_layout.addWidget(self.correlation_sort_btn)
        sort_layout.addStretch()

        # Add buttons to layout
        main_layout.addWidget(sort_panel)

        # Add a splitter for results area
        results_splitter = QSplitter(Qt.Orientation.Vertical)

        # Create table for showing importance values
        self.importance_table = QTableWidget()
        self.importance_table.setColumnCount(3)
        self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
        self.importance_table.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.Stretch)
        self.importance_table.cellDoubleClicked.connect(self.show_relationship_visualization)
        results_splitter.addWidget(self.importance_table)

        # Add instruction label for double-click functionality
        instruction_label = QLabel("Double-click on any feature to view detailed relationship visualization with the target column")
        instruction_label.setStyleSheet("color: #666; font-style: italic;")
        instruction_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
        main_layout.addWidget(instruction_label)

        # Create matplotlib canvas for the chart
        self.chart_view = MatplotlibCanvas(width=8, height=5, dpi=100)
        results_splitter.addWidget(self.chart_view)

        # Set initial splitter sizes
        results_splitter.setSizes([300, 300])

        # Add the splitter to the main layout
        main_layout.addWidget(results_splitter)

        # Set the central widget
        self.setCentralWidget(central_widget)

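    # analyze_column checks the per-window result_cache first, then the shared
    # class-level global_cache (keyed by get_cache_key(df, target_column),
    # defined earlier in this module), and only then starts a background
    # ExplainerThread; the worker itself consults the on-disk cache before
    # recomputing.
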
    def analyze_column(self):
        if self.df is None or self.column_selector.currentText() == "":
            return

        # Cancel any existing worker thread
        if self.worker_thread and self.worker_thread.isRunning():
            # Signal the thread to cancel
            self.worker_thread.cancel()

            try:
                # Disconnect all signals to avoid callbacks during termination
                self.worker_thread.progress.disconnect()
                self.worker_thread.result.disconnect()
                self.worker_thread.error.disconnect()
                self.worker_thread.finished.disconnect()
            except Exception:
                pass  # Already disconnected

            # Terminate thread properly
            self.worker_thread.terminate()
            self.worker_thread.wait(1000)  # Wait up to 1 second
            self.worker_thread = None  # Clear reference

        target_column = self.column_selector.currentText()

        # Check in-memory cache first (fastest)
        if target_column in self.result_cache:
            self.handle_results(self.result_cache[target_column])
            return

        # Check global application-wide cache second (still fast)
        global_key = get_cache_key(self.df, target_column)
        if global_key in ColumnProfilerApp.global_cache:
            self.result_cache[target_column] = ColumnProfilerApp.global_cache[global_key]
            self.handle_results(self.result_cache[target_column])
            return

        # Disk cache will be checked in the worker thread

        # Disable the analyze button while processing
        self.analyze_button.setEnabled(False)

        # Show progress indicators
        self.progress_bar.setValue(0)
        self.progress_bar.show()
        self.progress_label.setText("Starting analysis...")
        self.progress_label.show()
        self.cancel_button.show()

        # Create and start the worker thread
        self.worker_thread = ExplainerThread(self.df, target_column)
        self.worker_thread.progress.connect(self.update_progress)
        self.worker_thread.result.connect(self.cache_and_display_results)
        self.worker_thread.error.connect(self.handle_error)
        self.worker_thread.finished.connect(self.on_analysis_finished)
        self.worker_thread.start()

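    # The slots below (update_progress, cache_and_display_results,
    # handle_error, on_analysis_finished) are connected to signals emitted by
    # the worker thread; Qt delivers cross-thread signals via queued
    # connections, so these methods run on the GUI thread and may touch
    # widgets safely.
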
    def update_progress(self, value, message):
        self.progress_bar.setValue(value)
        self.progress_label.setText(message)

    def cache_and_display_results(self, importance_df):
        # Cache the results
        target_column = self.column_selector.currentText()
        self.result_cache[target_column] = importance_df

        # Also cache in the global application cache
        global_key = get_cache_key(self.df, target_column)
        ColumnProfilerApp.global_cache[global_key] = importance_df

        # Display the results
        self.handle_results(importance_df)

    def on_analysis_finished(self):
        """Handle cleanup when analysis is finished (either completed or cancelled)"""
        self.analyze_button.setEnabled(True)
        self.cancel_button.hide()

    def handle_results(self, importance_df):
        # Hide progress indicators
        self.progress_bar.hide()
        self.progress_label.hide()
        self.cancel_button.hide()

        # Update importance table to include correlation column
        self.importance_table.setColumnCount(3)
        self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
        self.importance_table.setRowCount(len(importance_df))

        # Using a timer for incremental updates
        self.importance_df = importance_df  # Store for incremental rendering
        self.current_row = 0
        self.render_timer = QTimer()
        self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
        self.render_timer.start(10)  # Update every 10ms

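    # Incremental rendering: instead of filling the whole table in one pass
    # (which would freeze the UI on large result sets), a QTimer fires every
    # 10 ms and render_next_batch paints batch_size rows per tick, calling
    # QApplication.processEvents() between batches so the window stays
    # responsive.
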
    def render_next_batch(self, batch_size):
        try:
            if self.current_row >= len(self.importance_df):
                # All rows rendered, now render the chart and stop the timer
                self.render_chart()
                self.render_timer.stop()
                return

            # Render a batch of rows
            end_row = min(self.current_row + batch_size, len(self.importance_df))
            for row in range(self.current_row, end_row):
                try:
                    # Check if row exists in dataframe to prevent index errors
                    if row < len(self.importance_df):
                        feature = self.importance_df.iloc[row]['feature']
                        importance_value = self.importance_df.iloc[row]['importance_value']

                        # Add correlation if available
                        correlation = self.importance_df.iloc[row].get('correlation', None)
                        if correlation is not None:
                            self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
                            self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
                            self.importance_table.setItem(row, 2, QTableWidgetItem(str(round(correlation, 4))))
                        else:
                            self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
                            self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
                    else:
                        # Handle out of range index
                        print(f"Warning: Row {row} is out of range (max: {len(self.importance_df)-1})")
                        self.importance_table.setItem(row, 0, QTableWidgetItem("Error"))
                        self.importance_table.setItem(row, 1, QTableWidgetItem("Out of range"))
                        self.importance_table.setItem(row, 2, QTableWidgetItem("N/A"))
                except (IndexError, KeyError) as e:
                    # Enhanced error reporting for index and key errors
                    import traceback
                    trace = traceback.format_exc()
                    error_msg = f"Error rendering row {row}: {e.__class__.__name__}: {e}\n{trace}"
                    print(error_msg)

                    # Handle missing data in the dataframe gracefully
                    self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
                    self.importance_table.setItem(row, 1, QTableWidgetItem(f"{str(e)[:20]}"))
                    self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
                except Exception as e:
                    # Catch any other exceptions
                    print(f"Unexpected error rendering row {row}: {e.__class__.__name__}: {e}")
                    self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
                    self.importance_table.setItem(row, 1, QTableWidgetItem("See console for details"))
                    self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))

            self.current_row = end_row
            QApplication.processEvents()  # Allow UI to update
        except Exception as e:
            # Catch any exceptions in the rendering loop itself
            import traceback
            trace = traceback.format_exc()
            error_msg = f"Error in render_next_batch: {e.__class__.__name__}: {e}\n{trace}"
            print(error_msg)

            # Try to stop the timer to prevent further errors
            try:
                if self.render_timer and self.render_timer.isActive():
                    self.render_timer.stop()
            except Exception:
                pass

            # Show error
            QMessageBox.critical(self, "Rendering Error",
                                 f"Error rendering results: {e.__class__.__name__}: {e}")

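    # render_chart draws the top 20 features as two horizontal bar panels on a
    # shared y-axis (a 3:2 gridspec): importance on the left in blues and
    # absolute correlation on the right in reds, with the value printed at the
    # end of each bar.
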
    def render_chart(self):
        # Create horizontal bar chart
        try:
            if self.importance_df is None or len(self.importance_df) == 0:
                # No data to render
                self.chart_view.axes.clear()
                self.chart_view.axes.text(0.5, 0.5, "No data available for chart",
                                          ha='center', va='center', fontsize=12, color='gray')
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
                return

            self.chart_view.axes.clear()

            # Get a sorted copy based on current sort key
            plot_df = self.importance_df.sort_values(by=self.current_sort, ascending=False).head(20).copy()

            # Verify we have data before proceeding
            if len(plot_df) == 0:
                self.chart_view.axes.text(0.5, 0.5, "No features found with importance values",
                                          ha='center', va='center', fontsize=12, color='gray')
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
                return

            # Check required columns exist
            required_columns = ['feature', 'importance_value']
            missing_columns = [col for col in required_columns if col not in plot_df.columns]
            if missing_columns:
                error_msg = f"Missing required columns: {', '.join(missing_columns)}"
                self.chart_view.axes.text(0.5, 0.5, error_msg,
                                          ha='center', va='center', fontsize=12, color='red')
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
                print(f"Chart rendering error: {error_msg}")
                return

            # Truncate long feature names for better display
            max_feature_length = 30
            plot_df['display_feature'] = plot_df['feature'].apply(
                lambda x: (str(x)[:max_feature_length] + '...') if len(str(x)) > max_feature_length else str(x)
            )

            # Reverse order for better display (highest at top)
            plot_df = plot_df.iloc[::-1].reset_index(drop=True)

            # Create a figure with two subplots side by side
            self.chart_view.figure.clear()
            gs = self.chart_view.figure.add_gridspec(1, 2, width_ratios=[3, 2])

            # First subplot for importance
            ax1 = self.chart_view.figure.add_subplot(gs[0, 0])

            # Create a colormap for better visualization
            cmap = plt.cm.Blues
            colors = cmap(np.linspace(0.4, 0.8, len(plot_df)))

            # Plot with custom colors
            bars = ax1.barh(
                plot_df['display_feature'],
                plot_df['importance_value'],
                color=colors,
                height=0.7,  # Thinner bars for more spacing
                alpha=0.8
            )

            # Add values at the end of bars
            for bar in bars:
                width = bar.get_width()
                ax1.text(
                    width * 1.05,
                    bar.get_y() + bar.get_height()/2,
                    f'{width:.2f}',
                    va='center',
                    fontsize=9,
                    fontweight='bold'
                )

            # Add grid for better readability
            ax1.grid(True, axis='x', linestyle='--', alpha=0.3)

            # Remove unnecessary spines
            for spine in ['top', 'right']:
                ax1.spines[spine].set_visible(False)

            # Make labels more readable
            ax1.tick_params(axis='y', labelsize=9)

            # Set title and labels
            ax1.set_title(f'Feature Importance for {self.column_selector.currentText()}')
            ax1.set_xlabel('Importance Value')

            # Add a note about the sorting order
            sort_label = "Sorted by: " + ("Importance" if self.current_sort == 'importance_value' else "Correlation")

            # Second subplot for correlation if available
            if 'correlation' in plot_df.columns:
                ax2 = self.chart_view.figure.add_subplot(gs[0, 1], sharey=ax1)

                # Create a colormap for correlation - use a different color
                cmap_corr = plt.cm.Reds
                colors_corr = cmap_corr(np.linspace(0.4, 0.8, len(plot_df)))

                # Plot correlation bars
                corr_bars = ax2.barh(
                    plot_df['display_feature'],
                    plot_df['correlation'],
                    color=colors_corr,
                    height=0.7,
                    alpha=0.8
                )

                # Add values at the end of correlation bars
                for bar in corr_bars:
                    width = bar.get_width()
                    ax2.text(
                        width * 1.05,
                        bar.get_y() + bar.get_height()/2,
                        f'{width:.2f}',
                        va='center',
                        fontsize=9,
                        fontweight='bold'
                    )

                # Add grid and styling
                ax2.grid(True, axis='x', linestyle='--', alpha=0.3)
                ax2.set_title('Absolute Correlation')
                ax2.set_xlabel('Correlation Value')

                # Hide y-axis labels since they're shared with the first plot
                ax2.set_yticklabels([])

                # Remove unnecessary spines
                for spine in ['top', 'right']:
                    ax2.spines[spine].set_visible(False)

            # Add a note about the current sort order
            self.chart_view.figure.text(0.5, 0.01, sort_label, ha='center', fontsize=9, style='italic')

            # Adjust figure size based on number of features
            feature_count = len(plot_df)
            self.chart_view.figure.set_figheight(max(5, min(4 + feature_count * 0.3, 12)))

            # Adjust layout and draw
            self.chart_view.figure.tight_layout(rect=[0, 0.03, 1, 0.97])  # Make room for sort label
            self.chart_view.draw()

        except IndexError as e:
            # Special handling for index errors with detailed information
            import traceback
            import inspect

            # Get stack trace information
            trace = traceback.format_exc()

            # Try to get line and context information
            try:
                frame = inspect.trace()[-1]
                frame_info = inspect.getframeinfo(frame[0])
                filename = frame_info.filename
                lineno = frame_info.lineno
                function = frame_info.function
                code_context = frame_info.code_context[0].strip() if frame_info.code_context else "Unknown code context"

                # Detailed error message
                detail_msg = f"IndexError at line {lineno} in {function}: {str(e)}\nCode: {code_context}"
                print(f"Chart rendering error: {detail_msg}\n{trace}")

                # Display error in chart
                self.chart_view.axes.clear()
                self.chart_view.axes.text(0.5, 0.5,
                                          f"Index Error in chart rendering:\n{str(e)}\nAt line {lineno}: {code_context}",
                                          ha='center', va='center', fontsize=12, color='red',
                                          wrap=True)
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
            except Exception as inner_e:
                # Fallback if the detailed error reporting fails
                print(f"Error getting detailed error info: {inner_e}")
                print(f"Original error: {e}\n{trace}")

                self.chart_view.axes.clear()
                self.chart_view.axes.text(0.5, 0.5, f"Index Error: {str(e)}",
                                          ha='center', va='center', fontsize=12, color='red')
                self.chart_view.axes.set_axis_off()
                self.chart_view.draw()
        except Exception as e:
            # Recover gracefully from any chart rendering errors with detailed information
            import traceback
            trace = traceback.format_exc()
            error_msg = f"Error rendering chart: {e.__class__.__name__}: {str(e)}"
            print(f"{error_msg}\n{trace}")

            self.chart_view.axes.clear()
            self.chart_view.axes.text(0.5, 0.5, error_msg,
                                      ha='center', va='center', fontsize=12, color='red',
                                      wrap=True)
            self.chart_view.axes.set_axis_off()
            self.chart_view.draw()

    def handle_error(self, error_message):
        """Handle errors during analysis"""
        # Hide progress indicators
        self.progress_bar.hide()
        self.progress_label.hide()
        self.cancel_button.hide()

        # Re-enable analyze button
        self.analyze_button.setEnabled(True)

        # Print error to console for debugging
        print(f"Error in column profiler: {error_message}")

        # Show error notification
        show_error_notification(f"Analysis Error: {error_message.split(chr(10))[0] if chr(10) in error_message else error_message}")

        # Show a message in the UI as well
        self.importance_table.setRowCount(1)
        self.importance_table.setColumnCount(3)
        self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
        self.importance_table.setItem(0, 0, QTableWidgetItem(f"Error: {error_message.split(chr(10))[0]}"))
        self.importance_table.setItem(0, 1, QTableWidgetItem(""))
        self.importance_table.setItem(0, 2, QTableWidgetItem(""))
        self.importance_table.resizeColumnsToContents()

        # Update the chart to show error
        self.chart_view.axes.clear()
        self.chart_view.axes.text(0.5, 0.5, f"Error calculating importance:\n{error_message.split(chr(10))[0]}",
                                  ha='center', va='center', fontsize=12, color='red',
                                  wrap=True)
        self.chart_view.axes.set_axis_off()
        self.chart_view.draw()

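    # closeEvent and cancel_analysis below reuse the same defensive teardown
    # seen in analyze_column: disconnect every worker signal so no slot fires
    # mid-shutdown, then terminate() and wait(1000) before dropping the
    # reference.
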
    def closeEvent(self, event):
        """Clean up when the window is closed"""
        # Stop any running timer
        if self.render_timer and self.render_timer.isActive():
            self.render_timer.stop()

        # Clean up any background threads
        if self.worker_thread and self.worker_thread.isRunning():
            # Disconnect all signals to avoid callbacks during termination
            try:
                self.worker_thread.progress.disconnect()
                self.worker_thread.result.disconnect()
                self.worker_thread.error.disconnect()
                self.worker_thread.finished.disconnect()
            except Exception:
                pass  # Already disconnected

            # Terminate thread properly
            self.worker_thread.terminate()
            self.worker_thread.wait(1000)  # Wait up to 1 second

        # Clear references to prevent thread issues
        self.worker_thread = None

        # Clean up memory
        self.result_cache.clear()

        # Accept the close event
        event.accept()

        # Suggest garbage collection
        gc.collect()

    def cancel_analysis(self):
        """Cancel the current analysis"""
        if self.worker_thread and self.worker_thread.isRunning():
            # Signal the thread to cancel first
            self.worker_thread.cancel()

            # Disconnect all signals to avoid callbacks during termination
            try:
                self.worker_thread.progress.disconnect()
                self.worker_thread.result.disconnect()
                self.worker_thread.error.disconnect()
                self.worker_thread.finished.disconnect()
            except Exception:
                pass  # Already disconnected

            # Terminate thread properly
            self.worker_thread.terminate()
            self.worker_thread.wait(1000)  # Wait up to 1 second

            # Clear reference
            self.worker_thread = None

        # Update UI
        self.progress_bar.hide()
        self.progress_label.setText("Analysis cancelled")
        self.progress_label.show()
        self.cancel_button.hide()
        self.analyze_button.setEnabled(True)

        # Hide the progress label after 2 seconds
        QTimer.singleShot(2000, self.progress_label.hide)

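    # show_relationship_visualization picks the chart from the feature/target
    # dtype combination: hexbin or scatter for numeric vs numeric, box or
    # violin plots for numeric vs categorical, bar plots for categorical vs
    # numeric, and a heatmap or grouped count plot when both are categorical.
    # High-cardinality columns (more than 10 unique values) are first collapsed
    # to their top categories plus an 'Other' bucket.
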
    def show_relationship_visualization(self, row, column):
        """Show visualization of relationship between selected feature and target column"""
        if self.importance_df is None or row < 0 or row >= len(self.importance_df):
            return

        # Get the feature name and target column
        try:
            feature = self.importance_df.iloc[row]['feature']
            target = self.column_selector.currentText()

            # Verify both columns exist in the dataframe
            if feature not in self.df.columns:
                QMessageBox.warning(self, "Column Not Found",
                                    f"Feature column '{feature}' not found in the dataframe")
                return

            if target not in self.df.columns:
                QMessageBox.warning(self, "Column Not Found",
                                    f"Target column '{target}' not found in the dataframe")
                return
        except Exception as e:
            QMessageBox.critical(self, "Error", f"Error getting column data: {str(e)}")
            return

        # Create a dialog to show the visualization
        dialog = QDialog(self)
        dialog.setWindowTitle(f"Relationship: {feature} vs {target}")
        dialog.resize(900, 700)

        # Create layout
        layout = QVBoxLayout(dialog)

        # Create canvas for the plot
        canvas = MatplotlibCanvas(width=8, height=6, dpi=100)
        layout.addWidget(canvas)

        # Determine the data types
        feature_is_numeric = pd.api.types.is_numeric_dtype(self.df[feature])
        target_is_numeric = pd.api.types.is_numeric_dtype(self.df[target])

        # Get unique counts to determine if we have high cardinality
        feature_unique_count = self.df[feature].nunique()
        target_unique_count = self.df[target].nunique()

        # Define high cardinality threshold
        high_cardinality_threshold = 10

        # Clear the figure
        canvas.axes.clear()

        # Create a working copy of the dataframe
        working_df = self.df.copy()

        # Prepare data for high cardinality columns
        if not feature_is_numeric and feature_unique_count > high_cardinality_threshold:
            # Get the top N categories by frequency
            top_categories = self.df[feature].value_counts().nlargest(high_cardinality_threshold).index.tolist()
            # Create "Other" category for remaining values
            working_df[feature] = working_df[feature].apply(lambda x: x if x in top_categories else 'Other')

        if not target_is_numeric and target_unique_count > high_cardinality_threshold:
            top_categories = self.df[target].value_counts().nlargest(high_cardinality_threshold).index.tolist()
            working_df[target] = working_df[target].apply(lambda x: x if x in top_categories else 'Other')

        # Create appropriate visualization based on data types and cardinality
        if feature_is_numeric and target_is_numeric:
            # Scatter plot for numeric vs numeric
            # Use hexbin for large datasets to avoid overplotting
            if len(working_df) > 100:
                canvas.axes.hexbin(
                    working_df[feature],
                    working_df[target],
                    gridsize=25,
                    cmap='Blues',
                    mincnt=1
                )
                canvas.axes.set_title(f"Hexbin Density Plot: {feature} vs {target}")
                canvas.axes.set_xlabel(feature)
                canvas.axes.set_ylabel(target)
                # Add a colorbar
                cbar = canvas.figure.colorbar(canvas.axes.collections[0], ax=canvas.axes)
                cbar.set_label('Count')
            else:
                # For smaller datasets, use a scatter plot with transparency
                sns.scatterplot(
                    x=feature,
                    y=target,
                    data=working_df,
                    ax=canvas.axes,
                    alpha=0.6
                )
                # Add regression line
                sns.regplot(
                    x=feature,
                    y=target,
                    data=working_df,
                    ax=canvas.axes,
                    scatter=False,
                    line_kws={"color": "red"}
                )
                canvas.axes.set_title(f"Scatter Plot: {feature} vs {target}")

        elif feature_is_numeric and not target_is_numeric:
            # Box plot for numeric vs categorical
            if target_unique_count <= high_cardinality_threshold * 2:
                # Standard boxplot for reasonable number of categories
                order = working_df[target].value_counts().nlargest(high_cardinality_threshold * 2).index

                # Calculate counts for each category
                category_counts = working_df[target].value_counts()

                sns.boxplot(
                    x=target,
                    y=feature,
                    data=working_df,
                    ax=canvas.axes,
                    order=order
                )
                canvas.axes.set_title(f"Box Plot: {feature} by {target}")

                # Add count annotations below each box
                for i, category in enumerate(order):
                    if category in category_counts:
                        count = category_counts[category]
                        canvas.axes.text(
                            i,
                            canvas.axes.get_ylim()[0] - (canvas.axes.get_ylim()[1] - canvas.axes.get_ylim()[0]) * 0.05,
                            f'n={count}',
                            ha='center',
                            va='top',
                            fontsize=8,
                            fontweight='bold'
                        )

                # Rotate x-axis labels for better readability
                canvas.axes.set_xticklabels(
                    canvas.axes.get_xticklabels(),
                    rotation=45,
                    ha='right'
                )
            else:
                # For very high cardinality, use a violin plot with limited categories
                order = working_df[target].value_counts().nlargest(high_cardinality_threshold).index
                working_df_filtered = working_df[working_df[target].isin(order)]

                # Calculate counts for filtered categories
                category_counts = working_df_filtered[target].value_counts()

                sns.violinplot(
                    x=target,
                    y=feature,
                    data=working_df_filtered,
                    ax=canvas.axes,
                    inner='quartile',
                    cut=0
                )
                canvas.axes.set_title(f"Violin Plot: {feature} by Top {len(order)} {target} Categories")

                # Add count annotations below each violin
                for i, category in enumerate(order):
                    if category in category_counts:
                        count = category_counts[category]
                        canvas.axes.text(
                            i,
                            canvas.axes.get_ylim()[0] - (canvas.axes.get_ylim()[1] - canvas.axes.get_ylim()[0]) * 0.05,
                            f'n={count}',
                            ha='center',
                            va='top',
                            fontsize=8,
                            fontweight='bold'
                        )

                canvas.axes.set_xticklabels(
                    canvas.axes.get_xticklabels(),
                    rotation=45,
                    ha='right'
                )

        elif not feature_is_numeric and target_is_numeric:
            # Bar plot for categorical vs numeric
            if feature_unique_count <= high_cardinality_threshold * 2:
                # Use standard barplot for reasonable number of categories
                order = working_df[feature].value_counts().nlargest(high_cardinality_threshold * 2).index

                # Calculate counts for each category for annotations
                category_counts = working_df[feature].value_counts()

                sns.barplot(
                    x=feature,
                    y=target,
                    data=working_df,
                    ax=canvas.axes,
                    order=order,
                    estimator=np.mean,
                    errorbar=('ci', 95),
                    capsize=0.2
                )
                canvas.axes.set_title(f"Bar Plot: Average {target} by {feature}")

                # Add value labels and counts on top of bars
                for i, p in enumerate(canvas.axes.patches):
                    # Get the category name for this bar
                    if i < len(order):
                        category = order[i]
                        count = category_counts[category]

                        # Add mean value and count
                        canvas.axes.annotate(
                            f'{p.get_height():.1f}\n(n={count})',
                            (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center',
                            va='bottom',
                            fontsize=8,
                            rotation=0
                        )

                # Rotate x-axis labels if needed
                if feature_unique_count > 5:
                    canvas.axes.set_xticklabels(
                        canvas.axes.get_xticklabels(),
                        rotation=45,
                        ha='right'
                    )
            else:
                # For high cardinality, use a horizontal bar plot with top N categories
                top_n = 15  # Show top 15 categories
                # Calculate mean of target for each feature category
                grouped = working_df.groupby(feature)[target].agg(['mean', 'count', 'std']).reset_index()
                # Sort by mean and take top categories
                top_groups = grouped.nlargest(top_n, 'mean')

                # Sort by mean value for better visualization
                sns.barplot(
                    y=feature,
                    x='mean',
                    data=top_groups,
                    ax=canvas.axes,
                    orient='h'
                )
                canvas.axes.set_title(f"Top {top_n} Categories by Average {target}")
                canvas.axes.set_xlabel(f"Average {target}")

                # Add count annotations
                for i, row in enumerate(top_groups.itertuples()):
                    canvas.axes.text(
                        row.mean + 0.1,
                        i,
                        f'n={row.count}',
                        va='center',
                        fontsize=8
                    )

        else:
            # Both feature and target are categorical
            if feature_unique_count <= high_cardinality_threshold and target_unique_count <= high_cardinality_threshold:
                # Heatmap for categorical vs categorical with manageable cardinality
                crosstab = pd.crosstab(
                    working_df[feature],
                    working_df[target],
                    normalize='index'
                )

                # Create heatmap with improved readability
                sns.heatmap(
                    crosstab,
                    annot=True,
                    cmap="YlGnBu",
                    ax=canvas.axes,
                    fmt='.2f',
                    linewidths=0.5,
                    annot_kws={"size": 9 if crosstab.size < 30 else 7}
                )
                canvas.axes.set_title(f"Heatmap: {feature} vs {target} (proportions)")
            else:
                # For high cardinality in both, show a count plot of top categories
                feature_top = working_df[feature].value_counts().nlargest(8).index
                target_top = working_df[target].value_counts().nlargest(5).index

                # Filter data to only include top categories
                filtered_df = working_df[
                    working_df[feature].isin(feature_top) &
                    working_df[target].isin(target_top)
                ]

                # Create a grouped count plot
                ax_plot = sns.countplot(
                    x=feature,
                    hue=target,
                    data=filtered_df,
                    ax=canvas.axes
                )
                canvas.axes.set_title(f"Count Plot: Top {len(feature_top)} {feature} by Top {len(target_top)} {target}")

                # Add count labels on top of bars
                for p in canvas.axes.patches:
                    if p.get_height() > 0:  # Only add labels for non-zero bars
                        canvas.axes.annotate(
                            f'{int(p.get_height())}',
                            (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center',
                            va='bottom',
                            fontsize=8,
                            rotation=0
                        )

                # Rotate x-axis labels
                canvas.axes.set_xticklabels(
                    canvas.axes.get_xticklabels(),
                    rotation=45,
                    ha='right'
                )

                # Move legend to a better position
                canvas.axes.legend(title=target, bbox_to_anchor=(1.05, 1), loc='upper left')

        # Add informational text about data reduction if applicable
        if (not feature_is_numeric and feature_unique_count > high_cardinality_threshold) or \
           (not target_is_numeric and target_unique_count > high_cardinality_threshold):
            canvas.figure.text(
                0.5, 0.01,
                f"Note: Visualization simplified to show top categories only. Original data has {feature_unique_count} unique {feature} values and {target_unique_count} unique {target} values.",
                ha='center',
                fontsize=8,
                style='italic'
            )

        # Adjust layout and draw
        canvas.figure.tight_layout()
        canvas.draw()

        # Add a close button
        close_button = QPushButton("Close")
        close_button.clicked.connect(dialog.accept)
        layout.addWidget(close_button)

        # Show the dialog
        dialog.exec()

    def change_sort(self, sort_key):
        """Change the sort order of the results"""
        if self.importance_df is None:
            return

        # Update button states
        if sort_key == 'importance_value':
            self.importance_sort_btn.setChecked(True)
            self.correlation_sort_btn.setChecked(False)
        else:
            self.importance_sort_btn.setChecked(False)
            self.correlation_sort_btn.setChecked(True)

        # Store the current sort key
        self.current_sort = sort_key

        # Re-sort the dataframe
        self.importance_df = self.importance_df.sort_values(by=sort_key, ascending=False)

        # Reset rendering of the table
        self.importance_table.clearContents()
        self.importance_table.setRowCount(len(self.importance_df))
        self.current_row = 0

        # Start incremental rendering with the new sort order
        if self.render_timer and self.render_timer.isActive():
            self.render_timer.stop()
        self.render_timer = QTimer()
        self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
        self.render_timer.start(10)  # Update every 10ms

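# MatplotlibCanvas below is the usual Qt-embedding recipe: FigureCanvasQTAgg
# is itself a QWidget, so building a Figure, taking a single Axes from it, and
# handing the figure to the canvas constructor yields a widget that can be
# placed in any layout or splitter.
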
# Custom matplotlib canvas for embedding in Qt
class MatplotlibCanvas(FigureCanvasQTAgg):
    def __init__(self, width=5, height=4, dpi=100):
        self.figure = Figure(figsize=(width, height), dpi=dpi)
        self.axes = self.figure.add_subplot(111)
        super().__init__(self.figure)

def visualize_profile(df: pd.DataFrame, column: str = None) -> "ColumnProfilerApp | None":
    """
    Launch a PyQt6 UI for visualizing column importance.

    Args:
        df: DataFrame containing the data
        column: Optional target column to analyze immediately

    Returns:
        The ColumnProfilerApp window when called from within an existing Qt
        application, otherwise None (standalone mode enters the event loop).
    """
    try:
        # Verify df is a valid DataFrame
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame")

        # Verify df has data
        if len(df) == 0:
            raise ValueError("DataFrame is empty, cannot analyze")

        # Verify columns exist
        if column is not None and column not in df.columns:
            raise ValueError(f"Column '{column}' not found in the DataFrame")

        # Check if dataset is too small for meaningful analysis
        row_count = len(df)
        if row_count <= 5:
            print(f"WARNING: Dataset only has {row_count} rows. Feature importance analysis requires more data for meaningful results.")
            if QApplication.instance():
                QMessageBox.warning(None, "Insufficient Data",
                                    f"The dataset only contains {row_count} rows. Feature importance analysis requires more data for meaningful results.")

        # For large datasets, sample up to 500 rows for better statistical significance
        elif row_count > 500:
            print(f"Sampling 500 rows from dataset ({row_count:,} total rows)")
            df = df.sample(n=500, random_state=42)

        # Check if we're already in a Qt application
        existing_app = QApplication.instance()
        standalone_mode = existing_app is None

        # Create app if needed
        if standalone_mode:
            app = QApplication(sys.argv)
        else:
            app = existing_app

        app.setStyle('Fusion')  # Modern look

        # Set modern dark theme (only in standalone mode to avoid affecting parent app)
        if standalone_mode:
            palette = QPalette()
            palette.setColor(QPalette.ColorRole.Window, QColor(53, 53, 53))
            palette.setColor(QPalette.ColorRole.WindowText, Qt.GlobalColor.white)
            palette.setColor(QPalette.ColorRole.Base, QColor(25, 25, 25))
            palette.setColor(QPalette.ColorRole.AlternateBase, QColor(53, 53, 53))
            palette.setColor(QPalette.ColorRole.ToolTipBase, Qt.GlobalColor.white)
            palette.setColor(QPalette.ColorRole.ToolTipText, Qt.GlobalColor.white)
            palette.setColor(QPalette.ColorRole.Text, Qt.GlobalColor.white)
            palette.setColor(QPalette.ColorRole.Button, QColor(53, 53, 53))
            palette.setColor(QPalette.ColorRole.ButtonText, Qt.GlobalColor.white)
            palette.setColor(QPalette.ColorRole.BrightText, Qt.GlobalColor.red)
            palette.setColor(QPalette.ColorRole.Link, QColor(42, 130, 218))
            palette.setColor(QPalette.ColorRole.Highlight, QColor(42, 130, 218))
            palette.setColor(QPalette.ColorRole.HighlightedText, Qt.GlobalColor.black)
            app.setPalette(palette)

        window = ColumnProfilerApp(df)
        window.setAttribute(Qt.WidgetAttribute.WA_DeleteOnClose)  # Ensure cleanup on close
        window.show()

        # Add tooltip to explain double-click functionality
        window.importance_table.setToolTip("Double-click on a feature to visualize its relationship with the target column")

        # If a specific column is provided, analyze it immediately
        if column is not None and column in df.columns:
            window.column_selector.setCurrentText(column)

            # Wrap the analysis in a try/except to prevent crashes
            def safe_analyze():
                try:
                    window.analyze_column()
                except Exception as e:
                    print(f"Error during column analysis: {e}")
                    import traceback
                    traceback.print_exc()
                    QMessageBox.critical(window, "Analysis Error",
                                         f"Error analyzing column:\n\n{str(e)}")

            QTimer.singleShot(100, safe_analyze)  # Use timer to avoid immediate thread issues

            # Set a watchdog timer to cancel analysis if it takes too long (30 seconds)
            def check_progress():
                if window.worker_thread and window.worker_thread.isRunning():
                    # If still running after 30 seconds, cancel the operation
                    QMessageBox.warning(window, "Analysis Timeout",
                                        "The analysis is taking longer than expected. It will be canceled to prevent hanging.")
                    try:
                        window.cancel_analysis()
                    except Exception as e:
                        print(f"Error canceling analysis: {e}")

            QTimer.singleShot(30000, check_progress)  # 30 seconds timeout

        # Only enter event loop in standalone mode
        if standalone_mode:
            sys.exit(app.exec())
        else:
            # Return the window for parent app to track
            return window
    except Exception as e:
        # Handle any exceptions to prevent crashes
        print(f"Error in visualize_profile: {e}")
        import traceback
        traceback.print_exc()

        # Show error to user
        if QApplication.instance():
            show_error_notification(f"Profile Error: Error creating column profile - {str(e)}")
        return None

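# A minimal usage sketch (assuming only that this module is importable as
# sqlshell.utils.profile_column; the data file name is hypothetical):
#
#   import pandas as pd
#   from sqlshell.utils.profile_column import visualize_profile
#
#   df = pd.read_csv("employees.csv")        # hypothetical input data
#   visualize_profile(df, column="Salary")   # standalone: blocks in app.exec()
#
# Inside an already-running Qt application the call returns the
# ColumnProfilerApp window instead of entering an event loop, so the caller
# should keep a reference to it.
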
def test_profile():
|
|
2528
|
+
"""
|
|
2529
|
+
Test the profile and visualization functions with sample data.
|
|
2530
|
+
"""
|
|
2531
|
+
# Create a sample DataFrame with 40 columns
|
|
2532
|
+
np.random.seed(42)
|
|
2533
|
+
n = 1000
|
|
2534
|
+
|
|
2535
|
+
# Generate core sample data with known relationships
|
|
2536
|
+
age = np.random.normal(35, 10, n).astype(int)
|
|
2537
|
+
experience = age - np.random.randint(18, 25, n) # experience correlates with age
|
|
2538
|
+
experience = np.maximum(0, experience) # no negative experience
|
|
2539
|
+
|
|
2540
|
+
salary = 30000 + 2000 * experience + np.random.normal(0, 10000, n)
|
|
2541
|
+
|
|
2542
|
+
departments = np.random.choice(['Engineering', 'Marketing', 'Sales', 'HR', 'Finance'], n)
|
|
2543
|
+
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n,
|
|
2544
|
+
p=[0.2, 0.5, 0.2, 0.1])
|
|
2545
|
+
|
|
2546
|
+
performance = np.random.normal(0, 1, n)
|
|
2547
|
+
performance += 0.5 * (education == 'Master') + 0.8 * (education == 'PhD') # education affects performance
|
|
2548
|
+
performance += 0.01 * experience # experience slightly affects performance
|
|
2549
|
+
performance = (performance - performance.min()) / (performance.max() - performance.min()) * 5 # scale to 0-5
|
|
2550
|
+
|
|
2551
|
+
# Create the base DataFrame
|
|
2552
|
+
data = {
|
|
2553
|
+
'Age': age,
|
|
2554
|
+
'Experience': experience,
|
|
2555
|
+
'Department': departments,
|
|
2556
|
+
'Education': education,
|
|
2557
|
+
'Performance': performance,
|
|
2558
|
+
'Salary': salary
|
|
2559
|
+
}
|
|
2560
|
+
|
|
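The generator plants known structure: `Salary` is linear in `Experience` (slope 2000 plus noise) and `Performance` shifts upward with `Education`, so the profiler's feature-importance ranking has a known right answer on this data. A quick check that the planted link is recoverable, sketched with freshly drawn data of the same shape:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(42)
    n = 1000
    experience = rng.integers(0, 20, n)
    salary = 30000 + 2000 * experience + rng.normal(0, 10000, n)

    check = pd.DataFrame({'Experience': experience, 'Salary': salary})
    # The injected linear link shows up as a strong positive correlation
    print(check.corr().loc['Experience', 'Salary'])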
+    # Generate additional numeric columns
+    for i in range(1, 15):
+        # Create some columns with relationship to salary
+        if i <= 5:
+            data[f'Metric_{i}'] = salary * (0.01 * i) + np.random.normal(0, 5000, n)
+        # Create columns with relationship to age
+        elif i <= 10:
+            data[f'Metric_{i}'] = age * (i - 5) + np.random.normal(0, 10, n)
+        # Create random columns
+        else:
+            data[f'Metric_{i}'] = np.random.normal(100, 50, n)
+
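Of these, `Metric_1` through `Metric_5` carry only a faint linear trace of `Salary` (the `0.01 * i` scaling is small next to the 5000-sd noise), `Metric_6` through `Metric_10` track `Age` almost deterministically (noise sd of just 10), and the rest are pure noise, giving the importance ranking an easy/medium/hard gradient. A sketch of how the target correlation grows with `i` for the salary-linked metrics (the salary spread here is an assumed stand-in):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    n = 1000
    salary = rng.normal(60000, 20000, n)  # assumed spread, for illustration
    frame = pd.DataFrame(
        {f'Metric_{i}': salary * (0.01 * i) + rng.normal(0, 5000, n) for i in range(1, 6)}
    )
    frame['Salary'] = salary
    # Correlation with the target rises with i as the signal term grows
    print(frame.corr()['Salary'].drop('Salary').round(2))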
+    # Generate additional categorical columns
+    categories = [
+        ['A', 'B', 'C', 'D'],
+        ['Low', 'Medium', 'High'],
+        ['North', 'South', 'East', 'West'],
+        ['Type1', 'Type2', 'Type3'],
+        ['Yes', 'No', 'Maybe'],
+        ['Red', 'Green', 'Blue', 'Yellow'],
+        ['Small', 'Medium', 'Large']
+    ]
+
+    for i in range(1, 10):
+        # Pick a category list
+        cat_list = categories[i % len(categories)]
+        # Generate random categorical column
+        data[f'Category_{i}'] = np.random.choice(cat_list, n)
+
+    # Generate date and time related columns
+    base_date = np.datetime64('2020-01-01')
+
+    # Instead of datetime objects, convert to days since base date (numeric values)
+    hire_days = np.array([365 * (35 - a) + np.random.randint(0, 30) for a in age])
+    data['Hire_Days_Ago'] = hire_days
+
+    promotion_days = np.array([np.random.randint(0, 1000) for _ in range(n)])
+    data['Last_Promotion_Days_Ago'] = promotion_days
+
+    review_days = np.array([np.random.randint(1000, 1200) for _ in range(n)])
+    data['Next_Review_In_Days'] = review_days
+
+    # For reference, also store the actual dates as strings instead of datetime64
+    data['Hire_Date_Str'] = [str(base_date + np.timedelta64(int(days), 'D')) for days in hire_days]
+    data['Last_Promotion_Date_Str'] = [str(base_date + np.timedelta64(int(days), 'D')) for days in promotion_days]
+    data['Review_Date_Str'] = [str(base_date + np.timedelta64(int(days), 'D')) for days in review_days]
+
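Storing day offsets instead of `datetime64` values keeps these columns numeric, and therefore digestible by the RandomForest importance model; the `_Str` columns exist only for reference. Note that `365 * (35 - a)` goes negative for ages above 35, so some offsets (and the derived date strings) land on the far side of the base date. A hypothetical round trip from offsets back to timestamps, mirroring the `base_date + days` arithmetic used above:

    import pandas as pd

    base = pd.Timestamp('2020-01-01')
    hire_days = pd.Series([120, -340, 5])  # negative offsets occur for ages above 35
    print(base + pd.to_timedelta(hire_days, unit='D'))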
+    # Binary columns
+    data['IsManager'] = np.random.choice([0, 1], n, p=[0.8, 0.2])
+    data['RemoteWorker'] = np.random.choice([0, 1], n)
+    data['HasHealthInsurance'] = np.random.choice([0, 1], n, p=[0.1, 0.9])
+    data['HasRetirementPlan'] = np.random.choice([0, 1], n, p=[0.15, 0.85])
+
+    # Columns with missing values
+    data['OptionalMetric_1'] = np.random.normal(50, 10, n)
+    data['OptionalMetric_1'][np.random.choice([True, False], n, p=[0.2, 0.8])] = np.nan
+
+    data['OptionalMetric_2'] = np.random.normal(100, 20, n)
+    data['OptionalMetric_2'][np.random.choice([True, False], n, p=[0.3, 0.7])] = np.nan
+
+    data['OptionalCategory'] = np.random.choice(['Option1', 'Option2', 'Option3', None], n, p=[0.3, 0.3, 0.3, 0.1])
+
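The boolean-mask assignments inject roughly 20% and 30% NaNs into the optional metrics; this works in place because the `data` values are still NumPy arrays at this point. A sketch of auditing the realized missingness once a DataFrame exists; the column name and the ~0.2 rate mirror `OptionalMetric_1`:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    values = rng.normal(50, 10, 1000)
    values[rng.random(1000) < 0.2] = np.nan  # ~20% missing, as above

    df = pd.DataFrame({'OptionalMetric_1': values})
    print(df.isna().mean())  # realized missing fraction per column, close to 0.20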
+    # High cardinality column (like an ID)
+    data['ID'] = [f"ID_{i:06d}" for i in range(n)]
+
+    # Create the DataFrame with 43 columns
+    df = pd.DataFrame(data)
+
+    print(f"Created sample DataFrame with {len(df.columns)} columns and {len(df)} rows")
+    print("Columns:", ', '.join(df.columns))
+    print("Launching PyQt6 Column Profiler application...")
+    visualize_profile(df, 'Salary')  # Start with Salary analysis
+
+if __name__ == "__main__":
+    test_profile()
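Because `test_profile()` sits behind the `__main__` guard, running the module directly launches the demo with the synthetic employee data. The profiler can also be driven from other code; the import path below is inferred from this wheel's file layout (`sqlshell/utils/profile_column.py`) and should be treated as an assumption rather than documented API:

    import pandas as pd
    from sqlshell.utils.profile_column import visualize_profile  # assumed path

    df = pd.read_csv("employees.csv")         # any DataFrame will do
    window = visualize_profile(df, "Salary")  # embedded: returns the window to keep alive

Per the `standalone_mode` branch above, the window is only returned when the profiler is embedded in a host Qt application; invoked standalone, `visualize_profile` takes the `sys.exit(app.exec())` path and owns the process until the window closes.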