sqlshell-0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. sqlshell/__init__.py +84 -0
  2. sqlshell/__main__.py +4926 -0
  3. sqlshell/ai_autocomplete.py +392 -0
  4. sqlshell/ai_settings_dialog.py +337 -0
  5. sqlshell/context_suggester.py +768 -0
  6. sqlshell/create_test_data.py +152 -0
  7. sqlshell/data/create_test_data.py +137 -0
  8. sqlshell/db/__init__.py +6 -0
  9. sqlshell/db/database_manager.py +1318 -0
  10. sqlshell/db/export_manager.py +188 -0
  11. sqlshell/editor.py +1166 -0
  12. sqlshell/editor_integration.py +127 -0
  13. sqlshell/execution_handler.py +421 -0
  14. sqlshell/menus.py +262 -0
  15. sqlshell/notification_manager.py +370 -0
  16. sqlshell/query_tab.py +904 -0
  17. sqlshell/resources/__init__.py +1 -0
  18. sqlshell/resources/icon.png +0 -0
  19. sqlshell/resources/logo_large.png +0 -0
  20. sqlshell/resources/logo_medium.png +0 -0
  21. sqlshell/resources/logo_small.png +0 -0
  22. sqlshell/resources/splash_screen.gif +0 -0
  23. sqlshell/space_invaders.py +501 -0
  24. sqlshell/splash_screen.py +405 -0
  25. sqlshell/sqlshell/__init__.py +5 -0
  26. sqlshell/sqlshell/create_test_data.py +118 -0
  27. sqlshell/sqlshell/create_test_databases.py +96 -0
  28. sqlshell/sqlshell_demo.png +0 -0
  29. sqlshell/styles.py +257 -0
  30. sqlshell/suggester_integration.py +330 -0
  31. sqlshell/syntax_highlighter.py +124 -0
  32. sqlshell/table_list.py +996 -0
  33. sqlshell/ui/__init__.py +6 -0
  34. sqlshell/ui/bar_chart_delegate.py +49 -0
  35. sqlshell/ui/filter_header.py +469 -0
  36. sqlshell/utils/__init__.py +16 -0
  37. sqlshell/utils/profile_cn2.py +1661 -0
  38. sqlshell/utils/profile_column.py +2635 -0
  39. sqlshell/utils/profile_distributions.py +616 -0
  40. sqlshell/utils/profile_entropy.py +347 -0
  41. sqlshell/utils/profile_foreign_keys.py +779 -0
  42. sqlshell/utils/profile_keys.py +2834 -0
  43. sqlshell/utils/profile_ohe.py +934 -0
  44. sqlshell/utils/profile_ohe_advanced.py +754 -0
  45. sqlshell/utils/profile_ohe_comparison.py +237 -0
  46. sqlshell/utils/profile_prediction.py +926 -0
  47. sqlshell/utils/profile_similarity.py +876 -0
  48. sqlshell/utils/search_in_df.py +90 -0
  49. sqlshell/widgets.py +400 -0
  50. sqlshell-0.4.4.dist-info/METADATA +441 -0
  51. sqlshell-0.4.4.dist-info/RECORD +54 -0
  52. sqlshell-0.4.4.dist-info/WHEEL +5 -0
  53. sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
  54. sqlshell-0.4.4.dist-info/top_level.txt +1 -0
sqlshell/utils/profile_column.py
@@ -0,0 +1,2635 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import LabelEncoder
+ from sklearn.ensemble import RandomForestRegressor
+ import sys
+ import time
+ import hashlib
+ import os
+ import pickle
+ import gc
+ from pathlib import Path
+ from PyQt6.QtWidgets import (QApplication, QMainWindow, QTableWidget, QTableWidgetItem,
+                              QVBoxLayout, QHBoxLayout, QLabel, QWidget, QComboBox,
+                              QPushButton, QSplitter, QHeaderView, QFrame, QProgressBar,
+                              QMessageBox, QDialog)
+
+ # Import the notification manager (with a fallback for cases where it's not available)
+ try:
+     from sqlshell.notification_manager import show_error_notification, show_warning_notification
+ except ImportError:
+     # Fallback functions for when the notification manager is not available
+     def show_error_notification(message):
+         print(f"Error: {message}")
+     def show_warning_notification(message):
+         print(f"Warning: {message}")
+ from PyQt6.QtCore import Qt, QAbstractTableModel, QModelIndex, QThread, pyqtSignal, QTimer
+ from PyQt6.QtGui import QPalette, QColor, QBrush, QPainter, QPen
+ from scipy.stats import chi2_contingency, pearsonr
+
+ # Import matplotlib at the top level
+ import matplotlib
+ try:
+     matplotlib.use('QtAgg')
+ except ImportError:
+     matplotlib.use('Agg')  # Fall back to a headless backend for CI/testing
+ from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg
+ from matplotlib.figure import Figure
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+
+ # Create a cache directory in the user's home directory
+ CACHE_DIR = os.path.join(Path.home(), '.sqlshell_cache')
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
+ def get_cache_key(df, column):
+     """Generate a cache key based on dataframe content and column"""
+     # Get DataFrame characteristics that make it unique
+     columns = ','.join(df.columns)
+     shapes = f"{df.shape[0]}x{df.shape[1]}"
+     col_types = ','.join(str(dtype) for dtype in df.dtypes)
+
+     # Sample a few rows as a fingerprint instead of hashing the entire dataframe
+     sample_rows = min(50, len(df))
+     values_sample = df.head(sample_rows).values.tobytes()
+
+     # Create the hash
+     hash_input = f"{columns}|{shapes}|{col_types}|{column}|{len(df)}"
+     m = hashlib.md5()
+     m.update(hash_input.encode())
+     m.update(values_sample)  # Add the sampled data to the hash
+     return m.hexdigest()
+
+ def cache_results(df, column, results):
+     """Save results to the disk cache"""
+     try:
+         cache_key = get_cache_key(df, column)
+         cache_file = os.path.join(CACHE_DIR, f"{cache_key}.pkl")
+         with open(cache_file, 'wb') as f:
+             pickle.dump(results, f)
+         return True
+     except Exception as e:
+         print(f"Cache write error: {e}")
+         return False
+
+ def get_cached_results(df, column):
+     """Try to get results from the disk cache"""
+     try:
+         cache_key = get_cache_key(df, column)
+         cache_file = os.path.join(CACHE_DIR, f"{cache_key}.pkl")
+         if os.path.exists(cache_file):
+             # Only use the cache file if it is recent (less than 1 day old)
+             mod_time = os.path.getmtime(cache_file)
+             if time.time() - mod_time < 86400:  # 24 hours in seconds
+                 with open(cache_file, 'rb') as f:
+                     return pickle.load(f)
+         return None
+     except Exception as e:
+         print(f"Cache read error: {e}")
+         return None
+
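A quick round-trip of these cache helpers (an illustrative sketch, not code from the package; the toy frame and values are made up):

    import pandas as pd
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})
    fake_results = pd.DataFrame({'feature': ['a'], 'importance_value': [1.0]})
    cache_results(df, 'b', fake_results)            # writes ~/.sqlshell_cache/<md5>.pkl
    assert get_cached_results(df, 'b') is not None  # same fingerprint within 24h -> hit

Note that the key hashes only the first 50 rows plus the shape, dtypes, and column names, so two frames of identical shape that differ only after row 50 would collide.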
+ # Worker thread for background processing
+ class ExplainerThread(QThread):
+     # Signals for progress updates and results
+     progress = pyqtSignal(int, str)
+     result = pyqtSignal(object)
+     error = pyqtSignal(str)
+
+     def __init__(self, df, column):
+         super().__init__()
+         # Make a copy of the dataframe to avoid reference issues
+         self.df = df.copy()
+         self.column = column
+         self._is_canceled = False
+
+     def cancel(self):
+         """Mark the thread as canceled"""
+         self._is_canceled = True
+
+     def calculate_correlation(self, x, y):
+         """Calculate the correlation between two variables, handling different data types.
+         Returns an absolute correlation value between 0 and 1."""
+         try:
+             # Handle missing values
+             mask = ~(pd.isna(x) | pd.isna(y))
+             x_clean = x[mask]
+             y_clean = y[mask]
+
+             # If there are too few data points, return the default
+             if len(x_clean) < 5:
+                 return 0.0
+
+             # Check the data types
+             x_is_numeric = pd.api.types.is_numeric_dtype(x_clean)
+             y_is_numeric = pd.api.types.is_numeric_dtype(y_clean)
+
+             # Case 1: both numeric - use Pearson correlation
+             if x_is_numeric and y_is_numeric:
+                 corr, _ = pearsonr(x_clean, y_clean)
+                 return abs(corr)
+
+             # Case 2: categorical vs categorical - use Cramer's V
+             elif not x_is_numeric and not y_is_numeric:
+                 # Convert to categorical codes
+                 x_cat = pd.Categorical(x_clean).codes
+                 y_cat = pd.Categorical(y_clean).codes
+
+                 # Create the contingency table
+                 contingency = pd.crosstab(x_cat, y_cat)
+
+                 # Calculate Cramer's V
+                 chi2, _, _, _ = chi2_contingency(contingency)
+                 n = contingency.sum().sum()
+                 phi2 = chi2 / n
+
+                 # Get the table dimensions
+                 r, k = contingency.shape
+
+                 # Calculate Cramer's V with a correction for the dimensions
+                 cramers_v = np.sqrt(phi2 / min(k - 1, r - 1)) if min(k - 1, r - 1) > 0 else 0.0
+                 return min(cramers_v, 1.0)  # Cap at 1.0
+
+             # Case 3: mixed types - encode the categorical side and correlate
+             else:
+                 if x_is_numeric and not y_is_numeric:
+                     # Convert categorical y to codes
+                     y_encoded = pd.Categorical(y_clean).codes
+
+                     # Calculate the correlation between x and the encoded y
+                     # (point-biserial correlation, a special case of Pearson)
+                     corr, _ = pearsonr(x_clean, y_encoded)
+                     return abs(corr)
+                 else:  # y is numeric, x is categorical
+                     # Convert categorical x to codes
+                     x_encoded = pd.Categorical(x_clean).codes
+
+                     # Calculate the correlation
+                     corr, _ = pearsonr(x_encoded, y_clean)
+                     return abs(corr)
+
+         except Exception as e:
+             print(f"Error calculating correlation: {e}")
+             return 0.0  # Return zero if the correlation calculation fails
+
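Given that dispatch, a small smoke test of the three branches might look like this (sketch only; assumes a Qt session in which constructing the thread object is safe, and the toy series are made up):

    t = ExplainerThread(pd.DataFrame({'x': [0]}), 'x')  # dummy frame; only the method is exercised
    num = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    cat = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'])
    t.calculate_correlation(num, num * 2)  # both numeric: Pearson -> 1.0
    t.calculate_correlation(cat, cat)      # both categorical: Cramer's V -> 1.0
    t.calculate_correlation(num, cat)      # mixed: Pearson on category codes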
+     def run(self):
+         try:
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Check the disk cache first
+             self.progress.emit(0, "Checking for cached results...")
+             cached_results = get_cached_results(self.df, self.column)
+             if cached_results is not None:
+                 # Check if canceled
+                 if self._is_canceled:
+                     return
+
+                 self.progress.emit(95, "Found cached results, loading...")
+                 time.sleep(0.5)  # Brief pause so the user can see that a cache was found
+
+                 # Check if canceled
+                 if self._is_canceled:
+                     return
+
+                 self.progress.emit(100, "Loaded from cache")
+                 self.result.emit(cached_results)
+                 return
+
+             # Clean up memory before the intensive computation
+             gc.collect()
+
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Early check for an empty dataframe or no columns
+             if self.df.empty or len(self.df.columns) == 0:
+                 raise ValueError("The dataframe is empty or has no columns for analysis")
+
+             # No cache found, proceed with the computation
+             self.progress.emit(5, "Computing new analysis...")
+
+             # Validate that the target column exists in the dataframe
+             if self.column not in self.df.columns:
+                 raise ValueError(f"Target column '{self.column}' not found in the dataframe")
+
+             # Create a copy to avoid modifying the original dataframe
+             df = self.df.copy()
+
+             # Verify we have data to work with
+             if len(df) == 0:
+                 raise ValueError("No data available for analysis (empty dataframe)")
+
+             # Sample up to 500 rows to balance statistical significance against speed
+             if len(df) > 500:
+                 sample_size = 500
+                 self.progress.emit(10, f"Sampling dataset (using {sample_size} rows from {len(df)} total)...")
+                 df = df.sample(n=sample_size, random_state=42)
+                 # Force garbage collection after sampling
+                 gc.collect()
+
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Drop columns with too many unique values (likely IDs) or excessive NaNs
+             self.progress.emit(15, "Analyzing columns for preprocessing...")
+             cols_to_drop = []
+             for col in df.columns:
+                 if col == self.column:  # Don't drop the target column
+                     continue
+                 try:
+                     # Only drop columns with extremely high uniqueness (99% rather than 95%),
+                     # which keeps more features available for the analysis
+                     if df[col].nunique() / len(df) > 0.99 and len(df) > 100:
+                         cols_to_drop.append(col)
+                     # Only drop columns with very high missing rates (80% rather than 50%)
+                     elif df[col].isna().mean() > 0.8:
+                         cols_to_drop.append(col)
+                 except Exception:
+                     # If we can't analyze the column, drop it
+                     cols_to_drop.append(col)
+
+             # Drop the identified columns, but ensure we keep at least some features
+             remaining_cols = [col for col in df.columns if col != self.column and col not in cols_to_drop]
+
+             # If dropping would leave no features, keep at least 3 columns (or all if fewer than 3)
+             if len(remaining_cols) == 0 and len(cols_to_drop) > 0:
+                 # Rank the dropped columns by uniqueness (keep those with lower uniqueness)
+                 col_uniqueness = {}
+                 for col in cols_to_drop:
+                     try:
+                         col_uniqueness[col] = df[col].nunique() / len(df)
+                     except Exception:
+                         col_uniqueness[col] = 1.0  # Assume high uniqueness for problematic columns
+
+                 # Sort by uniqueness and keep the least unique columns
+                 cols_to_keep = sorted(col_uniqueness.items(), key=lambda x: x[1])[:min(3, len(cols_to_drop))]
+                 cols_to_drop = [col for col in cols_to_drop if col not in [c[0] for c in cols_to_keep]]
+                 print(f"Keeping {len(cols_to_keep)} columns to ensure the analysis can proceed")
+
+             if cols_to_drop:
+                 self.progress.emit(20, f"Removing {len(cols_to_drop)} low-information columns...")
+                 df = df.drop(columns=cols_to_drop)
+
+             # Ensure the target column is still in the dataframe
+             if self.column not in df.columns:
+                 raise ValueError(f"Target column '{self.column}' not found in dataframe after preprocessing")
+
+             # Calculate correlation coefficients first
+             self.progress.emit(25, "Calculating correlation measures...")
+             correlations = {}
+
+             # Get all feature columns (excluding the target)
+             feature_cols = [col for col in df.columns if col != self.column]
+
+             # Calculate the correlation of each feature with the target
+             for col in feature_cols:
+                 try:
+                     cor_val = self.calculate_correlation(df[col], df[self.column])
+                     correlations[col] = cor_val
+                 except Exception as e:
+                     print(f"Error calculating correlation for {col}: {e}")
+                     correlations[col] = 0.0
+
+             # Separate features and target
+             self.progress.emit(30, "Preparing features and target...")
+             X = df.drop(columns=[self.column])
+             y = df[self.column]
+
+             # Handle high-cardinality categorical features
+             self.progress.emit(35, "Encoding categorical features...")
+             # Keep it simple and fast: label-encode every object column rather than
+             # dropping high-cardinality ones (the threshold only distinguishes the cases)
+             categorical_cols = X.select_dtypes(include='object').columns
+             high_cardinality_threshold = 20  # Higher threshold so more columns are kept
+
+             # Keep track of how many columns we've processed
+             columns_processed = 0
+             columns_kept = 0
+
+             for col in categorical_cols:
+                 columns_processed += 1
+                 unique_count = X[col].nunique()
+                 # Always keep the column; both branches currently use label encoding
+                 if unique_count <= high_cardinality_threshold:
+                     # Simple label encoding for low-cardinality features
+                     X[col] = X[col].fillna('_MISSING_').astype('category').cat.codes
+                     columns_kept += 1
+                 else:
+                     # For high-cardinality features, still encode rather than drop
+                     X[col] = X[col].fillna('_MISSING_').astype('category').cat.codes
+                     columns_kept += 1
+
+             # Log how many columns were kept
+             if columns_processed > 0:
+                 self.progress.emit(40, f"Encoded {columns_kept} categorical columns out of {columns_processed}")
+
+             # Handle the target column in a simple, fast way
+             if y.dtype == 'object':
+                 # For categorical targets, use simple category codes
+                 y = y.fillna('_MISSING_').astype('category').cat.codes
+             else:
+                 # For numeric targets, just fill NaNs with the mean
+                 y = y.fillna(y.mean() if pd.api.types.is_numeric_dtype(y) else y.mode()[0])
+
+             # Train/test split
+             self.progress.emit(45, "Splitting data into train/test sets...")
+
+             # Make sure we still have features to work with
+             if X.shape[1] == 0:
+                 raise ValueError("No features remain after preprocessing. Try selecting a different target column.")
+
+             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Train a tree-based model for feature importance
+             self.progress.emit(50, "Training RandomForest model...")
+
+             # Check the number of features left for analysis
+             feature_count = X_train.shape[1]
+
+             # Adjust model complexity based on the feature count
+             if feature_count < 3:
+                 max_depth = 3  # Simple trees for few features
+                 n_estimators = 10
+             else:
+                 max_depth = 5  # Moderate-depth trees
+                 n_estimators = 10
+
+             model = RandomForestRegressor(
+                 n_estimators=n_estimators,
+                 max_depth=max_depth,
+                 min_samples_split=5,  # Prevent overfitting
+                 min_samples_leaf=2,   # Prevent overfitting
+                 max_features='sqrt',  # Use a subset of features per tree
+                 n_jobs=1,             # Single thread to avoid overhead
+                 random_state=42,
+                 verbose=0             # Suppress output
+             )
+
+             # Use simpler parameters for large feature sets
+             if X_train.shape[1] > 100:  # If there are many features
+                 self.progress.emit(55, "Large feature set detected, using simpler model...")
+                 model.set_params(n_estimators=5, max_depth=3)
+
+             # Fit the model, with a try/except to catch memory issues
+             try:
+                 model.fit(X_train, y_train)
+             except Exception as e:
+                 # Log the error for debugging
+                 print(f"Initial RandomForest fit failed: {str(e)}")
+
+                 # If we encounter an error, try an even smaller and simpler model
+                 self.progress.emit(55, "Adjusting model parameters due to computational constraints...")
+                 try:
+                     # Try a simpler regressor with more conservative parameters
+                     model = RandomForestRegressor(
+                         n_estimators=3,
+                         max_depth=2,
+                         max_features='sqrt',
+                         n_jobs=1,
+                         random_state=42,
+                         verbose=0
+                     )
+                     model.fit(X_train, y_train)
+                 except Exception as inner_e:
+                     # If even the simpler model fails, resort to a fallback strategy
+                     print(f"Even simpler RandomForest failed: {str(inner_e)}")
+                     self.progress.emit(60, "Using fallback importance calculation method...")
+
+                     # Derive a basic feature importance from the correlation with the
+                     # target; this is a simple fallback when model training fails
+                     importance = []
+                     for col in X.columns:
+                         try:
+                             # Use the pre-calculated correlations for the fallback importance
+                             corr_value = correlations.get(col, 0.5)
+                             # Scale the correlation into a reasonable importance value
+                             # (higher correlation -> higher importance)
+                             importance.append(0.5 + corr_value / 2 if not pd.isna(corr_value) else 0.5)
+                         except Exception:
+                             # If the correlation lookup fails, use a default
+                             importance.append(0.5)
+
+                     # Normalize to sum to 1
+                     importance = np.array(importance)
+                     if sum(importance) > 0:
+                         importance = importance / sum(importance)
+                     else:
+                         # Equal importance if everything fails
+                         importance = np.ones(len(X.columns)) / len(X.columns)
+
+                     # Skip the model-based code path since we calculated importances manually
+                     self.progress.emit(80, "Creating importance results...")
+                     feature_importance = pd.DataFrame({
+                         'feature': X.columns,
+                         'importance_value': importance,
+                         'correlation': [correlations.get(col, 0.0) for col in X.columns]
+                     }).sort_values(by='importance_value', ascending=False)
+
+                     # Cache the results for future use
+                     self.progress.emit(95, "Caching results for future use...")
+                     cache_results(self.df, self.column, feature_importance)
+
+                     # Clean up after the computation
+                     del df, X, y, X_train, X_test, y_train, y_test
+                     gc.collect()
+
+                     # Check if canceled
+                     if self._is_canceled:
+                         return
+
+                     # Emit the result
+                     self.progress.emit(100, "Analysis complete (fallback method)")
+                     self.result.emit(feature_importance)
+                     return
+
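For intuition about the 0.5 + corr/2 scaling in that fallback (toy numbers, not from the package): correlations {a: 0.8, b: 0.2} become raw scores 0.9 and 0.6, which normalize to importances of 0.60 and 0.40.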
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Get the feature importance from the trained model
+             self.progress.emit(80, "Calculating feature importance and correlations...")
+
+             try:
+                 # Check that we have features to analyze
+                 if X.shape[1] == 0:
+                     raise ValueError("No features available for importance analysis")
+
+                 # Get the feature importances from the RandomForest
+                 importance = model.feature_importances_
+
+                 # Verify the importance values are valid
+                 if np.isnan(importance).any() or np.isinf(importance).any():
+                     # Handle NaN or Inf values
+                     print("Warning: Invalid importance values detected, using fallback method")
+                     # Replace with equal importance
+                     importance = np.ones(len(X.columns)) / len(X.columns)
+
+                 # Create and sort the importance dataframe with correlations
+                 feature_importance = pd.DataFrame({
+                     'feature': X.columns,
+                     'importance_value': importance,
+                     'correlation': [correlations.get(col, 0.0) for col in X.columns]
+                 }).sort_values(by='importance_value', ascending=False)
+
+                 # Cache the results for future use
+                 self.progress.emit(95, "Caching results for future use...")
+                 cache_results(self.df, self.column, feature_importance)
+
+                 # Clean up after the computation
+                 del df, X, y, X_train, X_test, y_train, y_test, model
+                 gc.collect()
+
+                 # Check if canceled
+                 if self._is_canceled:
+                     return
+
+                 # Emit the result
+                 self.progress.emit(100, "Analysis complete")
+                 self.result.emit(feature_importance)
+                 return
+
+             except Exception as e:
+                 print(f"Error in feature importance calculation: {e}")
+                 import traceback
+                 traceback.print_exc()
+
+                 # Create fallback importance values when the model-based approach fails
+                 self.progress.emit(85, "Using alternative importance calculation method...")
+
+                 try:
+                     # Try a correlation-based approach first
+                     importance = []
+                     has_valid_correlations = False
+
+                     for col in X.columns:
+                         try:
+                             # Use the pre-calculated correlations
+                             corr = correlations.get(col, 0.1)
+                             if not pd.isna(corr):
+                                 importance.append(corr)
+                                 has_valid_correlations = True
+                             else:
+                                 importance.append(0.1)  # Default for a failed correlation
+                         except Exception:
+                             # Default value for any error
+                             importance.append(0.1)
+
+                     # Normalize the importance values
+                     importance = np.array(importance)
+                     if has_valid_correlations and sum(importance) > 0:
+                         # If we have valid correlations, use them normalized
+                         importance = importance / max(sum(importance), 0.001)
+                     else:
+                         # Otherwise use a frequency-based heuristic
+                         print("Using frequency-based feature importance as fallback")
+                         # Use uniqueness as a proxy for importance: columns with fewer
+                         # unique values (more predictive groupings) get higher scores
+                         importance = []
+                         total_rows = len(X)
+
+                         for col in X.columns:
+                             try:
+                                 uniqueness = X[col].nunique() / total_rows
+                                 # Invert and scale between 0.1 and 1.0
+                                 val = 1.0 - (0.9 * uniqueness)
+                                 importance.append(max(0.1, min(1.0, val)))
+                             except Exception:
+                                 importance.append(0.1)  # Default value
+
+                         # Normalize
+                         importance = np.array(importance)
+                         importance = importance / max(sum(importance), 0.001)
+
+                 except Exception as fallback_error:
+                     # Last resort: equal importance for all features
+                     print(f"Fallback error: {fallback_error}, using equal importance")
+                     importance = np.ones(len(X.columns)) / max(len(X.columns), 1)
+
+                 # Create a dataframe with the results, including correlations
+                 feature_importance = pd.DataFrame({
+                     'feature': X.columns,
+                     'importance_value': importance,
+                     'correlation': [correlations.get(col, 0.0) for col in X.columns]
+                 }).sort_values(by='importance_value', ascending=False)
+
+                 # Cache the results
+                 try:
+                     cache_results(self.df, self.column, feature_importance)
+                 except Exception:
+                     pass  # Ignore cache errors
+
+                 # Clean up
+                 try:
+                     del df, X, y, X_train, X_test, y_train, y_test
+                     gc.collect()
+                 except Exception:
+                     pass
+
+                 # Emit the result
+                 self.progress.emit(100, "Analysis complete (with fallback methods)")
+                 self.result.emit(feature_importance)
+                 return
+
+         except IndexError as e:
+             # Handle index errors with more detail
+             import traceback
+             import inspect
+             trace = traceback.format_exc()
+
+             # Get more detailed information
+             frame = inspect.trace()[-1]
+             frame_info = inspect.getframeinfo(frame[0])
+             filename = frame_info.filename
+             lineno = frame_info.lineno
+             function = frame_info.function
+             code_context = frame_info.code_context[0].strip() if frame_info.code_context else "Unknown code context"
+
+             # Format a more detailed error message
+             detail_msg = f"IndexError: {str(e)}\nLocation: {filename}:{lineno} in function '{function}'\nCode: {code_context}\n\n{trace}"
+             print(detail_msg)  # Print to the console for debugging
+
+             if not self._is_canceled:
+                 self.error.emit(f"Index error at line {lineno} in {function}:\n{str(e)}\nCode: {code_context}")
+
+         except Exception as e:
+             if not self._is_canceled:  # Only emit an error if not canceled
+                 import traceback
+                 trace = traceback.format_exc()
+                 print(f"Error in ExplainerThread: {str(e)}")
+                 print(trace)  # Print the full stack trace to help debug
+                 self.error.emit(f"{str(e)}\n\nTrace: {trace}")
+
+     def analyze_column(self):
+         if self.df is None or self.column_selector.currentText() == "":
+             return
+
+         # Cancel any existing worker thread
+         if self.worker_thread and self.worker_thread.isRunning():
+             # Signal the thread to cancel
+             self.worker_thread.cancel()
+
+             try:
+                 # Disconnect all signals to avoid callbacks during termination
+                 self.worker_thread.progress.disconnect()
+                 self.worker_thread.result.disconnect()
+                 self.worker_thread.error.disconnect()
+                 self.worker_thread.finished.disconnect()
+             except Exception:
+                 pass  # Already disconnected
+
+             # Force-stop the thread as a last resort
+             self.worker_thread.terminate()
+             self.worker_thread.wait(1000)  # Wait up to 1 second
+             self.worker_thread = None  # Clear the reference
+
+         target_column = self.column_selector.currentText()
+
+         # Check the in-memory cache first (fastest)
+         if target_column in self.result_cache:
+             self.handle_results(self.result_cache[target_column])
+             return
+
+         # Check the global application-wide cache second (still fast)
+         global_key = get_cache_key(self.df, target_column)
+         if global_key in ColumnProfilerApp.global_cache:
+             self.result_cache[target_column] = ColumnProfilerApp.global_cache[global_key]
+             self.handle_results(self.result_cache[target_column])
+             return
+
+         # The disk cache will be checked in the worker thread
+
+         # Disable the analyze button while processing
+         self.analyze_button.setEnabled(False)
+
+         # Show the progress indicators
+         self.progress_bar.setValue(0)
+         self.progress_bar.show()
+         self.progress_label.setText("Starting analysis...")
+         self.progress_label.show()
+         self.cancel_button.show()
+
+         # Create and start the worker thread
+         self.worker_thread = ExplainerThread(self.df, target_column)
+         self.worker_thread.progress.connect(self.update_progress)
+         self.worker_thread.result.connect(self.cache_and_display_results)
+         self.worker_thread.error.connect(self.handle_error)
+         self.worker_thread.finished.connect(self.on_analysis_finished)
+         self.worker_thread.start()
+
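Taken together, the lookup above gives three cache tiers, each slower but more durable than the last (summarized from the code, not additional API):

  1. self.result_cache - per-window dict keyed by column name
  2. ColumnProfilerApp.global_cache - process-wide dict keyed by the md5 fingerprint
  3. ~/.sqlshell_cache/<key>.pkl - on-disk pickle with a 24-hour TTL, checked inside the worker thread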
+     def update_progress(self, value, message):
+         self.progress_bar.setValue(value)
+         self.progress_label.setText(message)
+
+     def cache_and_display_results(self, importance_df):
+         # Cache the results
+         target_column = self.column_selector.currentText()
+         self.result_cache[target_column] = importance_df
+
+         # Also cache in the global application cache
+         global_key = get_cache_key(self.df, target_column)
+         ColumnProfilerApp.global_cache[global_key] = importance_df
+
+         # Display the results
+         self.handle_results(importance_df)
+
+     def on_analysis_finished(self):
+         """Handle cleanup when the analysis is finished (either completed or cancelled)"""
+         self.analyze_button.setEnabled(True)
+         self.cancel_button.hide()
+
+     def handle_results(self, importance_df):
+         # Hide the progress indicators
+         self.progress_bar.hide()
+         self.progress_label.hide()
+         self.cancel_button.hide()
+
+         # Update the importance table to include the correlation column
+         self.importance_table.setColumnCount(3)
+         self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
+         self.importance_table.setRowCount(len(importance_df))
+
+         # Use a timer for incremental updates
+         self.importance_df = importance_df  # Store for incremental rendering
+         self.current_row = 0
+         self.render_timer = QTimer()
+         self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
+         self.render_timer.start(10)  # Render a batch every 10 ms
+
+     def render_next_batch(self, batch_size):
+         try:
+             if self.current_row >= len(self.importance_df):
+                 # All rows rendered; now render the chart and stop the timer
+                 self.render_chart()
+                 self.render_timer.stop()
+                 return
+
+             # Render a batch of rows
+             end_row = min(self.current_row + batch_size, len(self.importance_df))
+             for row in range(self.current_row, end_row):
+                 try:
+                     # Check that the row exists in the dataframe to prevent index errors
+                     if row < len(self.importance_df):
+                         feature = self.importance_df.iloc[row]['feature']
+                         importance_value = self.importance_df.iloc[row]['importance_value']
+
+                         # Add the correlation if available
+                         correlation = self.importance_df.iloc[row].get('correlation', None)
+                         if correlation is not None:
+                             self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
+                             self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
+                             self.importance_table.setItem(row, 2, QTableWidgetItem(str(round(correlation, 4))))
+                         else:
+                             self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
+                             self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
+                     else:
+                         # Handle an out-of-range index
+                         print(f"Warning: Row {row} is out of range (max: {len(self.importance_df)-1})")
+                         self.importance_table.setItem(row, 0, QTableWidgetItem("Error"))
+                         self.importance_table.setItem(row, 1, QTableWidgetItem("Out of range"))
+                         self.importance_table.setItem(row, 2, QTableWidgetItem("N/A"))
+                 except (IndexError, KeyError) as e:
+                     # Enhanced error reporting for index and key errors
+                     import traceback
+                     trace = traceback.format_exc()
+                     error_msg = f"Error rendering row {row}: {e.__class__.__name__}: {e}\n{trace}"
+                     print(error_msg)
+
+                     # Handle missing data in the dataframe gracefully
+                     self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
+                     self.importance_table.setItem(row, 1, QTableWidgetItem(f"{str(e)[:20]}"))
+                     self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
+                 except Exception as e:
+                     # Catch any other exceptions
+                     print(f"Unexpected error rendering row {row}: {e.__class__.__name__}: {e}")
+                     self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
+                     self.importance_table.setItem(row, 1, QTableWidgetItem("See console for details"))
+                     self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
+
+             self.current_row = end_row
+             QApplication.processEvents()  # Allow the UI to update
+         except Exception as e:
+             # Catch any exceptions in the rendering loop itself
+             import traceback
+             trace = traceback.format_exc()
+             error_msg = f"Error in render_next_batch: {e.__class__.__name__}: {e}\n{trace}"
+             print(error_msg)
+
+             # Try to stop the timer to prevent further errors
+             try:
+                 if self.render_timer and self.render_timer.isActive():
+                     self.render_timer.stop()
+             except Exception:
+                 pass
+
+             # Show the error
+             QMessageBox.critical(self, "Rendering Error",
+                                  f"Error rendering results: {e.__class__.__name__}: {e}")
+
+     def render_chart(self):
+         # Create the horizontal bar chart
+         try:
+             if self.importance_df is None or len(self.importance_df) == 0:
+                 # No data to render
+                 self.chart_view.axes.clear()
+                 self.chart_view.axes.text(0.5, 0.5, "No data available for chart",
+                                           ha='center', va='center', fontsize=12, color='gray')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+                 return
+
+             self.chart_view.axes.clear()
+
+             # Get a sorted copy based on the current sort key
+             plot_df = self.importance_df.sort_values(by=self.current_sort, ascending=False).head(20).copy()
+
+             # Verify we have data before proceeding
+             if len(plot_df) == 0:
+                 self.chart_view.axes.text(0.5, 0.5, "No features found with importance values",
+                                           ha='center', va='center', fontsize=12, color='gray')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+                 return
+
+             # Check that the required columns exist
+             required_columns = ['feature', 'importance_value']
+             missing_columns = [col for col in required_columns if col not in plot_df.columns]
+             if missing_columns:
+                 error_msg = f"Missing required columns: {', '.join(missing_columns)}"
+                 self.chart_view.axes.text(0.5, 0.5, error_msg,
+                                           ha='center', va='center', fontsize=12, color='red')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+                 print(f"Chart rendering error: {error_msg}")
+                 return
+
+             # Truncate long feature names for better display
+             max_feature_length = 30
+             plot_df['display_feature'] = plot_df['feature'].apply(
+                 lambda x: (str(x)[:max_feature_length] + '...') if len(str(x)) > max_feature_length else str(x)
+             )
+
+             # Reverse the order for better display (highest at the top)
+             plot_df = plot_df.iloc[::-1].reset_index(drop=True)
+
+             # Create a figure with two subplots side by side
+             self.chart_view.figure.clear()
+             gs = self.chart_view.figure.add_gridspec(1, 2, width_ratios=[3, 2])
+
+             # First subplot for importance
+             ax1 = self.chart_view.figure.add_subplot(gs[0, 0])
+
+             # Create a colormap for better visualization
+             cmap = plt.cm.Blues
+             colors = cmap(np.linspace(0.4, 0.8, len(plot_df)))
+
+             # Plot with custom colors
+             bars = ax1.barh(
+                 plot_df['display_feature'],
+                 plot_df['importance_value'],
+                 color=colors,
+                 height=0.7,  # Thinner bars for more spacing
+                 alpha=0.8
+             )
+
+             # Add values at the end of the bars
+             for bar in bars:
+                 width = bar.get_width()
+                 ax1.text(
+                     width * 1.05,
+                     bar.get_y() + bar.get_height() / 2,
+                     f'{width:.2f}',
+                     va='center',
+                     fontsize=9,
+                     fontweight='bold'
+                 )
+
+             # Add a grid for better readability
+             ax1.grid(True, axis='x', linestyle='--', alpha=0.3)
+
+             # Remove unnecessary spines
+             for spine in ['top', 'right']:
+                 ax1.spines[spine].set_visible(False)
+
+             # Make the labels more readable
+             ax1.tick_params(axis='y', labelsize=9)
+
+             # Set the title and labels
+             ax1.set_title(f'Feature Importance for {self.column_selector.currentText()}')
+             ax1.set_xlabel('Importance Value')
+
+             # Add a note about the sorting order
+             sort_label = "Sorted by: " + ("Importance" if self.current_sort == 'importance_value' else "Correlation")
+
+             # Second subplot for correlation if available
+             if 'correlation' in plot_df.columns:
+                 ax2 = self.chart_view.figure.add_subplot(gs[0, 1], sharey=ax1)
+
+                 # Create a colormap for correlation - use a different color
+                 cmap_corr = plt.cm.Reds
+                 colors_corr = cmap_corr(np.linspace(0.4, 0.8, len(plot_df)))
+
+                 # Plot the correlation bars
+                 corr_bars = ax2.barh(
+                     plot_df['display_feature'],
+                     plot_df['correlation'],
+                     color=colors_corr,
+                     height=0.7,
+                     alpha=0.8
+                 )
+
+                 # Add values at the end of the correlation bars
+                 for bar in corr_bars:
+                     width = bar.get_width()
+                     ax2.text(
+                         width * 1.05,
+                         bar.get_y() + bar.get_height() / 2,
+                         f'{width:.2f}',
+                         va='center',
+                         fontsize=9,
+                         fontweight='bold'
+                     )
+
+                 # Add grid and styling
+                 ax2.grid(True, axis='x', linestyle='--', alpha=0.3)
+                 ax2.set_title('Absolute Correlation')
+                 ax2.set_xlabel('Correlation Value')
+
+                 # Hide the y-axis labels since they're shared with the first plot
+                 ax2.set_yticklabels([])
+
+                 # Remove unnecessary spines
+                 for spine in ['top', 'right']:
+                     ax2.spines[spine].set_visible(False)
+
+             # Add a note about the current sort order
+             self.chart_view.figure.text(0.5, 0.01, sort_label, ha='center', fontsize=9, style='italic')
+
+             # Adjust the figure size based on the number of features
+             feature_count = len(plot_df)
+             self.chart_view.figure.set_figheight(max(5, min(4 + feature_count * 0.3, 12)))
+
+             # Adjust the layout and draw
+             self.chart_view.figure.tight_layout(rect=[0, 0.03, 1, 0.97])  # Make room for the sort label
+             self.chart_view.draw()
+
+         except IndexError as e:
+             # Special handling for index errors with detailed information
+             import traceback
+             import inspect
+
+             # Get the stack trace information
+             trace = traceback.format_exc()
+
+             # Try to get line and context information
+             try:
+                 frame = inspect.trace()[-1]
+                 frame_info = inspect.getframeinfo(frame[0])
+                 filename = frame_info.filename
+                 lineno = frame_info.lineno
+                 function = frame_info.function
+                 code_context = frame_info.code_context[0].strip() if frame_info.code_context else "Unknown code context"
+
+                 # Detailed error message
+                 detail_msg = f"IndexError at line {lineno} in {function}: {str(e)}\nCode: {code_context}"
+                 print(f"Chart rendering error: {detail_msg}\n{trace}")
+
+                 # Display the error in the chart
+                 self.chart_view.axes.clear()
+                 self.chart_view.axes.text(0.5, 0.5,
+                                           f"Index Error in chart rendering:\n{str(e)}\nAt line {lineno}: {code_context}",
+                                           ha='center', va='center', fontsize=12, color='red',
+                                           wrap=True)
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+             except Exception as inner_e:
+                 # Fallback if the detailed error reporting fails
+                 print(f"Error getting detailed error info: {inner_e}")
+                 print(f"Original error: {e}\n{trace}")
+
+                 self.chart_view.axes.clear()
+                 self.chart_view.axes.text(0.5, 0.5, f"Index Error: {str(e)}",
+                                           ha='center', va='center', fontsize=12, color='red')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+         except Exception as e:
+             # Recover gracefully from any chart rendering errors with detailed information
+             import traceback
+             trace = traceback.format_exc()
+             error_msg = f"Error rendering chart: {e.__class__.__name__}: {str(e)}"
+             print(f"{error_msg}\n{trace}")
+
+             self.chart_view.axes.clear()
+             self.chart_view.axes.text(0.5, 0.5, error_msg,
+                                       ha='center', va='center', fontsize=12, color='red',
+                                       wrap=True)
+             self.chart_view.axes.set_axis_off()
+             self.chart_view.draw()
+
+     def handle_error(self, error_message):
+         """Handle errors during analysis"""
+         # Hide the progress indicators
+         self.progress_bar.hide()
+         self.progress_label.hide()
+         self.cancel_button.hide()
+
+         # Re-enable the analyze button
+         self.analyze_button.setEnabled(True)
+
+         # Print the error to the console for debugging
+         print(f"Error in column profiler: {error_message}")
+
+         # Show an error notification with just the first line of the message
+         # (split(chr(10))[0] returns the whole string when there is no newline)
+         show_error_notification(f"Analysis Error: {error_message.split(chr(10))[0]}")
+
+         # Show a message in the UI as well
+         self.importance_table.setRowCount(1)
+         self.importance_table.setColumnCount(3)
+         self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
+         self.importance_table.setItem(0, 0, QTableWidgetItem(f"Error: {error_message.split(chr(10))[0]}"))
+         self.importance_table.setItem(0, 1, QTableWidgetItem(""))
+         self.importance_table.setItem(0, 2, QTableWidgetItem(""))
+         self.importance_table.resizeColumnsToContents()
+
+         # Update the chart to show the error
+         self.chart_view.axes.clear()
+         self.chart_view.axes.text(0.5, 0.5, f"Error calculating importance:\n{error_message.split(chr(10))[0]}",
+                                   ha='center', va='center', fontsize=12, color='red',
+                                   wrap=True)
+         self.chart_view.axes.set_axis_off()
+         self.chart_view.draw()
+
+     def closeEvent(self, event):
+         """Clean up when the window is closed"""
+         # Stop any running timer
+         if self.render_timer and self.render_timer.isActive():
+             self.render_timer.stop()
+
+         # Clean up any background threads
+         if self.worker_thread and self.worker_thread.isRunning():
+             # Disconnect all signals to avoid callbacks during termination
+             try:
+                 self.worker_thread.progress.disconnect()
+                 self.worker_thread.result.disconnect()
+                 self.worker_thread.error.disconnect()
+                 self.worker_thread.finished.disconnect()
+             except Exception:
+                 pass  # Already disconnected
+
+             # Force-stop the thread as a last resort
+             self.worker_thread.terminate()
+             self.worker_thread.wait(1000)  # Wait up to 1 second
+
+             # Clear the reference to prevent thread issues
+             self.worker_thread = None
+
+         # Clean up memory
+         self.result_cache.clear()
+
+         # Accept the close event
+         event.accept()
+
+         # Suggest garbage collection
+         gc.collect()
+
+     def cancel_analysis(self):
+         """Cancel the current analysis"""
+         if self.worker_thread and self.worker_thread.isRunning():
+             # Signal the thread to cancel first
+             self.worker_thread.cancel()
+
+             # Disconnect all signals to avoid callbacks during termination
+             try:
+                 self.worker_thread.progress.disconnect()
+                 self.worker_thread.result.disconnect()
+                 self.worker_thread.error.disconnect()
+                 self.worker_thread.finished.disconnect()
+             except Exception:
+                 pass  # Already disconnected
+
+             # Force-stop the thread as a last resort
+             self.worker_thread.terminate()
+             self.worker_thread.wait(1000)  # Wait up to 1 second
+
+             # Clear the reference
+             self.worker_thread = None
+
+         # Update the UI
+         self.progress_bar.hide()
+         self.progress_label.setText("Analysis cancelled")
+         self.progress_label.show()
+         self.cancel_button.hide()
+         self.analyze_button.setEnabled(True)
+
+         # Hide the progress label after 2 seconds
+         QTimer.singleShot(2000, self.progress_label.hide)
+
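This cancel path illustrates the cooperative-cancellation pattern the module relies on: the _is_canceled flag is checked between units of work, and terminate() is only a last resort, since Qt does not guarantee that a terminated thread releases its resources cleanly. A minimal form of the pattern (sketch only; do_one_unit is a hypothetical placeholder):

    class Worker(QThread):
        def __init__(self):
            super().__init__()
            self._is_canceled = False

        def cancel(self):
            self._is_canceled = True

        def run(self):
            for step in range(100):
                if self._is_canceled:  # checked between units of work
                    return
                do_one_unit(step)      # hypothetical unit of work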
+     def show_relationship_visualization(self, row, column):
+         """Show a visualization of the relationship between the selected feature and the target column"""
+         if self.importance_df is None or row < 0 or row >= len(self.importance_df):
+             return
+
+         # Get the feature name and target column
+         try:
+             feature = self.importance_df.iloc[row]['feature']
+             target = self.column_selector.currentText()
+
+             # Verify both columns exist in the dataframe
+             if feature not in self.df.columns:
+                 QMessageBox.warning(self, "Column Not Found",
+                                     f"Feature column '{feature}' not found in the dataframe")
+                 return
+
+             if target not in self.df.columns:
+                 QMessageBox.warning(self, "Column Not Found",
+                                     f"Target column '{target}' not found in the dataframe")
+                 return
+         except Exception as e:
+             QMessageBox.critical(self, "Error", f"Error getting column data: {str(e)}")
+             return
+
+         # Create a dialog to show the visualization
+         dialog = QDialog(self)
+         dialog.setWindowTitle(f"Relationship: {feature} vs {target}")
+         dialog.resize(900, 700)
+
+         # Create the layout
+         layout = QVBoxLayout(dialog)
+
+         # Create a canvas for the plot
+         canvas = MatplotlibCanvas(width=8, height=6, dpi=100)
+         layout.addWidget(canvas)
+
+         # Determine the data types
+         feature_is_numeric = pd.api.types.is_numeric_dtype(self.df[feature])
+         target_is_numeric = pd.api.types.is_numeric_dtype(self.df[target])
+
+         # Get unique counts to determine if we have high cardinality
+         feature_unique_count = self.df[feature].nunique()
+         target_unique_count = self.df[target].nunique()
+
+         # Define the high-cardinality threshold
+         high_cardinality_threshold = 10
+
+         # Clear the figure
+         canvas.axes.clear()
+
+         # Create a working copy of the dataframe
+         working_df = self.df.copy()
+
+         # Prepare data for high-cardinality columns
+         if not feature_is_numeric and feature_unique_count > high_cardinality_threshold:
+             # Get the top N categories by frequency
+             top_categories = self.df[feature].value_counts().nlargest(high_cardinality_threshold).index.tolist()
+             # Create an "Other" category for the remaining values
+             working_df[feature] = working_df[feature].apply(lambda x: x if x in top_categories else 'Other')
+
+         if not target_is_numeric and target_unique_count > high_cardinality_threshold:
+             top_categories = self.df[target].value_counts().nlargest(high_cardinality_threshold).index.tolist()
+             working_df[target] = working_df[target].apply(lambda x: x if x in top_categories else 'Other')
+
+         # Create an appropriate visualization based on the data types and cardinality
+         if feature_is_numeric and target_is_numeric:
+             # Numeric vs numeric: use hexbin for large datasets to avoid overplotting
+             if len(working_df) > 100:
+                 canvas.axes.hexbin(
+                     working_df[feature],
+                     working_df[target],
+                     gridsize=25,
+                     cmap='Blues',
+                     mincnt=1
+                 )
+                 canvas.axes.set_title(f"Hexbin Density Plot: {feature} vs {target}")
+                 canvas.axes.set_xlabel(feature)
+                 canvas.axes.set_ylabel(target)
+                 # Add a colorbar
+                 cbar = canvas.figure.colorbar(canvas.axes.collections[0], ax=canvas.axes)
+                 cbar.set_label('Count')
+             else:
+                 # For smaller datasets, use a scatter plot with transparency
+                 sns.scatterplot(
+                     x=feature,
+                     y=target,
+                     data=working_df,
+                     ax=canvas.axes,
+                     alpha=0.6
+                 )
+                 # Add a regression line
+                 sns.regplot(
+                     x=feature,
+                     y=target,
+                     data=working_df,
+                     ax=canvas.axes,
+                     scatter=False,
+                     line_kws={"color": "red"}
+                 )
+                 canvas.axes.set_title(f"Scatter Plot: {feature} vs {target}")
+
+         elif feature_is_numeric and not target_is_numeric:
+             # Numeric feature vs categorical target: box plot
+             if target_unique_count <= high_cardinality_threshold * 2:
+                 # Standard boxplot for a reasonable number of categories
+                 order = working_df[target].value_counts().nlargest(high_cardinality_threshold * 2).index
+
+                 # Calculate the counts for each category
+                 category_counts = working_df[target].value_counts()
+
+                 sns.boxplot(
+                     x=target,
+                     y=feature,
+                     data=working_df,
+                     ax=canvas.axes,
+                     order=order
+                 )
+                 canvas.axes.set_title(f"Box Plot: {feature} by {target}")
+
+                 # Add count annotations below each box
+                 for i, category in enumerate(order):
+                     if category in category_counts:
+                         count = category_counts[category]
+                         canvas.axes.text(
+                             i,
+                             canvas.axes.get_ylim()[0] - (canvas.axes.get_ylim()[1] - canvas.axes.get_ylim()[0]) * 0.05,
+                             f'n={count}',
+                             ha='center',
+                             va='top',
+                             fontsize=8,
+                             fontweight='bold'
+                         )
+
+                 # Rotate the x-axis labels for better readability
+                 canvas.axes.set_xticklabels(
+                     canvas.axes.get_xticklabels(),
+                     rotation=45,
+                     ha='right'
+                 )
+             else:
+                 # For very high cardinality, use a violin plot with limited categories
+                 order = working_df[target].value_counts().nlargest(high_cardinality_threshold).index
+                 working_df_filtered = working_df[working_df[target].isin(order)]
+
+                 # Calculate the counts for the filtered categories
+                 category_counts = working_df_filtered[target].value_counts()
+
+                 sns.violinplot(
+                     x=target,
+                     y=feature,
+                     data=working_df_filtered,
+                     ax=canvas.axes,
+                     inner='quartile',
+                     cut=0
+                 )
+                 canvas.axes.set_title(f"Violin Plot: {feature} by Top {len(order)} {target} Categories")
+
+                 # Add count annotations below each violin
+                 for i, category in enumerate(order):
+                     if category in category_counts:
+                         count = category_counts[category]
+                         canvas.axes.text(
+                             i,
+                             canvas.axes.get_ylim()[0] - (canvas.axes.get_ylim()[1] - canvas.axes.get_ylim()[0]) * 0.05,
+                             f'n={count}',
+                             ha='center',
+                             va='top',
+                             fontsize=8,
+                             fontweight='bold'
+                         )
+
+                 canvas.axes.set_xticklabels(
+                     canvas.axes.get_xticklabels(),
+                     rotation=45,
+                     ha='right'
+                 )
+
+         elif not feature_is_numeric and target_is_numeric:
+             # Categorical feature vs numeric target: bar plot
+             if feature_unique_count <= high_cardinality_threshold * 2:
+                 # Use a standard barplot for a reasonable number of categories
+                 order = working_df[feature].value_counts().nlargest(high_cardinality_threshold * 2).index
+
+                 # Calculate the counts for each category for the annotations
+                 category_counts = working_df[feature].value_counts()
+
+                 sns.barplot(
+                     x=feature,
+                     y=target,
+                     data=working_df,
+                     ax=canvas.axes,
+                     order=order,
+                     estimator=np.mean,
+                     errorbar=('ci', 95),
+                     capsize=0.2
+                 )
+                 canvas.axes.set_title(f"Bar Plot: Average {target} by {feature}")
+
+                 # Add value labels and counts on top of the bars
+                 for i, p in enumerate(canvas.axes.patches):
+                     # Get the category name for this bar
+                     if i < len(order):
+                         category = order[i]
+                         count = category_counts[category]
+
+                         # Add the mean value and count
+                         canvas.axes.annotate(
+                             f'{p.get_height():.1f}\n(n={count})',
+                             (p.get_x() + p.get_width() / 2., p.get_height()),
+                             ha='center',
+                             va='bottom',
+                             fontsize=8,
+                             rotation=0
+                         )
+
+                 # Rotate the x-axis labels if needed
+                 if feature_unique_count > 5:
+                     canvas.axes.set_xticklabels(
+                         canvas.axes.get_xticklabels(),
+                         rotation=45,
+                         ha='right'
+                     )
+             else:
+                 # For high cardinality, use a horizontal bar plot with the top N categories
+                 top_n = 15  # Show the top 15 categories
+                 # Calculate the mean of the target for each feature category
+                 grouped = working_df.groupby(feature)[target].agg(['mean', 'count', 'std']).reset_index()
+                 # Sort by mean and take the top categories
+                 top_groups = grouped.nlargest(top_n, 'mean')
+
+                 # Sort by mean value for better visualization
+                 sns.barplot(
+                     y=feature,
+                     x='mean',
+                     data=top_groups,
+                     ax=canvas.axes,
+                     orient='h'
+                 )
+                 canvas.axes.set_title(f"Top {top_n} Categories by Average {target}")
+                 canvas.axes.set_xlabel(f"Average {target}")
+
+                 # Add count annotations
+                 for i, row in enumerate(top_groups.itertuples()):
+                     canvas.axes.text(
+                         row.mean + 0.1,
+                         i,
+                         f'n={row.count}',
+                         va='center',
+                         fontsize=8
+                     )
+
+         else:
+             # Both feature and target are categorical
+             if feature_unique_count <= high_cardinality_threshold and target_unique_count <= high_cardinality_threshold:
+                 # Heatmap for categorical vs categorical with manageable cardinality
+                 crosstab = pd.crosstab(
+                     working_df[feature],
+                     working_df[target],
+                     normalize='index'
+                 )
+
+                 # Create a heatmap with improved readability
+                 sns.heatmap(
+                     crosstab,
+                     annot=True,
+                     cmap="YlGnBu",
+                     ax=canvas.axes,
+                     fmt='.2f',
+                     linewidths=0.5,
+                     annot_kws={"size": 9 if crosstab.size < 30 else 7}
+                 )
+                 canvas.axes.set_title(f"Heatmap: {feature} vs {target} (proportions)")
+             else:
+                 # For high cardinality in both, show a count plot of the top categories
+                 feature_top = working_df[feature].value_counts().nlargest(8).index
+                 target_top = working_df[target].value_counts().nlargest(5).index
+
+                 # Filter the data to only include the top categories
+                 filtered_df = working_df[
+                     working_df[feature].isin(feature_top) &
+                     working_df[target].isin(target_top)
+                 ]
+
+                 # Create a grouped count plot
+                 ax_plot = sns.countplot(
+                     x=feature,
+                     hue=target,
+                     data=filtered_df,
+                     ax=canvas.axes
+                 )
+                 canvas.axes.set_title(f"Count Plot: Top {len(feature_top)} {feature} by Top {len(target_top)} {target}")
+
+                 # Add count labels on top of the bars
+                 for p in canvas.axes.patches:
+                     if p.get_height() > 0:  # Only add labels for non-zero bars
+                         canvas.axes.annotate(
+                             f'{int(p.get_height())}',
+                             (p.get_x() + p.get_width() / 2., p.get_height()),
+                             ha='center',
+                             va='bottom',
+                             fontsize=8,
+                             rotation=0
+                         )
+
+                 # Rotate the x-axis labels
+                 canvas.axes.set_xticklabels(
+                     canvas.axes.get_xticklabels(),
+                     rotation=45,
+                     ha='right'
+                 )
+
+                 # Move the legend to a better position
+                 canvas.axes.legend(title=target, bbox_to_anchor=(1.05, 1), loc='upper left')
+
+         # Add informational text about data reduction if applicable
+         if (not feature_is_numeric and feature_unique_count > high_cardinality_threshold) or \
+            (not target_is_numeric and target_unique_count > high_cardinality_threshold):
+             canvas.figure.text(
+                 0.5, 0.01,
+                 f"Note: Visualization simplified to show top categories only. Original data has {feature_unique_count} unique {feature} values and {target_unique_count} unique {target} values.",
+                 ha='center',
+                 fontsize=8,
+                 style='italic'
+             )
+
+         # Adjust the layout and draw
+         canvas.figure.tight_layout()
+         canvas.draw()
+
+         # Add a close button
+         close_button = QPushButton("Close")
+         close_button.clicked.connect(dialog.accept)
+         layout.addWidget(close_button)
+
+         # Show the dialog
+         dialog.exec()
+
+     def change_sort(self, sort_key):
+         """Change the sort order of the results"""
+         if self.importance_df is None:
+             return
+
+         # Update the button states
+         if sort_key == 'importance_value':
+             self.importance_sort_btn.setChecked(True)
+             self.correlation_sort_btn.setChecked(False)
+         else:
+             self.importance_sort_btn.setChecked(False)
+             self.correlation_sort_btn.setChecked(True)
+
+         # Store the current sort key
+         self.current_sort = sort_key
+
+         # Re-sort the dataframe
+         self.importance_df = self.importance_df.sort_values(by=sort_key, ascending=False)
+
+         # Reset the table rendering
+         self.importance_table.clearContents()
+         self.importance_table.setRowCount(len(self.importance_df))
+         self.current_row = 0
+
+         # Start incremental rendering with the new sort order
+         if self.render_timer and self.render_timer.isActive():
+             self.render_timer.stop()
+         self.render_timer = QTimer()
+         self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
+         self.render_timer.start(10)  # Render a batch every 10 ms
+
+ # Main application class
+ class ColumnProfilerApp(QMainWindow):
+     # Global application-wide cache to prevent redundant computations
+     global_cache = {}
+
+     def __init__(self, df):
+         super().__init__()
+
+         # Store reference to data
+         self.df = df
+
+         # Initialize cache for results
+         self.result_cache = {}
+
+         # Initialize thread variable
+         self.worker_thread = None
+
+         # Variables for incremental rendering
+         self.importance_df = None
+         self.current_row = 0
+         self.render_timer = None
+
+         # Current sort key
+         self.current_sort = 'importance_value'
+
+         # Set window properties
+         self.setWindowTitle("Column Profiler")
+         self.setMinimumSize(900, 600)
+
+         # Create central widget and main layout
+         central_widget = QWidget()
+         main_layout = QVBoxLayout(central_widget)
+
+         # Create top control panel
+         control_panel = QWidget()
+         control_layout = QHBoxLayout(control_panel)
+
+         # Column selector
+         self.column_selector = QComboBox()
+         self.column_selector.addItems([col for col in df.columns])
+         control_layout.addWidget(QLabel("Select Column to Analyze:"))
+         control_layout.addWidget(self.column_selector)
+
+         # Analyze button
+         self.analyze_button = QPushButton("Analyze")
+         self.analyze_button.clicked.connect(self.analyze_column)
+         control_layout.addWidget(self.analyze_button)
+
+         # Progress indicators
+         self.progress_bar = QProgressBar()
+         self.progress_bar.setRange(0, 100)
+         self.progress_bar.hide()
+         self.progress_label = QLabel()
+         self.progress_label.hide()
+
+         # Cancel button
+         self.cancel_button = QPushButton("Cancel")
+         self.cancel_button.clicked.connect(self.cancel_analysis)
+         self.cancel_button.hide()
+
+         control_layout.addWidget(self.progress_bar)
+         control_layout.addWidget(self.progress_label)
+         control_layout.addWidget(self.cancel_button)
+
+         # Add control panel to main layout
+         main_layout.addWidget(control_panel)
+
+         # Add sorting control
+         sort_panel = QWidget()
+         sort_layout = QHBoxLayout(sort_panel)
+         sort_layout.setContentsMargins(0, 0, 0, 0)
+
+         # Add sort label
+         sort_layout.addWidget(QLabel("Sort by:"))
+
+         # Add sort buttons
+         self.importance_sort_btn = QPushButton("Importance")
+         self.importance_sort_btn.setCheckable(True)
+         self.importance_sort_btn.setChecked(True)  # Default sort
+         self.importance_sort_btn.clicked.connect(lambda: self.change_sort('importance_value'))
+
+         self.correlation_sort_btn = QPushButton("Correlation")
+         self.correlation_sort_btn.setCheckable(True)
+         self.correlation_sort_btn.clicked.connect(lambda: self.change_sort('correlation'))
+
+         sort_layout.addWidget(self.importance_sort_btn)
+         sort_layout.addWidget(self.correlation_sort_btn)
+         sort_layout.addStretch()
+
+         # Add buttons to layout
+         main_layout.addWidget(sort_panel)
+
+         # Add a splitter for results area
+         results_splitter = QSplitter(Qt.Orientation.Vertical)
+
+         # Create table for showing importance values
+         self.importance_table = QTableWidget()
+         self.importance_table.setColumnCount(3)
+         self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
+         self.importance_table.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.Stretch)
+         self.importance_table.cellDoubleClicked.connect(self.show_relationship_visualization)
+         results_splitter.addWidget(self.importance_table)
+
+         # Add instruction label for double-click functionality
+         instruction_label = QLabel("Double-click on any feature to view detailed relationship visualization with the target column")
+         instruction_label.setStyleSheet("color: #666; font-style: italic;")
+         instruction_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
+         main_layout.addWidget(instruction_label)
+
+         # Create matplotlib canvas for the chart
+         self.chart_view = MatplotlibCanvas(width=8, height=5, dpi=100)
+         results_splitter.addWidget(self.chart_view)
+
+         # Set initial splitter sizes
+         results_splitter.setSizes([300, 300])
+
+         # Add the splitter to the main layout
+         main_layout.addWidget(results_splitter)
+
+         # Set the central widget
+         self.setCentralWidget(central_widget)
+
+     def analyze_column(self):
+         if self.df is None or self.column_selector.currentText() == "":
+             return
+
+         # Cancel any existing worker thread
+         if self.worker_thread and self.worker_thread.isRunning():
+             # Signal the thread to cancel
+             self.worker_thread.cancel()
+
+             try:
+                 # Disconnect all signals to avoid callbacks during termination
+                 self.worker_thread.progress.disconnect()
+                 self.worker_thread.result.disconnect()
+                 self.worker_thread.error.disconnect()
+                 self.worker_thread.finished.disconnect()
+             except Exception:
+                 pass  # Already disconnected
+
+             # Terminate thread properly
+             self.worker_thread.terminate()
+             self.worker_thread.wait(1000)  # Wait up to 1 second
+             self.worker_thread = None  # Clear reference
+
+         target_column = self.column_selector.currentText()
+
+         # Check in-memory cache first (fastest)
+         if target_column in self.result_cache:
+             self.handle_results(self.result_cache[target_column])
+             return
+
+         # Check global application-wide cache second (still fast)
+         global_key = get_cache_key(self.df, target_column)
+         if global_key in ColumnProfilerApp.global_cache:
+             self.result_cache[target_column] = ColumnProfilerApp.global_cache[global_key]
+             self.handle_results(self.result_cache[target_column])
+             return
+
+         # Disk cache will be checked in the worker thread
+
+         # Disable the analyze button while processing
+         self.analyze_button.setEnabled(False)
+
+         # Show progress indicators
+         self.progress_bar.setValue(0)
+         self.progress_bar.show()
+         self.progress_label.setText("Starting analysis...")
+         self.progress_label.show()
+         self.cancel_button.show()
+
+         # Create and start the worker thread
+         self.worker_thread = ExplainerThread(self.df, target_column)
+         self.worker_thread.progress.connect(self.update_progress)
+         self.worker_thread.result.connect(self.cache_and_display_results)
+         self.worker_thread.error.connect(self.handle_error)
+         self.worker_thread.finished.connect(self.on_analysis_finished)
+         self.worker_thread.start()
+
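+     # analyze_column consults three cache tiers in cost order: the per-instance
+     # result_cache, the class-wide global_cache, and finally the on-disk cache
+     # checked inside the worker. get_cache_key is defined earlier in this
+     # module; a plausible minimal version (a sketch only, not necessarily the
+     # actual implementation) would fingerprint the frame and target:
+     #
+     #     def get_cache_key(df, target_column):
+     #         sig = f"{df.shape}|{','.join(map(str, df.columns))}|{target_column}"
+     #         return hashlib.md5(sig.encode("utf-8")).hexdigest()
+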
+     def update_progress(self, value, message):
+         self.progress_bar.setValue(value)
+         self.progress_label.setText(message)
+
+     def cache_and_display_results(self, importance_df):
+         # Cache the results
+         target_column = self.column_selector.currentText()
+         self.result_cache[target_column] = importance_df
+
+         # Also cache in the global application cache
+         global_key = get_cache_key(self.df, target_column)
+         ColumnProfilerApp.global_cache[global_key] = importance_df
+
+         # Display the results
+         self.handle_results(importance_df)
+
+     def on_analysis_finished(self):
+         """Handle cleanup when analysis is finished (either completed or cancelled)"""
+         self.analyze_button.setEnabled(True)
+         self.cancel_button.hide()
+
+     def handle_results(self, importance_df):
+         # Hide progress indicators
+         self.progress_bar.hide()
+         self.progress_label.hide()
+         self.cancel_button.hide()
+
+         # Update importance table to include correlation column
+         self.importance_table.setColumnCount(3)
+         self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
+         self.importance_table.setRowCount(len(importance_df))
+
+         # Using a timer for incremental updates
+         self.importance_df = importance_df  # Store for incremental rendering
+         self.current_row = 0
+         self.render_timer = QTimer()
+         self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
+         self.render_timer.start(10)  # Update every 10ms
+
+     def render_next_batch(self, batch_size):
+         try:
+             if self.current_row >= len(self.importance_df):
+                 # All rows rendered, now render the chart and stop the timer
+                 self.render_chart()
+                 self.render_timer.stop()
+                 return
+
+             # Render a batch of rows
+             end_row = min(self.current_row + batch_size, len(self.importance_df))
+             for row in range(self.current_row, end_row):
+                 try:
+                     # Check if row exists in dataframe to prevent index errors
+                     if row < len(self.importance_df):
+                         feature = self.importance_df.iloc[row]['feature']
+                         importance_value = self.importance_df.iloc[row]['importance_value']
+
+                         # Add correlation if available
+                         correlation = self.importance_df.iloc[row].get('correlation', None)
+                         if correlation is not None:
+                             self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
+                             self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
+                             self.importance_table.setItem(row, 2, QTableWidgetItem(str(round(correlation, 4))))
+                         else:
+                             self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
+                             self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
+                     else:
+                         # Handle out of range index
+                         print(f"Warning: Row {row} is out of range (max: {len(self.importance_df)-1})")
+                         self.importance_table.setItem(row, 0, QTableWidgetItem("Error"))
+                         self.importance_table.setItem(row, 1, QTableWidgetItem("Out of range"))
+                         self.importance_table.setItem(row, 2, QTableWidgetItem("N/A"))
+                 except (IndexError, KeyError) as e:
+                     # Enhanced error reporting for index and key errors
+                     import traceback
+                     trace = traceback.format_exc()
+                     error_msg = f"Error rendering row {row}: {e.__class__.__name__}: {e}\n{trace}"
+                     print(error_msg)
+
+                     # Handle missing data in the dataframe gracefully
+                     self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
+                     self.importance_table.setItem(row, 1, QTableWidgetItem(f"{str(e)[:20]}"))
+                     self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
+                 except Exception as e:
+                     # Catch any other exceptions
+                     print(f"Unexpected error rendering row {row}: {e.__class__.__name__}: {e}")
+                     self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
+                     self.importance_table.setItem(row, 1, QTableWidgetItem("See console for details"))
+                     self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
+
+             self.current_row = end_row
+             QApplication.processEvents()  # Allow UI to update
+         except Exception as e:
+             # Catch any exceptions in the rendering loop itself
+             import traceback
+             trace = traceback.format_exc()
+             error_msg = f"Error in render_next_batch: {e.__class__.__name__}: {e}\n{trace}"
+             print(error_msg)
+
+             # Try to stop the timer to prevent further errors
+             try:
+                 if self.render_timer and self.render_timer.isActive():
+                     self.render_timer.stop()
+             except:
+                 pass
+
+             # Show error
+             QMessageBox.critical(self, "Rendering Error",
+                                  f"Error rendering results: {e.__class__.__name__}: {e}")
+
+     def render_chart(self):
+         # Create horizontal bar chart
+         try:
+             if self.importance_df is None or len(self.importance_df) == 0:
+                 # No data to render
+                 self.chart_view.axes.clear()
+                 self.chart_view.axes.text(0.5, 0.5, "No data available for chart",
+                                           ha='center', va='center', fontsize=12, color='gray')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+                 return
+
+             self.chart_view.axes.clear()
+
+             # Get a sorted copy based on current sort key
+             plot_df = self.importance_df.sort_values(by=self.current_sort, ascending=False).head(20).copy()
+
+             # Verify we have data before proceeding
+             if len(plot_df) == 0:
+                 self.chart_view.axes.text(0.5, 0.5, "No features found with importance values",
+                                           ha='center', va='center', fontsize=12, color='gray')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+                 return
+
+             # Check required columns exist
+             required_columns = ['feature', 'importance_value']
+             missing_columns = [col for col in required_columns if col not in plot_df.columns]
+             if missing_columns:
+                 error_msg = f"Missing required columns: {', '.join(missing_columns)}"
+                 self.chart_view.axes.text(0.5, 0.5, error_msg,
+                                           ha='center', va='center', fontsize=12, color='red')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+                 print(f"Chart rendering error: {error_msg}")
+                 return
+
+             # Truncate long feature names for better display
+             max_feature_length = 30
+             plot_df['display_feature'] = plot_df['feature'].apply(
+                 lambda x: (str(x)[:max_feature_length] + '...') if len(str(x)) > max_feature_length else str(x)
+             )
+
+             # Reverse order for better display (highest at top)
+             plot_df = plot_df.iloc[::-1].reset_index(drop=True)
+
+             # Create a figure with two subplots side by side
+             self.chart_view.figure.clear()
+             gs = self.chart_view.figure.add_gridspec(1, 2, width_ratios=[3, 2])
+
+             # First subplot for importance
+             ax1 = self.chart_view.figure.add_subplot(gs[0, 0])
+
+             # Create a colormap for better visualization
+             cmap = plt.cm.Blues
+             colors = cmap(np.linspace(0.4, 0.8, len(plot_df)))
+
+             # Plot with custom colors
+             bars = ax1.barh(
+                 plot_df['display_feature'],
+                 plot_df['importance_value'],
+                 color=colors,
+                 height=0.7,  # Thinner bars for more spacing
+                 alpha=0.8
+             )
+
+             # Add values at the end of bars
+             for bar in bars:
+                 width = bar.get_width()
+                 ax1.text(
+                     width * 1.05,
+                     bar.get_y() + bar.get_height()/2,
+                     f'{width:.2f}',
+                     va='center',
+                     fontsize=9,
+                     fontweight='bold'
+                 )
+
+             # Add grid for better readability
+             ax1.grid(True, axis='x', linestyle='--', alpha=0.3)
+
+             # Remove unnecessary spines
+             for spine in ['top', 'right']:
+                 ax1.spines[spine].set_visible(False)
+
+             # Make labels more readable
+             ax1.tick_params(axis='y', labelsize=9)
+
+             # Set title and labels
+             ax1.set_title(f'Feature Importance for {self.column_selector.currentText()}')
+             ax1.set_xlabel('Importance Value')
+
+             # Add a note about the sorting order
+             sort_label = "Sorted by: " + ("Importance" if self.current_sort == 'importance_value' else "Correlation")
+
+             # Second subplot for correlation if available
+             if 'correlation' in plot_df.columns:
+                 ax2 = self.chart_view.figure.add_subplot(gs[0, 1], sharey=ax1)
+
+                 # Create a colormap for correlation - use a different color
+                 cmap_corr = plt.cm.Reds
+                 colors_corr = cmap_corr(np.linspace(0.4, 0.8, len(plot_df)))
+
+                 # Plot correlation bars
+                 corr_bars = ax2.barh(
+                     plot_df['display_feature'],
+                     plot_df['correlation'],
+                     color=colors_corr,
+                     height=0.7,
+                     alpha=0.8
+                 )
+
+                 # Add values at the end of correlation bars
+                 for bar in corr_bars:
+                     width = bar.get_width()
+                     ax2.text(
+                         width * 1.05,
+                         bar.get_y() + bar.get_height()/2,
+                         f'{width:.2f}',
+                         va='center',
+                         fontsize=9,
+                         fontweight='bold'
+                     )
+
+                 # Add grid and styling
+                 ax2.grid(True, axis='x', linestyle='--', alpha=0.3)
+                 ax2.set_title('Absolute Correlation')
+                 ax2.set_xlabel('Correlation Value')
+
+                 # Hide y-axis labels since they're shared with the first plot
+                 ax2.set_yticklabels([])
+
+                 # Remove unnecessary spines
+                 for spine in ['top', 'right']:
+                     ax2.spines[spine].set_visible(False)
+
+             # Add a note about the current sort order
+             self.chart_view.figure.text(0.5, 0.01, sort_label, ha='center', fontsize=9, style='italic')
+
+             # Adjust figure size based on number of features
+             feature_count = len(plot_df)
+             self.chart_view.figure.set_figheight(max(5, min(4 + feature_count * 0.3, 12)))
+
+             # Adjust layout and draw
+             self.chart_view.figure.tight_layout(rect=[0, 0.03, 1, 0.97])  # Make room for sort label
+             self.chart_view.draw()
+
+         except IndexError as e:
+             # Special handling for index errors with detailed information
+             import traceback
+             import inspect
+
+             # Get stack trace information
+             trace = traceback.format_exc()
+
+             # Try to get line and context information
+             try:
+                 frame = inspect.trace()[-1]
+                 frame_info = inspect.getframeinfo(frame[0])
+                 filename = frame_info.filename
+                 lineno = frame_info.lineno
+                 function = frame_info.function
+                 code_context = frame_info.code_context[0].strip() if frame_info.code_context else "Unknown code context"
+
+                 # Detailed error message
+                 detail_msg = f"IndexError at line {lineno} in {function}: {str(e)}\nCode: {code_context}"
+                 print(f"Chart rendering error: {detail_msg}\n{trace}")
+
+                 # Display error in chart
+                 self.chart_view.axes.clear()
+                 self.chart_view.axes.text(0.5, 0.5,
+                                           f"Index Error in chart rendering:\n{str(e)}\nAt line {lineno}: {code_context}",
+                                           ha='center', va='center', fontsize=12, color='red',
+                                           wrap=True)
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+             except Exception as inner_e:
+                 # Fallback if the detailed error reporting fails
+                 print(f"Error getting detailed error info: {inner_e}")
+                 print(f"Original error: {e}\n{trace}")
+
+                 self.chart_view.axes.clear()
+                 self.chart_view.axes.text(0.5, 0.5, f"Index Error: {str(e)}",
+                                           ha='center', va='center', fontsize=12, color='red')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+         except Exception as e:
+             # Recover gracefully from any chart rendering errors with detailed information
+             import traceback
+             trace = traceback.format_exc()
+             error_msg = f"Error rendering chart: {e.__class__.__name__}: {str(e)}"
+             print(f"{error_msg}\n{trace}")
+
+             self.chart_view.axes.clear()
+             self.chart_view.axes.text(0.5, 0.5, error_msg,
+                                       ha='center', va='center', fontsize=12, color='red',
+                                       wrap=True)
+             self.chart_view.axes.set_axis_off()
+             self.chart_view.draw()
+
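+     # Note: render_chart calls figure.clear(), which discards the axes created
+     # in MatplotlibCanvas.__init__, so self.chart_view.axes is left pointing at
+     # a detached axes afterwards; the except branches above then draw into an
+     # axes that is no longer on the figure. A defensive fix (a suggestion, not
+     # present in this release) would re-attach one first:
+     #
+     #     self.chart_view.axes = self.chart_view.figure.add_subplot(111)
+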
+     def handle_error(self, error_message):
+         """Handle errors during analysis"""
+         # Hide progress indicators
+         self.progress_bar.hide()
+         self.progress_label.hide()
+         self.cancel_button.hide()
+
+         # Re-enable analyze button
+         self.analyze_button.setEnabled(True)
+
+         # Print error to console for debugging
+         print(f"Error in column profiler: {error_message}")
+
+         # Show error notification
+         show_error_notification(f"Analysis Error: {error_message.split(chr(10))[0] if chr(10) in error_message else error_message}")
+
+         # Show a message in the UI as well
+         self.importance_table.setRowCount(1)
+         self.importance_table.setColumnCount(3)
+         self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
+         self.importance_table.setItem(0, 0, QTableWidgetItem(f"Error: {error_message.split(chr(10))[0]}"))
+         self.importance_table.setItem(0, 1, QTableWidgetItem(""))
+         self.importance_table.setItem(0, 2, QTableWidgetItem(""))
+         self.importance_table.resizeColumnsToContents()
+
+         # Update the chart to show error
+         self.chart_view.axes.clear()
+         self.chart_view.axes.text(0.5, 0.5, f"Error calculating importance:\n{error_message.split(chr(10))[0]}",
+                                   ha='center', va='center', fontsize=12, color='red',
+                                   wrap=True)
+         self.chart_view.axes.set_axis_off()
+         self.chart_view.draw()
+
+     def closeEvent(self, event):
+         """Clean up when the window is closed"""
+         # Stop any running timer
+         if self.render_timer and self.render_timer.isActive():
+             self.render_timer.stop()
+
+         # Clean up any background threads
+         if self.worker_thread and self.worker_thread.isRunning():
+             # Disconnect all signals to avoid callbacks during termination
+             try:
+                 self.worker_thread.progress.disconnect()
+                 self.worker_thread.result.disconnect()
+                 self.worker_thread.error.disconnect()
+                 self.worker_thread.finished.disconnect()
+             except Exception:
+                 pass  # Already disconnected
+
+             # Terminate thread properly
+             self.worker_thread.terminate()
+             self.worker_thread.wait(1000)  # Wait up to 1 second
+
+             # Clear references to prevent thread issues
+             self.worker_thread = None
+
+         # Clean up memory
+         self.result_cache.clear()
+
+         # Accept the close event
+         event.accept()
+
+         # Suggest garbage collection
+         gc.collect()
+
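+     # QThread.terminate() is a hard stop and can leave locks or half-written
+     # disk caches behind. Since the worker exposes a cancel() flag, a gentler
+     # shutdown sequence (a sketch, assuming the worker's run loop polls its
+     # cancel flag or isInterruptionRequested()) would be:
+     #
+     #     self.worker_thread.cancel()
+     #     self.worker_thread.requestInterruption()
+     #     self.worker_thread.quit()
+     #     if not self.worker_thread.wait(3000):
+     #         self.worker_thread.terminate()  # last resort only
+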
+     def cancel_analysis(self):
+         """Cancel the current analysis"""
+         if self.worker_thread and self.worker_thread.isRunning():
+             # Signal the thread to cancel first
+             self.worker_thread.cancel()
+
+             # Disconnect all signals to avoid callbacks during termination
+             try:
+                 self.worker_thread.progress.disconnect()
+                 self.worker_thread.result.disconnect()
+                 self.worker_thread.error.disconnect()
+                 self.worker_thread.finished.disconnect()
+             except Exception:
+                 pass  # Already disconnected
+
+             # Terminate thread properly
+             self.worker_thread.terminate()
+             self.worker_thread.wait(1000)  # Wait up to 1 second
+
+             # Clear reference
+             self.worker_thread = None
+
+             # Update UI
+             self.progress_bar.hide()
+             self.progress_label.setText("Analysis cancelled")
+             self.progress_label.show()
+             self.cancel_button.hide()
+             self.analyze_button.setEnabled(True)
+
+             # Hide the progress label after 2 seconds
+             QTimer.singleShot(2000, self.progress_label.hide)
+
+     def show_relationship_visualization(self, row, column):
+         """Show visualization of relationship between selected feature and target column"""
+         if self.importance_df is None or row < 0 or row >= len(self.importance_df):
+             return
+
+         # Get the feature name and target column
+         try:
+             feature = self.importance_df.iloc[row]['feature']
+             target = self.column_selector.currentText()
+
+             # Verify both columns exist in the dataframe
+             if feature not in self.df.columns:
+                 QMessageBox.warning(self, "Column Not Found",
+                                     f"Feature column '{feature}' not found in the dataframe")
+                 return
+
+             if target not in self.df.columns:
+                 QMessageBox.warning(self, "Column Not Found",
+                                     f"Target column '{target}' not found in the dataframe")
+                 return
+         except Exception as e:
+             QMessageBox.critical(self, "Error", f"Error getting column data: {str(e)}")
+             return
+
+         # Create a dialog to show the visualization
+         dialog = QDialog(self)
+         dialog.setWindowTitle(f"Relationship: {feature} vs {target}")
+         dialog.resize(900, 700)
+
+         # Create layout
+         layout = QVBoxLayout(dialog)
+
+         # Create canvas for the plot
+         canvas = MatplotlibCanvas(width=8, height=6, dpi=100)
+         layout.addWidget(canvas)
+
+         # Determine the data types
+         feature_is_numeric = pd.api.types.is_numeric_dtype(self.df[feature])
+         target_is_numeric = pd.api.types.is_numeric_dtype(self.df[target])
+
+         # Get unique counts to determine if we have high cardinality
+         feature_unique_count = self.df[feature].nunique()
+         target_unique_count = self.df[target].nunique()
+
+         # Define high cardinality threshold
+         high_cardinality_threshold = 10
+
+         # Clear the figure
+         canvas.axes.clear()
+
+         # Create a working copy of the dataframe
+         working_df = self.df.copy()
+
+         # Prepare data for high cardinality columns
+         if not feature_is_numeric and feature_unique_count > high_cardinality_threshold:
+             # Get the top N categories by frequency
+             top_categories = self.df[feature].value_counts().nlargest(high_cardinality_threshold).index.tolist()
+             # Create "Other" category for remaining values
+             working_df[feature] = working_df[feature].apply(lambda x: x if x in top_categories else 'Other')
+
+         if not target_is_numeric and target_unique_count > high_cardinality_threshold:
+             top_categories = self.df[target].value_counts().nlargest(high_cardinality_threshold).index.tolist()
+             working_df[target] = working_df[target].apply(lambda x: x if x in top_categories else 'Other')
+
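+         # The two apply() calls above bucket rare categories into 'Other'. An
+         # equivalent vectorized form (shown for reference; not the code this
+         # release uses) avoids the per-row Python lambda:
+         #
+         #     s = working_df[feature]
+         #     working_df[feature] = s.where(s.isin(top_categories), 'Other')
+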
+         # Create appropriate visualization based on data types and cardinality
+         if feature_is_numeric and target_is_numeric:
+             # Scatter plot for numeric vs numeric
+             # Use hexbin for large datasets to avoid overplotting
+             if len(working_df) > 100:
+                 canvas.axes.hexbin(
+                     working_df[feature],
+                     working_df[target],
+                     gridsize=25,
+                     cmap='Blues',
+                     mincnt=1
+                 )
+                 canvas.axes.set_title(f"Hexbin Density Plot: {feature} vs {target}")
+                 canvas.axes.set_xlabel(feature)
+                 canvas.axes.set_ylabel(target)
+                 # Add a colorbar
+                 cbar = canvas.figure.colorbar(canvas.axes.collections[0], ax=canvas.axes)
+                 cbar.set_label('Count')
+             else:
+                 # For smaller datasets, use a scatter plot with transparency
+                 sns.scatterplot(
+                     x=feature,
+                     y=target,
+                     data=working_df,
+                     ax=canvas.axes,
+                     alpha=0.6
+                 )
+                 # Add regression line
+                 sns.regplot(
+                     x=feature,
+                     y=target,
+                     data=working_df,
+                     ax=canvas.axes,
+                     scatter=False,
+                     line_kws={"color": "red"}
+                 )
+                 canvas.axes.set_title(f"Scatter Plot: {feature} vs {target}")
+
+         elif feature_is_numeric and not target_is_numeric:
+             # Box plot for numeric vs categorical
+             if target_unique_count <= high_cardinality_threshold * 2:
+                 # Standard boxplot for reasonable number of categories
+                 order = working_df[target].value_counts().nlargest(high_cardinality_threshold * 2).index
+
+                 # Calculate counts for each category
+                 category_counts = working_df[target].value_counts()
+
+                 sns.boxplot(
+                     x=target,
+                     y=feature,
+                     data=working_df,
+                     ax=canvas.axes,
+                     order=order
+                 )
+                 canvas.axes.set_title(f"Box Plot: {feature} by {target}")
+
+                 # Add count annotations below each box
+                 for i, category in enumerate(order):
+                     if category in category_counts:
+                         count = category_counts[category]
+                         canvas.axes.text(
+                             i,
+                             canvas.axes.get_ylim()[0] - (canvas.axes.get_ylim()[1] - canvas.axes.get_ylim()[0]) * 0.05,
+                             f'n={count}',
+                             ha='center',
+                             va='top',
+                             fontsize=8,
+                             fontweight='bold'
+                         )
+
+                 # Rotate x-axis labels for better readability
+                 canvas.axes.set_xticklabels(
+                     canvas.axes.get_xticklabels(),
+                     rotation=45,
+                     ha='right'
+                 )
+             else:
+                 # For very high cardinality, use a violin plot with limited categories
+                 order = working_df[target].value_counts().nlargest(high_cardinality_threshold).index
+                 working_df_filtered = working_df[working_df[target].isin(order)]
+
+                 # Calculate counts for filtered categories
+                 category_counts = working_df_filtered[target].value_counts()
+
+                 sns.violinplot(
+                     x=target,
+                     y=feature,
+                     data=working_df_filtered,
+                     ax=canvas.axes,
+                     inner='quartile',
+                     cut=0
+                 )
+                 canvas.axes.set_title(f"Violin Plot: {feature} by Top {len(order)} {target} Categories")
+
+                 # Add count annotations below each violin
+                 for i, category in enumerate(order):
+                     if category in category_counts:
+                         count = category_counts[category]
+                         canvas.axes.text(
+                             i,
+                             canvas.axes.get_ylim()[0] - (canvas.axes.get_ylim()[1] - canvas.axes.get_ylim()[0]) * 0.05,
+                             f'n={count}',
+                             ha='center',
+                             va='top',
+                             fontsize=8,
+                             fontweight='bold'
+                         )
+
+                 canvas.axes.set_xticklabels(
+                     canvas.axes.get_xticklabels(),
+                     rotation=45,
+                     ha='right'
+                 )
+
+         elif not feature_is_numeric and target_is_numeric:
+             # Bar plot for categorical vs numeric
+             if feature_unique_count <= high_cardinality_threshold * 2:
+                 # Use standard barplot for reasonable number of categories
+                 order = working_df[feature].value_counts().nlargest(high_cardinality_threshold * 2).index
+
+                 # Calculate counts for each category for annotations
+                 category_counts = working_df[feature].value_counts()
+
+                 sns.barplot(
+                     x=feature,
+                     y=target,
+                     data=working_df,
+                     ax=canvas.axes,
+                     order=order,
+                     estimator=np.mean,
+                     errorbar=('ci', 95),
+                     capsize=0.2
+                 )
+                 canvas.axes.set_title(f"Bar Plot: Average {target} by {feature}")
+
+                 # Add value labels and counts on top of bars
+                 for i, p in enumerate(canvas.axes.patches):
+                     # Get the category name for this bar
+                     if i < len(order):
+                         category = order[i]
+                         count = category_counts[category]
+
+                         # Add mean value and count
+                         canvas.axes.annotate(
+                             f'{p.get_height():.1f}\n(n={count})',
+                             (p.get_x() + p.get_width() / 2., p.get_height()),
+                             ha='center',
+                             va='bottom',
+                             fontsize=8,
+                             rotation=0
+                         )
+
+                 # Rotate x-axis labels if needed
+                 if feature_unique_count > 5:
+                     canvas.axes.set_xticklabels(
+                         canvas.axes.get_xticklabels(),
+                         rotation=45,
+                         ha='right'
+                     )
+             else:
+                 # For high cardinality, use a horizontal bar plot with top N categories
+                 top_n = 15  # Show top 15 categories
+                 # Calculate mean of target for each feature category
+                 grouped = working_df.groupby(feature)[target].agg(['mean', 'count', 'std']).reset_index()
+                 # Sort by mean and take top categories
+                 top_groups = grouped.nlargest(top_n, 'mean')
+
+                 # Sort by mean value for better visualization
+                 sns.barplot(
+                     y=feature,
+                     x='mean',
+                     data=top_groups,
+                     ax=canvas.axes,
+                     orient='h'
+                 )
+                 canvas.axes.set_title(f"Top {top_n} Categories by Average {target}")
+                 canvas.axes.set_xlabel(f"Average {target}")
+
+                 # Add count annotations
+                 for i, row in enumerate(top_groups.itertuples()):
+                     canvas.axes.text(
+                         row.mean + 0.1,
+                         i,
+                         f'n={row.count}',
+                         va='center',
+                         fontsize=8
+                     )
+
+         else:
+             # Both feature and target are categorical
+             if feature_unique_count <= high_cardinality_threshold and target_unique_count <= high_cardinality_threshold:
+                 # Heatmap for categorical vs categorical with manageable cardinality
+                 crosstab = pd.crosstab(
+                     working_df[feature],
+                     working_df[target],
+                     normalize='index'
+                 )
+
+                 # Create heatmap with improved readability
+                 sns.heatmap(
+                     crosstab,
+                     annot=True,
+                     cmap="YlGnBu",
+                     ax=canvas.axes,
+                     fmt='.2f',
+                     linewidths=0.5,
+                     annot_kws={"size": 9 if crosstab.size < 30 else 7}
+                 )
+                 canvas.axes.set_title(f"Heatmap: {feature} vs {target} (proportions)")
+             else:
+                 # For high cardinality in both, show a count plot of top categories
+                 feature_top = working_df[feature].value_counts().nlargest(8).index
+                 target_top = working_df[target].value_counts().nlargest(5).index
+
+                 # Filter data to only include top categories
+                 filtered_df = working_df[
+                     working_df[feature].isin(feature_top) &
+                     working_df[target].isin(target_top)
+                 ]
+
+                 # Create a grouped count plot
+                 ax_plot = sns.countplot(
+                     x=feature,
+                     hue=target,
+                     data=filtered_df,
+                     ax=canvas.axes
+                 )
+                 canvas.axes.set_title(f"Count Plot: Top {len(feature_top)} {feature} by Top {len(target_top)} {target}")
+
+                 # Add count labels on top of bars
+                 for p in canvas.axes.patches:
+                     if p.get_height() > 0:  # Only add labels for non-zero bars
+                         canvas.axes.annotate(
+                             f'{int(p.get_height())}',
+                             (p.get_x() + p.get_width() / 2., p.get_height()),
+                             ha='center',
+                             va='bottom',
+                             fontsize=8,
+                             rotation=0
+                         )
+
+                 # Rotate x-axis labels
+                 canvas.axes.set_xticklabels(
+                     canvas.axes.get_xticklabels(),
+                     rotation=45,
+                     ha='right'
+                 )
+
+                 # Move legend to a better position
+                 canvas.axes.legend(title=target, bbox_to_anchor=(1.05, 1), loc='upper left')
+
+         # Add informational text about data reduction if applicable
+         if (not feature_is_numeric and feature_unique_count > high_cardinality_threshold) or \
+            (not target_is_numeric and target_unique_count > high_cardinality_threshold):
+             canvas.figure.text(
+                 0.5, 0.01,
+                 f"Note: Visualization simplified to show top categories only. Original data has {feature_unique_count} unique {feature} values and {target_unique_count} unique {target} values.",
+                 ha='center',
+                 fontsize=8,
+                 style='italic'
+             )
+
+         # Adjust layout and draw
+         canvas.figure.tight_layout()
+         canvas.draw()
+
+         # Add a close button
+         close_button = QPushButton("Close")
+         close_button.clicked.connect(dialog.accept)
+         layout.addWidget(close_button)
+
+         # Show the dialog
+         dialog.exec()
+
+     def change_sort(self, sort_key):
+         """Change the sort order of the results"""
+         if self.importance_df is None:
+             return
+
+         # Update button states
+         if sort_key == 'importance_value':
+             self.importance_sort_btn.setChecked(True)
+             self.correlation_sort_btn.setChecked(False)
+         else:
+             self.importance_sort_btn.setChecked(False)
+             self.correlation_sort_btn.setChecked(True)
+
+         # Store the current sort key
+         self.current_sort = sort_key
+
+         # Re-sort the dataframe
+         self.importance_df = self.importance_df.sort_values(by=sort_key, ascending=False)
+
+         # Reset rendering of the table
+         self.importance_table.clearContents()
+         self.importance_table.setRowCount(len(self.importance_df))
+         self.current_row = 0
+
+         # Start incremental rendering with the new sort order
+         if self.render_timer and self.render_timer.isActive():
+             self.render_timer.stop()
+         self.render_timer = QTimer()
+         self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
+         self.render_timer.start(10)  # Update every 10ms
+
+ # Custom matplotlib canvas for embedding in Qt
+ class MatplotlibCanvas(FigureCanvasQTAgg):
+     def __init__(self, width=5, height=4, dpi=100):
+         self.figure = Figure(figsize=(width, height), dpi=dpi)
+         self.axes = self.figure.add_subplot(111)
+         super().__init__(self.figure)
+
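+ # FigureCanvasQTAgg is itself a QWidget, so MatplotlibCanvas drops into any Qt
+ # layout like a normal widget. Minimal usage (illustrative; `some_layout` is a
+ # hypothetical stand-in):
+ #
+ #     canvas = MatplotlibCanvas(width=6, height=4, dpi=100)
+ #     canvas.axes.plot([0, 1, 2], [3, 1, 2])
+ #     some_layout.addWidget(canvas)
+ #     canvas.draw()
+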
+ def visualize_profile(df: pd.DataFrame, column: str = None) -> None:
+     """
+     Launch a PyQt6 UI for visualizing column importance.
+
+     Args:
+         df: DataFrame containing the data
+         column: Optional target column to analyze immediately
+     """
+     try:
+         # Verify df is a valid DataFrame
+         if not isinstance(df, pd.DataFrame):
+             raise ValueError("Input must be a pandas DataFrame")
+
+         # Verify df has data
+         if len(df) == 0:
+             raise ValueError("DataFrame is empty, cannot analyze")
+
+         # Verify columns exist
+         if column is not None and column not in df.columns:
+             raise ValueError(f"Column '{column}' not found in the DataFrame")
+
+         # Check if dataset is too small for meaningful analysis
+         row_count = len(df)
+         if row_count <= 5:
+             print(f"WARNING: Dataset only has {row_count} rows. Feature importance analysis requires more data for meaningful results.")
+             if QApplication.instance():
+                 QMessageBox.warning(None, "Insufficient Data",
+                                     f"The dataset only contains {row_count} rows. Feature importance analysis requires more data for meaningful results.")
+
+         # For large datasets, sample up to 500 rows for better statistical significance
+         elif row_count > 500:
+             print(f"Sampling 500 rows from dataset ({row_count:,} total rows)")
+             df = df.sample(n=500, random_state=42)
+
+         # Check if we're already in a Qt application
+         existing_app = QApplication.instance()
+         standalone_mode = existing_app is None
+
+         # Create app if needed
+         if standalone_mode:
+             app = QApplication(sys.argv)
+         else:
+             app = existing_app
+
+         app.setStyle('Fusion')  # Modern look
+
+         # Set modern dark theme (only in standalone mode to avoid affecting parent app)
+         if standalone_mode:
+             palette = QPalette()
+             palette.setColor(QPalette.ColorRole.Window, QColor(53, 53, 53))
+             palette.setColor(QPalette.ColorRole.WindowText, Qt.GlobalColor.white)
+             palette.setColor(QPalette.ColorRole.Base, QColor(25, 25, 25))
+             palette.setColor(QPalette.ColorRole.AlternateBase, QColor(53, 53, 53))
+             palette.setColor(QPalette.ColorRole.ToolTipBase, Qt.GlobalColor.white)
+             palette.setColor(QPalette.ColorRole.ToolTipText, Qt.GlobalColor.white)
+             palette.setColor(QPalette.ColorRole.Text, Qt.GlobalColor.white)
+             palette.setColor(QPalette.ColorRole.Button, QColor(53, 53, 53))
+             palette.setColor(QPalette.ColorRole.ButtonText, Qt.GlobalColor.white)
+             palette.setColor(QPalette.ColorRole.BrightText, Qt.GlobalColor.red)
+             palette.setColor(QPalette.ColorRole.Link, QColor(42, 130, 218))
+             palette.setColor(QPalette.ColorRole.Highlight, QColor(42, 130, 218))
+             palette.setColor(QPalette.ColorRole.HighlightedText, Qt.GlobalColor.black)
+             app.setPalette(palette)
+
+         window = ColumnProfilerApp(df)
+         window.setAttribute(Qt.WidgetAttribute.WA_DeleteOnClose)  # Ensure cleanup on close
+         window.show()
+
+         # Add tooltip to explain double-click functionality
+         window.importance_table.setToolTip("Double-click on a feature to visualize its relationship with the target column")
+
+         # If a specific column is provided, analyze it immediately
+         if column is not None and column in df.columns:
+             window.column_selector.setCurrentText(column)
+             # Wrap the analysis in a try/except to prevent crashes
+             def safe_analyze():
+                 try:
+                     window.analyze_column()
+                 except Exception as e:
+                     print(f"Error during column analysis: {e}")
+                     import traceback
+                     traceback.print_exc()
+                     QMessageBox.critical(window, "Analysis Error",
+                                          f"Error analyzing column:\n\n{str(e)}")
+
+             QTimer.singleShot(100, safe_analyze)  # Use timer to avoid immediate thread issues
+
+             # Set a watchdog timer to cancel analysis if it takes too long (30 seconds)
+             def check_progress():
+                 if window.worker_thread and window.worker_thread.isRunning():
+                     # If still running after 30 seconds, cancel the operation
+                     QMessageBox.warning(window, "Analysis Timeout",
+                                         "The analysis is taking longer than expected. It will be canceled to prevent hanging.")
+                     try:
+                         window.cancel_analysis()
+                     except Exception as e:
+                         print(f"Error canceling analysis: {e}")
+
+             QTimer.singleShot(30000, check_progress)  # 30 seconds timeout
+
+         # Only enter event loop in standalone mode
+         if standalone_mode:
+             sys.exit(app.exec())
+         else:
+             # Return the window for parent app to track
+             return window
+     except Exception as e:
+         # Handle any exceptions to prevent crashes
+         print(f"Error in visualize_profile: {e}")
+         import traceback
+         traceback.print_exc()
+
+         # Show error to user
+         if QApplication.instance():
+             show_error_notification(f"Profile Error: Error creating column profile - {str(e)}")
+         return None
+
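+ # Typical call sites (illustrative; "employees.csv" is a placeholder):
+ #
+ #     df = pd.read_csv("employees.csv")
+ #     visualize_profile(df)              # pick the target column in the UI
+ #     visualize_profile(df, "Salary")    # analyze a column immediately
+ #
+ # In a standalone script the call blocks inside the Qt event loop; when a
+ # QApplication already exists it returns the window for the host app to track.
+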
+ def test_profile():
+     """
+     Test the profile and visualization functions with sample data.
+     """
+     # Create a sample DataFrame with roughly 40 columns
+     np.random.seed(42)
+     n = 1000
+
+     # Generate core sample data with known relationships
+     age = np.random.normal(35, 10, n).astype(int)
+     experience = age - np.random.randint(18, 25, n)  # experience correlates with age
+     experience = np.maximum(0, experience)  # no negative experience
+
+     salary = 30000 + 2000 * experience + np.random.normal(0, 10000, n)
+
+     departments = np.random.choice(['Engineering', 'Marketing', 'Sales', 'HR', 'Finance'], n)
+     education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n,
+                                  p=[0.2, 0.5, 0.2, 0.1])
+
+     performance = np.random.normal(0, 1, n)
+     performance += 0.5 * (education == 'Master') + 0.8 * (education == 'PhD')  # education affects performance
+     performance += 0.01 * experience  # experience slightly affects performance
+     performance = (performance - performance.min()) / (performance.max() - performance.min()) * 5  # scale to 0-5
+
+     # Create the base DataFrame
+     data = {
+         'Age': age,
+         'Experience': experience,
+         'Department': departments,
+         'Education': education,
+         'Performance': performance,
+         'Salary': salary
+     }
+
+     # Generate additional numeric columns
+     for i in range(1, 15):
+         # Create some columns with relationship to salary
+         if i <= 5:
+             data[f'Metric_{i}'] = salary * (0.01 * i) + np.random.normal(0, 5000, n)
+         # Create columns with relationship to age
+         elif i <= 10:
+             data[f'Metric_{i}'] = age * (i-5) + np.random.normal(0, 10, n)
+         # Create random columns
+         else:
+             data[f'Metric_{i}'] = np.random.normal(100, 50, n)
+
+     # Generate additional categorical columns
+     categories = [
+         ['A', 'B', 'C', 'D'],
+         ['Low', 'Medium', 'High'],
+         ['North', 'South', 'East', 'West'],
+         ['Type1', 'Type2', 'Type3'],
+         ['Yes', 'No', 'Maybe'],
+         ['Red', 'Green', 'Blue', 'Yellow'],
+         ['Small', 'Medium', 'Large']
+     ]
+
+     for i in range(1, 10):
+         # Pick a category list
+         cat_list = categories[i % len(categories)]
+         # Generate random categorical column
+         data[f'Category_{i}'] = np.random.choice(cat_list, n)
+
+     # Generate date and time related columns
+     base_date = np.datetime64('2020-01-01')
+
+     # Instead of datetime objects, convert to days since base date (numeric values)
+     hire_days = np.array([365 * (35 - a) + np.random.randint(0, 30) for a in age])
+     data['Hire_Days_Ago'] = hire_days
+
+     promotion_days = np.array([np.random.randint(0, 1000) for _ in range(n)])
+     data['Last_Promotion_Days_Ago'] = promotion_days
+
+     review_days = np.array([np.random.randint(1000, 1200) for _ in range(n)])
+     data['Next_Review_In_Days'] = review_days
+
+     # For reference, also store the actual dates as strings instead of datetime64
+     data['Hire_Date_Str'] = [str(base_date + np.timedelta64(int(days), 'D')) for days in hire_days]
+     data['Last_Promotion_Date_Str'] = [str(base_date + np.timedelta64(int(days), 'D')) for days in promotion_days]
+     data['Review_Date_Str'] = [str(base_date + np.timedelta64(int(days), 'D')) for days in review_days]
+
+     # Binary columns
+     data['IsManager'] = np.random.choice([0, 1], n, p=[0.8, 0.2])
+     data['RemoteWorker'] = np.random.choice([0, 1], n)
+     data['HasHealthInsurance'] = np.random.choice([0, 1], n, p=[0.1, 0.9])
+     data['HasRetirementPlan'] = np.random.choice([0, 1], n, p=[0.15, 0.85])
+
+     # Columns with missing values
+     data['OptionalMetric_1'] = np.random.normal(50, 10, n)
+     data['OptionalMetric_1'][np.random.choice([True, False], n, p=[0.2, 0.8])] = np.nan
+
+     data['OptionalMetric_2'] = np.random.normal(100, 20, n)
+     data['OptionalMetric_2'][np.random.choice([True, False], n, p=[0.3, 0.7])] = np.nan
+
+     data['OptionalCategory'] = np.random.choice(['Option1', 'Option2', 'Option3', None], n, p=[0.3, 0.3, 0.3, 0.1])
+
+     # High cardinality column (like an ID)
+     data['ID'] = [f"ID_{i:06d}" for i in range(n)]
+
+     # Create the DataFrame (just over 40 columns in total)
+     df = pd.DataFrame(data)
+
+     print(f"Created sample DataFrame with {len(df.columns)} columns and {len(df)} rows")
+     print("Columns:", ', '.join(df.columns))
+     print("Launching PyQt6 Column Profiler application...")
+     visualize_profile(df, 'Salary')  # Start with Salary analysis
+
+ if __name__ == "__main__":
+     test_profile()