sqlshell 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sqlshell might be problematic.

@@ -0,0 +1,2515 @@
+ import pandas as pd
+ import xgboost as xgb
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import LabelEncoder
+ import sys
+ import time
+ import hashlib
+ import os
+ import pickle
+ import gc
+ from pathlib import Path
+ from PyQt6.QtWidgets import (QApplication, QMainWindow, QTableWidget, QTableWidgetItem,
+                              QVBoxLayout, QHBoxLayout, QLabel, QWidget, QComboBox,
+                              QPushButton, QSplitter, QHeaderView, QFrame, QProgressBar,
+                              QMessageBox, QDialog)
+ from PyQt6.QtCore import Qt, QAbstractTableModel, QModelIndex, QThread, pyqtSignal, QTimer
+ from PyQt6.QtGui import QPalette, QColor, QBrush, QPainter, QPen
+ from scipy.stats import chi2_contingency, pearsonr
+
+ # Import matplotlib at the top level
+ import matplotlib
+ matplotlib.use('QtAgg')
+ from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg
+ from matplotlib.figure import Figure
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+
+ # Create a cache directory in user's home directory
+ CACHE_DIR = os.path.join(Path.home(), '.sqlshell_cache')
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
+ def get_cache_key(df, column):
+     """Generate a cache key based on dataframe content and column"""
+     # Get DataFrame characteristics that make it unique
+     columns = ','.join(df.columns)
+     shapes = f"{df.shape[0]}x{df.shape[1]}"
+     col_types = ','.join(str(dtype) for dtype in df.dtypes)
+
+     # Sample some values as fingerprint without loading entire dataframe
+     sample_rows = min(50, len(df))
+     values_sample = df.head(sample_rows).values.tobytes()
+
+     # Create hash
+     hash_input = f"{columns}|{shapes}|{col_types}|{column}|{len(df)}"
+     m = hashlib.md5()
+     m.update(hash_input.encode())
+     m.update(values_sample)  # Add sample data to hash
+     return m.hexdigest()
+
+ def cache_results(df, column, results):
+     """Save results to disk cache"""
+     try:
+         cache_key = get_cache_key(df, column)
+         cache_file = os.path.join(CACHE_DIR, f"{cache_key}.pkl")
+         with open(cache_file, 'wb') as f:
+             pickle.dump(results, f)
+         return True
+     except Exception as e:
+         print(f"Cache write error: {e}")
+         return False
+
+ def get_cached_results(df, column):
+     """Try to get results from disk cache"""
+     try:
+         cache_key = get_cache_key(df, column)
+         cache_file = os.path.join(CACHE_DIR, f"{cache_key}.pkl")
+         if os.path.exists(cache_file):
+             # Check if cache file is recent (less than 1 day old)
+             mod_time = os.path.getmtime(cache_file)
+             if time.time() - mod_time < 86400:  # 24 hours in seconds
+                 with open(cache_file, 'rb') as f:
+                     return pickle.load(f)
+         return None
+     except Exception as e:
+         print(f"Cache read error: {e}")
+         return None
+
+ # Worker thread for background processing
+ class ExplainerThread(QThread):
+     # Signals for progress updates and results
+     progress = pyqtSignal(int, str)
+     result = pyqtSignal(object)
+     error = pyqtSignal(str)
+
+     def __init__(self, df, column):
+         super().__init__()
+         # Make a copy of the dataframe to avoid reference issues
+         self.df = df.copy()
+         self.column = column
+         self._is_canceled = False
+
+     def cancel(self):
+         """Mark the thread as canceled"""
+         self._is_canceled = True
+
+     def calculate_correlation(self, x, y):
+         """Calculate correlation between two variables, handling different data types.
+         Returns absolute correlation value between 0 and 1."""
+         try:
+             # Handle missing values
+             mask = ~(pd.isna(x) | pd.isna(y))
+             x_clean = x[mask]
+             y_clean = y[mask]
+
+             # If too few data points, return default
+             if len(x_clean) < 5:
+                 return 0.0
+
+             # Check data types
+             x_is_numeric = pd.api.types.is_numeric_dtype(x_clean)
+             y_is_numeric = pd.api.types.is_numeric_dtype(y_clean)
+
+             # Case 1: Both numeric - use Pearson correlation
+             if x_is_numeric and y_is_numeric:
+                 corr, _ = pearsonr(x_clean, y_clean)
+                 return abs(corr)
+
+             # Case 2: Categorical vs Categorical - use Cramer's V
+             elif not x_is_numeric and not y_is_numeric:
+                 # Convert to categorical codes
+                 x_cat = pd.Categorical(x_clean).codes
+                 y_cat = pd.Categorical(y_clean).codes
+
+                 # Create contingency table
+                 contingency = pd.crosstab(x_cat, y_cat)
+
+                 # Calculate Cramer's V
+                 chi2, _, _, _ = chi2_contingency(contingency)
+                 n = contingency.sum().sum()
+                 phi2 = chi2 / n
+
+                 # Get dimensions
+                 r, k = contingency.shape
+
+                 # Calculate Cramer's V with correction for dimensions
+                 cramers_v = np.sqrt(phi2 / min(k-1, r-1)) if min(k-1, r-1) > 0 else 0.0
+                 return min(cramers_v, 1.0)  # Cap at 1.0
+
+             # Case 3: Mixed types - convert to ranks or categories
+             else:
+                 if x_is_numeric and not y_is_numeric:
+                     # Convert categorical y to codes
+                     y_encoded = pd.Categorical(y_clean).codes
+
+                     # Calculate correlation between x and encoded y
+                     # Using point-biserial correlation (special case of Pearson)
+                     corr, _ = pearsonr(x_clean, y_encoded)
+                     return abs(corr)
+                 else:  # y is numeric, x is categorical
+                     # Convert categorical x to codes
+                     x_encoded = pd.Categorical(x_clean).codes
+
+                     # Calculate correlation
+                     corr, _ = pearsonr(x_encoded, y_clean)
+                     return abs(corr)
+
+         except Exception as e:
+             print(f"Error calculating correlation: {e}")
+             return 0.0  # Return zero if correlation calculation fails
+
+     def run(self):
+         try:
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Check disk cache first
+             self.progress.emit(0, "Checking for cached results...")
+             cached_results = get_cached_results(self.df, self.column)
+             if cached_results is not None:
+                 # Check if canceled
+                 if self._is_canceled:
+                     return
+
+                 self.progress.emit(95, "Found cached results, loading...")
+                 time.sleep(0.5)  # Brief pause to show the user we found a cache
+
+                 # Check if canceled
+                 if self._is_canceled:
+                     return
+
+                 self.progress.emit(100, "Loaded from cache")
+                 self.result.emit(cached_results)
+                 return
+
+             # Clean up memory before intensive computation
+             gc.collect()
+
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Early check for empty dataframe or no columns
+             if self.df.empty or len(self.df.columns) == 0:
+                 raise ValueError("The dataframe is empty or has no columns for analysis")
+
+             # No cache found, proceed with computation
+             self.progress.emit(5, "Computing new analysis...")
+
+             # Validate that the target column exists in the dataframe
+             if self.column not in self.df.columns:
+                 raise ValueError(f"Target column '{self.column}' not found in the dataframe")
+
+             # Create a copy to avoid modifying the original dataframe
+             df = self.df.copy()
+
+             # Verify we have data to work with
+             if len(df) == 0:
+                 raise ValueError("No data available for analysis (empty dataframe)")
+
+             # Sample up to 500 rows for better statistical significance while maintaining speed
+             if len(df) > 500:
+                 sample_size = 500  # Increased sample size for better analysis
+                 self.progress.emit(10, f"Sampling dataset (using {sample_size} rows from {len(df)} total)...")
+                 df = df.sample(n=sample_size, random_state=42)
+                 # Force garbage collection after sampling
+                 gc.collect()
+
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Drop columns with too many unique values (likely IDs) or excessive NaNs
+             self.progress.emit(15, "Analyzing columns for preprocessing...")
+             cols_to_drop = []
+             for col in df.columns:
+                 if col == self.column:  # Don't drop target column
+                     continue
+                 try:
+                     # Only drop columns with extremely high uniqueness (99% instead of 95%)
+                     # This ensures we keep more features for analysis
+                     if df[col].nunique() / len(df) > 0.99 and len(df) > 100:
+                         cols_to_drop.append(col)
+                     # Only drop columns with very high missing values (80% instead of 50%)
+                     elif df[col].isna().mean() > 0.8:
+                         cols_to_drop.append(col)
+                 except:
+                     # If we can't analyze the column, drop it
+                     cols_to_drop.append(col)
+
+             # Drop identified columns, but ensure we keep at least some features
+             remaining_cols = [col for col in df.columns if col != self.column and col not in cols_to_drop]
+
+             # If dropping would leave us with no features, keep at least 3 columns (or all if less than 3)
+             if len(remaining_cols) == 0 and len(cols_to_drop) > 0:
+                 # Sort dropped columns by uniqueness (keep those with lower uniqueness)
+                 col_uniqueness = {}
+                 for col in cols_to_drop:
+                     try:
+                         col_uniqueness[col] = df[col].nunique() / len(df)
+                     except:
+                         col_uniqueness[col] = 1.0  # Assume high uniqueness for problematic columns
+
+                 # Sort by uniqueness and keep the least unique columns
+                 cols_to_keep = sorted(col_uniqueness.items(), key=lambda x: x[1])[:min(3, len(cols_to_drop))]
+                 cols_to_drop = [col for col in cols_to_drop if col not in [c[0] for c in cols_to_keep]]
+                 print(f"Keeping {len(cols_to_keep)} columns to ensure analysis can proceed")
+
+             if cols_to_drop:
+                 self.progress.emit(20, f"Removing {len(cols_to_drop)} low-information columns...")
+                 df = df.drop(columns=cols_to_drop)
+
+             # Ensure target column is still in the dataframe
+             if self.column not in df.columns:
+                 raise ValueError(f"Target column '{self.column}' not found in dataframe after preprocessing")
+
+             # Calculate correlation coefficients first
+             self.progress.emit(25, "Calculating correlation measures...")
+             correlations = {}
+
+             # Get all feature columns (excluding target)
+             feature_cols = [col for col in df.columns if col != self.column]
+
+             # Calculate correlation for each feature
+             for col in feature_cols:
+                 try:
+                     # Calculate correlation between each feature and target
+                     cor_val = self.calculate_correlation(df[col], df[self.column])
+                     correlations[col] = cor_val
+                 except Exception as e:
+                     print(f"Error calculating correlation for {col}: {e}")
+                     correlations[col] = 0.0
+
+             # Separate features and target
+             self.progress.emit(30, "Preparing features and target...")
+             X = df.drop(columns=[self.column])
+             y = df[self.column]
+
+             # Handle high-cardinality categorical features
+             self.progress.emit(35, "Encoding categorical features...")
+             # Use a simpler approach - just one-hot encode columns with few unique values
+             # and encode (don't drop) high-cardinality columns for speed
+             categorical_cols = X.select_dtypes(include='object').columns
+             high_cardinality_threshold = 20  # Higher threshold to keep more columns
+
+             # Keep track of how many columns we've processed
+             columns_processed = 0
+             columns_kept = 0
+
+             for col in categorical_cols:
+                 columns_processed += 1
+                 unique_count = X[col].nunique()
+                 # Always keep the column, but use different encoding strategies based on cardinality
+                 if unique_count <= high_cardinality_threshold:
+                     # Simple label encoding for low-cardinality features
+                     X[col] = X[col].fillna('_MISSING_').astype('category').cat.codes
+                     columns_kept += 1
+                 else:
+                     # For high-cardinality features, still encode them but with a simpler approach
+                     # Use label encoding instead of dropping
+                     X[col] = X[col].fillna('_MISSING_').astype('category').cat.codes
+                     columns_kept += 1
+
+             # Log how many columns were kept
+             if columns_processed > 0:
+                 self.progress.emit(40, f"Encoded {columns_kept} categorical columns out of {columns_processed}")
+
+             # Handle target column in a simpler, faster way
+             if y.dtype == 'object':
+                 # For categorical targets, use simple category codes
+                 y = y.fillna('_MISSING_').astype('category').cat.codes
+             else:
+                 # For numeric targets, just fill NaNs with mean
+                 y = y.fillna(y.mean() if pd.api.types.is_numeric_dtype(y) else y.mode()[0])
+
+             # Train/test split
+             self.progress.emit(45, "Splitting data into train/test sets...")
+
+             # Make sure we still have features to work with
+             if X.shape[1] == 0:
+                 raise ValueError("No features remain after preprocessing. Try selecting a different target column.")
+
+             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Train a tree-based model
+             self.progress.emit(50, "Training XGBoost model...")
+
+             # Check the number of features left for analysis
+             feature_count = X_train.shape[1]
+
+             # Adjust model complexity based on feature count
+             if feature_count < 3:
+                 max_depth = 1  # Very simple trees for few features
+                 n_estimators = 10  # Use more trees to compensate
+             else:
+                 max_depth = 2  # Still shallow trees
+                 n_estimators = 5  # Fewer trees for more features
+
+             model = xgb.XGBRegressor(
+                 n_estimators=n_estimators,
+                 max_depth=max_depth,
+                 learning_rate=0.3,  # Higher learning rate to compensate for fewer trees
+                 tree_method='hist',  # Fast histogram method
+                 subsample=0.7,  # Use 70% of data per tree
+                 grow_policy='depthwise',  # Simple growth policy
+                 n_jobs=1,  # Single thread to avoid overhead
+                 random_state=42,
+                 verbosity=0  # Suppress output
+             )
+
+             # Set memory conservation parameter for large datasets with many features
+             if X_train.shape[1] > 100:  # If there are many features
+                 self.progress.emit(55, "Large feature set detected, using memory-efficient training...")
+                 model.set_params(grow_policy='lossguide', max_leaves=64)
+
+             # Fit model with a try/except to catch memory issues
+             try:
+                 model.fit(X_train, y_train)
+             except Exception as e:
+                 # Log the error for debugging
+                 print(f"Initial XGBoost fit failed: {str(e)}")
+
+                 # If we encounter an error, try with an even smaller and simpler model
+                 self.progress.emit(55, "Adjusting model parameters due to computational constraints...")
+                 try:
+                     # Try a simpler regressor with more conservative parameters
+                     model = xgb.XGBRegressor(
+                         n_estimators=3,
+                         max_depth=1,
+                         subsample=0.5,
+                         colsample_bytree=0.5,
+                         n_jobs=1,
+                         verbosity=0
+                     )
+                     model.fit(X_train, y_train)
+                 except Exception as inner_e:
+                     # If even the simpler model fails, resort to a fallback strategy
+                     print(f"Even simpler XGBoost failed: {str(inner_e)}")
+                     self.progress.emit(60, "Using fallback importance calculation method...")
+
+                     # Create a basic feature importance based on correlation with target
+                     # This is a simple fallback when model training fails
+                     importance = []
+                     for col in X.columns:
+                         try:
+                             # Use pre-calculated correlations for fallback importance
+                             corr_value = correlations.get(col, 0.5)
+                             # Scale correlation to make a reasonable importance value
+                             # Higher correlation = higher importance
+                             importance.append(0.5 + corr_value/2 if not pd.isna(corr_value) else 0.5)
+                         except:
+                             # If correlation fails, use default
+                             importance.append(0.5)
+
+                     # Normalize to sum to 1
+                     importance = np.array(importance)
+                     if sum(importance) > 0:
+                         importance = importance / sum(importance)
+                     else:
+                         # Equal importance if everything fails
+                         importance = np.ones(len(X.columns)) / len(X.columns)
+
+                     # Skip the model-based code path since we calculated importances manually
+                     self.progress.emit(80, "Creating importance results...")
+                     feature_importance = pd.DataFrame({
+                         'feature': X.columns,
+                         'importance_value': importance,
+                         'correlation': [correlations.get(col, 0.0) for col in X.columns]
+                     }).sort_values(by='importance_value', ascending=False)
+
+                     # Cache the results for future use
+                     self.progress.emit(95, "Caching results for future use...")
+                     cache_results(self.df, self.column, feature_importance)
+
+                     # Clean up after computation
+                     del df, X, y, X_train, X_test, y_train, y_test
+                     gc.collect()
+
+                     # Check if canceled
+                     if self._is_canceled:
+                         return
+
+                     # Emit the result
+                     self.progress.emit(100, "Analysis complete (fallback method)")
+                     self.result.emit(feature_importance)
+                     return
+
+             # Check if canceled
+             if self._is_canceled:
+                 return
+
+             # Get feature importance directly from XGBoost
+             self.progress.emit(80, "Calculating feature importance and correlations...")
+
+             try:
+                 # Check if we have features to analyze
+                 if X.shape[1] == 0:
+                     raise ValueError("No features available for importance analysis")
+
+                 # Get feature importance directly from XGBoost
+                 importance = model.feature_importances_
+
+                 # Verify importance values are valid
+                 if np.isnan(importance).any() or np.isinf(importance).any():
+                     # Handle NaN or Inf values
+                     print("Warning: Invalid importance values detected, using fallback method")
+                     # Replace with equal importance
+                     importance = np.ones(len(X.columns)) / len(X.columns)
+
+                 # Create and sort the importance dataframe with correlations
+                 feature_importance = pd.DataFrame({
+                     'feature': X.columns,
+                     'importance_value': importance,
+                     'correlation': [correlations.get(col, 0.0) for col in X.columns]
+                 }).sort_values(by='importance_value', ascending=False)
+
+                 # Cache the results for future use
+                 self.progress.emit(95, "Caching results for future use...")
+                 cache_results(self.df, self.column, feature_importance)
+
+                 # Clean up after computation
+                 del df, X, y, X_train, X_test, y_train, y_test, model
+                 gc.collect()
+
+                 # Check if canceled
+                 if self._is_canceled:
+                     return
+
+                 # Emit the result
+                 self.progress.emit(100, "Analysis complete")
+                 self.result.emit(feature_importance)
+                 return
+
+             except Exception as e:
+                 print(f"Error in feature importance calculation: {e}")
+                 import traceback
+                 traceback.print_exc()
+
+                 # Create fallback importance values when model-based approach fails
+                 self.progress.emit(85, "Using alternative importance calculation method...")
+
+                 try:
+                     # Try correlation-based approach first
+                     importance = []
+                     has_valid_correlations = False
+
+                     for col in X.columns:
+                         try:
+                             # Use pre-calculated correlations
+                             corr = correlations.get(col, 0.1)
+                             if not pd.isna(corr):
+                                 importance.append(corr)
+                                 has_valid_correlations = True
+                             else:
+                                 importance.append(0.1)  # Default for failed correlation
+                         except:
+                             # Default value for any error
+                             importance.append(0.1)
+
+                     # Normalize importance values
+                     importance = np.array(importance)
+                     if has_valid_correlations and sum(importance) > 0:
+                         # If we have valid correlations, use them normalized
+                         importance = importance / max(sum(importance), 0.001)
+                     else:
+                         # Otherwise use frequency-based heuristic
+                         print("Using frequency-based feature importance as fallback")
+                         # Count unique values as a proxy for importance
+                         importance = []
+                         total_rows = len(X)
+
+                         for col in X.columns:
+                             try:
+                                 # More unique values could indicate more information content
+                                 # But we invert the ratio so columns with fewer unique values
+                                 # (more predictive) get higher importance
+                                 uniqueness = X[col].nunique() / total_rows
+                                 # Invert and scale between 0.1 and 1.0
+                                 val = 1.0 - (0.9 * uniqueness)
+                                 importance.append(max(0.1, min(1.0, val)))
+                             except:
+                                 importance.append(0.1)  # Default value
+
+                         # Normalize
+                         importance = np.array(importance)
+                         importance = importance / max(sum(importance), 0.001)
+
+                 except Exception as fallback_error:
+                     # Last resort: create equal importance for all features
+                     print(f"Fallback error: {fallback_error}, using equal importance")
+                     importance_values = np.ones(len(X.columns)) / max(len(X.columns), 1)
+                     importance = importance_values
+
+                 # Create dataframe with results, including correlations
+                 feature_importance = pd.DataFrame({
+                     'feature': X.columns,
+                     'importance_value': importance,
+                     'correlation': [correlations.get(col, 0.0) for col in X.columns]
+                 }).sort_values(by='importance_value', ascending=False)
+
+                 # Cache the results
+                 try:
+                     cache_results(self.df, self.column, feature_importance)
+                 except:
+                     pass  # Ignore cache errors
+
+                 # Clean up
+                 try:
+                     del df, X, y, X_train, X_test, y_train, y_test
+                     gc.collect()
+                 except:
+                     pass
+
+                 # Emit the result
+                 self.progress.emit(100, "Analysis complete (with fallback methods)")
+                 self.result.emit(feature_importance)
+                 return
+
+         except IndexError as e:
+             # Handle index errors with more detail
+             import traceback
+             import inspect
+             trace = traceback.format_exc()
+
+             # Get more detailed information
+             frame = inspect.trace()[-1]
+             frame_info = inspect.getframeinfo(frame[0])
+             filename = frame_info.filename
+             lineno = frame_info.lineno
+             function = frame_info.function
+             code_context = frame_info.code_context[0].strip() if frame_info.code_context else "Unknown code context"
+
+             # Format a more detailed error message
+             detail_msg = f"IndexError: {str(e)}\nLocation: {filename}:{lineno} in function '{function}'\nCode: {code_context}\n\n{trace}"
+             print(detail_msg)  # Print to console for debugging
+
+             if not self._is_canceled:
+                 self.error.emit(f"Index error at line {lineno} in {function}:\n{str(e)}\nCode: {code_context}")
+
+         except Exception as e:
+             if not self._is_canceled:  # Only emit error if not canceled
+                 import traceback
+                 trace = traceback.format_exc()
+                 print(f"Error in ExplainerThread: {str(e)}")
+                 print(trace)  # Print full stack trace to help debug
+                 self.error.emit(f"{str(e)}\n\nTrace: {trace}")
+
+     def analyze_column(self):
+         if self.df is None or self.column_selector.currentText() == "":
+             return
+
+         # Cancel any existing worker thread
+         if self.worker_thread and self.worker_thread.isRunning():
+             # Signal the thread to cancel
+             self.worker_thread.cancel()
+
+             try:
+                 # Disconnect all signals to avoid callbacks during termination
+                 self.worker_thread.progress.disconnect()
+                 self.worker_thread.result.disconnect()
+                 self.worker_thread.error.disconnect()
+                 self.worker_thread.finished.disconnect()
+             except Exception:
+                 pass  # Already disconnected
+
+             # Terminate thread properly
+             self.worker_thread.terminate()
+             self.worker_thread.wait(1000)  # Wait up to 1 second
+             self.worker_thread = None  # Clear reference
+
+         target_column = self.column_selector.currentText()
+
+         # Check in-memory cache first (fastest)
+         if target_column in self.result_cache:
+             self.handle_results(self.result_cache[target_column])
+             return
+
+         # Check global application-wide cache second (still fast)
+         global_key = get_cache_key(self.df, target_column)
+         if global_key in ColumnProfilerApp.global_cache:
+             self.result_cache[target_column] = ColumnProfilerApp.global_cache[global_key]
+             self.handle_results(self.result_cache[target_column])
+             return
+
+         # Disk cache will be checked in the worker thread
+
+         # Disable the analyze button while processing
+         self.analyze_button.setEnabled(False)
+
+         # Show progress indicators
+         self.progress_bar.setValue(0)
+         self.progress_bar.show()
+         self.progress_label.setText("Starting analysis...")
+         self.progress_label.show()
+         self.cancel_button.show()
+
+         # Create and start the worker thread
+         self.worker_thread = ExplainerThread(self.df, target_column)
+         self.worker_thread.progress.connect(self.update_progress)
+         self.worker_thread.result.connect(self.cache_and_display_results)
+         self.worker_thread.error.connect(self.handle_error)
+         self.worker_thread.finished.connect(self.on_analysis_finished)
+         self.worker_thread.start()
+
+     def update_progress(self, value, message):
+         self.progress_bar.setValue(value)
+         self.progress_label.setText(message)
+
+     def cache_and_display_results(self, importance_df):
+         # Cache the results
+         target_column = self.column_selector.currentText()
+         self.result_cache[target_column] = importance_df
+
+         # Also cache in the global application cache
+         global_key = get_cache_key(self.df, target_column)
+         ColumnProfilerApp.global_cache[global_key] = importance_df
+
+         # Display the results
+         self.handle_results(importance_df)
+
+     def on_analysis_finished(self):
+         """Handle cleanup when analysis is finished (either completed or cancelled)"""
+         self.analyze_button.setEnabled(True)
+         self.cancel_button.hide()
+
+     def handle_results(self, importance_df):
+         # Hide progress indicators
+         self.progress_bar.hide()
+         self.progress_label.hide()
+         self.cancel_button.hide()
+
+         # Update importance table to include correlation column
+         self.importance_table.setColumnCount(3)
+         self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
+         self.importance_table.setRowCount(len(importance_df))
+
+         # Using a timer for incremental updates
+         self.importance_df = importance_df  # Store for incremental rendering
+         self.current_row = 0
+         self.render_timer = QTimer()
+         self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
+         self.render_timer.start(10)  # Update every 10ms
+
+     def render_next_batch(self, batch_size):
+         try:
+             if self.current_row >= len(self.importance_df):
+                 # All rows rendered, now render the chart and stop the timer
+                 self.render_chart()
+                 self.render_timer.stop()
+                 return
+
+             # Render a batch of rows
+             end_row = min(self.current_row + batch_size, len(self.importance_df))
+             for row in range(self.current_row, end_row):
+                 try:
+                     # Check if row exists in dataframe to prevent index errors
+                     if row < len(self.importance_df):
+                         feature = self.importance_df.iloc[row]['feature']
+                         importance_value = self.importance_df.iloc[row]['importance_value']
+
+                         # Add correlation if available
+                         correlation = self.importance_df.iloc[row].get('correlation', None)
+                         if correlation is not None:
+                             self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
+                             self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
+                             self.importance_table.setItem(row, 2, QTableWidgetItem(str(round(correlation, 4))))
+                         else:
+                             self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
+                             self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
+                     else:
+                         # Handle out of range index
+                         print(f"Warning: Row {row} is out of range (max: {len(self.importance_df)-1})")
+                         self.importance_table.setItem(row, 0, QTableWidgetItem("Error"))
+                         self.importance_table.setItem(row, 1, QTableWidgetItem("Out of range"))
+                         self.importance_table.setItem(row, 2, QTableWidgetItem("N/A"))
+                 except (IndexError, KeyError) as e:
+                     # Enhanced error reporting for index and key errors
+                     import traceback
+                     trace = traceback.format_exc()
+                     error_msg = f"Error rendering row {row}: {e.__class__.__name__}: {e}\n{trace}"
+                     print(error_msg)
+
+                     # Handle missing data in the dataframe gracefully
+                     self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
+                     self.importance_table.setItem(row, 1, QTableWidgetItem(f"{str(e)[:20]}"))
+                     self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
+                 except Exception as e:
+                     # Catch any other exceptions
+                     print(f"Unexpected error rendering row {row}: {e.__class__.__name__}: {e}")
+                     self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
+                     self.importance_table.setItem(row, 1, QTableWidgetItem("See console for details"))
+                     self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
+
+             self.current_row = end_row
+             QApplication.processEvents()  # Allow UI to update
+         except Exception as e:
+             # Catch any exceptions in the rendering loop itself
+             import traceback
+             trace = traceback.format_exc()
+             error_msg = f"Error in render_next_batch: {e.__class__.__name__}: {e}\n{trace}"
+             print(error_msg)
+
+             # Try to stop the timer to prevent further errors
+             try:
+                 if self.render_timer and self.render_timer.isActive():
+                     self.render_timer.stop()
+             except:
+                 pass
+
+             # Show error
+             QMessageBox.critical(self, "Rendering Error",
+                                  f"Error rendering results: {e.__class__.__name__}: {e}")
+
+     def render_chart(self):
+         # Create horizontal bar chart
+         try:
+             if self.importance_df is None or len(self.importance_df) == 0:
+                 # No data to render
+                 self.chart_view.axes.clear()
+                 self.chart_view.axes.text(0.5, 0.5, "No data available for chart",
+                                           ha='center', va='center', fontsize=12, color='gray')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+                 return
+
+             self.chart_view.axes.clear()
+
+             # Get a sorted copy based on current sort key
+             plot_df = self.importance_df.sort_values(by=self.current_sort, ascending=False).head(20).copy()
+
+             # Verify we have data before proceeding
+             if len(plot_df) == 0:
+                 self.chart_view.axes.text(0.5, 0.5, "No features found with importance values",
+                                           ha='center', va='center', fontsize=12, color='gray')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+                 return
+
+             # Check required columns exist
+             required_columns = ['feature', 'importance_value']
+             missing_columns = [col for col in required_columns if col not in plot_df.columns]
+             if missing_columns:
+                 error_msg = f"Missing required columns: {', '.join(missing_columns)}"
+                 self.chart_view.axes.text(0.5, 0.5, error_msg,
+                                           ha='center', va='center', fontsize=12, color='red')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+                 print(f"Chart rendering error: {error_msg}")
+                 return
+
+             # Truncate long feature names for better display
+             max_feature_length = 30
+             plot_df['display_feature'] = plot_df['feature'].apply(
+                 lambda x: (str(x)[:max_feature_length] + '...') if len(str(x)) > max_feature_length else str(x)
+             )
+
+             # Reverse order for better display (highest at top)
+             plot_df = plot_df.iloc[::-1].reset_index(drop=True)
+
+             # Create a figure with two subplots side by side
+             self.chart_view.figure.clear()
+             gs = self.chart_view.figure.add_gridspec(1, 2, width_ratios=[3, 2])
+
+             # First subplot for importance
+             ax1 = self.chart_view.figure.add_subplot(gs[0, 0])
+
+             # Create a colormap for better visualization
+             cmap = plt.cm.Blues
+             colors = cmap(np.linspace(0.4, 0.8, len(plot_df)))
+
+             # Plot with custom colors
+             bars = ax1.barh(
+                 plot_df['display_feature'],
+                 plot_df['importance_value'],
+                 color=colors,
+                 height=0.7,  # Thinner bars for more spacing
+                 alpha=0.8
+             )
+
+             # Add values at the end of bars
+             for bar in bars:
+                 width = bar.get_width()
+                 ax1.text(
+                     width * 1.05,
+                     bar.get_y() + bar.get_height()/2,
+                     f'{width:.2f}',
+                     va='center',
+                     fontsize=9,
+                     fontweight='bold'
+                 )
+
+             # Add grid for better readability
+             ax1.grid(True, axis='x', linestyle='--', alpha=0.3)
+
+             # Remove unnecessary spines
+             for spine in ['top', 'right']:
+                 ax1.spines[spine].set_visible(False)
+
+             # Make labels more readable
+             ax1.tick_params(axis='y', labelsize=9)
+
+             # Set title and labels
+             ax1.set_title(f'Feature Importance for {self.column_selector.currentText()}')
+             ax1.set_xlabel('Importance Value')
+
+             # Add a note about the sorting order
+             sort_label = "Sorted by: " + ("Importance" if self.current_sort == 'importance_value' else "Correlation")
+
+             # Second subplot for correlation if available
+             if 'correlation' in plot_df.columns:
+                 ax2 = self.chart_view.figure.add_subplot(gs[0, 1], sharey=ax1)
+
+                 # Create a colormap for correlation - use a different color
+                 cmap_corr = plt.cm.Reds
+                 colors_corr = cmap_corr(np.linspace(0.4, 0.8, len(plot_df)))
+
+                 # Plot correlation bars
+                 corr_bars = ax2.barh(
+                     plot_df['display_feature'],
+                     plot_df['correlation'],
+                     color=colors_corr,
+                     height=0.7,
+                     alpha=0.8
+                 )
+
+                 # Add values at the end of correlation bars
+                 for bar in corr_bars:
+                     width = bar.get_width()
+                     ax2.text(
+                         width * 1.05,
+                         bar.get_y() + bar.get_height()/2,
+                         f'{width:.2f}',
+                         va='center',
+                         fontsize=9,
+                         fontweight='bold'
+                     )
+
+                 # Add grid and styling
+                 ax2.grid(True, axis='x', linestyle='--', alpha=0.3)
+                 ax2.set_title('Absolute Correlation')
+                 ax2.set_xlabel('Correlation Value')
+
+                 # Hide y-axis labels since they're shared with the first plot
+                 ax2.set_yticklabels([])
+
+                 # Remove unnecessary spines
+                 for spine in ['top', 'right']:
+                     ax2.spines[spine].set_visible(False)
+
+             # Add a note about the current sort order
+             self.chart_view.figure.text(0.5, 0.01, sort_label, ha='center', fontsize=9, style='italic')
+
+             # Adjust figure size based on number of features
+             feature_count = len(plot_df)
+             self.chart_view.figure.set_figheight(max(5, min(4 + feature_count * 0.3, 12)))
+
+             # Adjust layout and draw
+             self.chart_view.figure.tight_layout(rect=[0, 0.03, 1, 0.97])  # Make room for sort label
+             self.chart_view.draw()
+
+         except IndexError as e:
+             # Special handling for index errors with detailed information
+             import traceback
+             import inspect
+
+             # Get stack trace information
+             trace = traceback.format_exc()
+
+             # Try to get line and context information
+             try:
+                 frame = inspect.trace()[-1]
+                 frame_info = inspect.getframeinfo(frame[0])
+                 filename = frame_info.filename
+                 lineno = frame_info.lineno
+                 function = frame_info.function
+                 code_context = frame_info.code_context[0].strip() if frame_info.code_context else "Unknown code context"
+
+                 # Detailed error message
+                 detail_msg = f"IndexError at line {lineno} in {function}: {str(e)}\nCode: {code_context}"
+                 print(f"Chart rendering error: {detail_msg}\n{trace}")
+
+                 # Display error in chart
+                 self.chart_view.axes.clear()
+                 self.chart_view.axes.text(0.5, 0.5,
+                                           f"Index Error in chart rendering:\n{str(e)}\nAt line {lineno}: {code_context}",
+                                           ha='center', va='center', fontsize=12, color='red',
+                                           wrap=True)
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+             except Exception as inner_e:
+                 # Fallback if the detailed error reporting fails
+                 print(f"Error getting detailed error info: {inner_e}")
+                 print(f"Original error: {e}\n{trace}")
+
+                 self.chart_view.axes.clear()
+                 self.chart_view.axes.text(0.5, 0.5, f"Index Error: {str(e)}",
+                                           ha='center', va='center', fontsize=12, color='red')
+                 self.chart_view.axes.set_axis_off()
+                 self.chart_view.draw()
+         except Exception as e:
+             # Recover gracefully from any chart rendering errors with detailed information
+             import traceback
+             trace = traceback.format_exc()
+             error_msg = f"Error rendering chart: {e.__class__.__name__}: {str(e)}"
+             print(f"{error_msg}\n{trace}")
+
+             self.chart_view.axes.clear()
+             self.chart_view.axes.text(0.5, 0.5, error_msg,
+                                       ha='center', va='center', fontsize=12, color='red',
+                                       wrap=True)
+             self.chart_view.axes.set_axis_off()
+             self.chart_view.draw()
+
+     def handle_error(self, error_message):
+         """Handle errors during analysis"""
+         # Hide progress indicators
+         self.progress_bar.hide()
+         self.progress_label.hide()
+         self.cancel_button.hide()
+
+         # Re-enable analyze button
+         self.analyze_button.setEnabled(True)
+
+         # Print error to console for debugging
+         print(f"Error in column profiler: {error_message}")
+
+         # Show error message with more details
+         msg_box = QMessageBox(self)
+         msg_box.setIcon(QMessageBox.Icon.Critical)
+         msg_box.setWindowTitle("Error")
+         msg_box.setText("An error occurred during analysis")
+         msg_box.setDetailedText(error_message)
+         msg_box.setStandardButtons(QMessageBox.StandardButton.Ok)
+         msg_box.exec()
+
+         # Show a message in the UI as well
+         # Extract the first line outside the f-string: a backslash inside an
+         # f-string expression is a SyntaxError on Python versions before 3.12
+         first_line = error_message.split('\n')[0]
+         self.importance_table.setRowCount(1)
+         self.importance_table.setColumnCount(3)
+         self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
+         self.importance_table.setItem(0, 0, QTableWidgetItem(f"Error: {first_line}"))
+         self.importance_table.setItem(0, 1, QTableWidgetItem(""))
+         self.importance_table.setItem(0, 2, QTableWidgetItem(""))
+         self.importance_table.resizeColumnsToContents()
+
+         # Update the chart to show error
+         self.chart_view.axes.clear()
+         self.chart_view.axes.text(0.5, 0.5, f"Error calculating importance:\n{first_line}",
+                                   ha='center', va='center', fontsize=12, color='red',
+                                   wrap=True)
+         self.chart_view.axes.set_axis_off()
+         self.chart_view.draw()
+
1008
+ def closeEvent(self, event):
1009
+ """Clean up when the window is closed"""
1010
+ # Stop any running timer
1011
+ if self.render_timer and self.render_timer.isActive():
1012
+ self.render_timer.stop()
1013
+
1014
+ # Clean up any background threads
1015
+ if self.worker_thread and self.worker_thread.isRunning():
1016
+ # Disconnect all signals to avoid callbacks during termination
1017
+ try:
1018
+ self.worker_thread.progress.disconnect()
1019
+ self.worker_thread.result.disconnect()
1020
+ self.worker_thread.error.disconnect()
1021
+ self.worker_thread.finished.disconnect()
1022
+ except Exception:
1023
+ pass # Already disconnected
1024
+
1025
+ # Terminate thread properly
1026
+ self.worker_thread.terminate()
1027
+ self.worker_thread.wait(1000) # Wait up to 1 second
1028
+
1029
+ # Clear references to prevent thread issues
1030
+ self.worker_thread = None
1031
+
1032
+ # Clean up memory
1033
+ self.result_cache.clear()
1034
+
1035
+ # Accept the close event
1036
+ event.accept()
1037
+
1038
+ # Suggest garbage collection
1039
+ gc.collect()
1040
+
1041
+ def cancel_analysis(self):
1042
+ """Cancel the current analysis"""
1043
+ if self.worker_thread and self.worker_thread.isRunning():
1044
+ # Signal the thread to cancel first
1045
+ self.worker_thread.cancel()
1046
+
1047
+ # Disconnect all signals to avoid callbacks during termination
1048
+ try:
1049
+ self.worker_thread.progress.disconnect()
1050
+ self.worker_thread.result.disconnect()
1051
+ self.worker_thread.error.disconnect()
1052
+ self.worker_thread.finished.disconnect()
1053
+ except Exception:
1054
+ pass # Already disconnected
1055
+
1056
+ # Terminate thread properly
1057
+ self.worker_thread.terminate()
1058
+ self.worker_thread.wait(1000) # Wait up to 1 second
1059
+
1060
+ # Clear reference
1061
+ self.worker_thread = None
1062
+
1063
+ # Update UI
1064
+ self.progress_bar.hide()
1065
+ self.progress_label.setText("Analysis cancelled")
1066
+ self.progress_label.show()
1067
+ self.cancel_button.hide()
1068
+ self.analyze_button.setEnabled(True)
1069
+
1070
+ # Hide the progress label after 2 seconds
1071
+ QTimer.singleShot(2000, self.progress_label.hide)
1072
+
1073
+ def show_relationship_visualization(self, row, column):
1074
+ """Show visualization of relationship between selected feature and target column"""
1075
+ if self.importance_df is None or row < 0 or row >= len(self.importance_df):
1076
+ return
1077
+
1078
+ # Get the feature name and target column
1079
+ try:
1080
+ feature = self.importance_df.iloc[row]['feature']
1081
+ target = self.column_selector.currentText()
1082
+
1083
+ # Verify both columns exist in the dataframe
1084
+ if feature not in self.df.columns:
1085
+ QMessageBox.warning(self, "Column Not Found",
1086
+ f"Feature column '{feature}' not found in the dataframe")
1087
+ return
1088
+
1089
+ if target not in self.df.columns:
1090
+ QMessageBox.warning(self, "Column Not Found",
1091
+ f"Target column '{target}' not found in the dataframe")
1092
+ return
1093
+ except Exception as e:
1094
+ QMessageBox.critical(self, "Error", f"Error getting column data: {str(e)}")
1095
+ return
1096
+
1097
+ # Create a dialog to show the visualization
1098
+ dialog = QDialog(self)
1099
+ dialog.setWindowTitle(f"Relationship: {feature} vs {target}")
1100
+ dialog.resize(900, 700)
1101
+
1102
+ # Create layout
1103
+ layout = QVBoxLayout(dialog)
1104
+
1105
+ # Create canvas for the plot
1106
+ canvas = MatplotlibCanvas(width=8, height=6, dpi=100)
1107
+ layout.addWidget(canvas)
1108
+
1109
+ # Determine the data types
1110
+ feature_is_numeric = pd.api.types.is_numeric_dtype(self.df[feature])
1111
+ target_is_numeric = pd.api.types.is_numeric_dtype(self.df[target])
1112
+
1113
+ # Get unique counts to determine if we have high cardinality
1114
+ feature_unique_count = self.df[feature].nunique()
1115
+ target_unique_count = self.df[target].nunique()
1116
+
1117
+ # Define high cardinality threshold
1118
+ high_cardinality_threshold = 10
1119
+
1120
+ # Clear the figure
1121
+ canvas.axes.clear()
1122
+
1123
+ # Create a working copy of the dataframe
1124
+ working_df = self.df.copy()
1125
+
1126
+ # Prepare data for high cardinality columns
1127
+ if not feature_is_numeric and feature_unique_count > high_cardinality_threshold:
1128
+ # Get the top N categories by frequency
1129
+ top_categories = self.df[feature].value_counts().nlargest(high_cardinality_threshold).index.tolist()
1130
+ # Create "Other" category for remaining values
1131
+ working_df[feature] = working_df[feature].apply(lambda x: x if x in top_categories else 'Other')
1132
+
1133
+ if not target_is_numeric and target_unique_count > high_cardinality_threshold:
1134
+ top_categories = self.df[target].value_counts().nlargest(high_cardinality_threshold).index.tolist()
1135
+ working_df[target] = working_df[target].apply(lambda x: x if x in top_categories else 'Other')
1136
+
1137
+ # Create appropriate visualization based on data types and cardinality
1138
+ if feature_is_numeric and target_is_numeric:
1139
+ # Scatter plot for numeric vs numeric
1140
+ # Use hexbin for large datasets to avoid overplotting
1141
+ if len(working_df) > 100:
1142
+ canvas.axes.hexbin(
1143
+ working_df[feature],
1144
+ working_df[target],
1145
+ gridsize=25,
1146
+ cmap='Blues',
1147
+ mincnt=1
1148
+ )
1149
+ canvas.axes.set_title(f"Hexbin Density Plot: {feature} vs {target}")
1150
+ canvas.axes.set_xlabel(feature)
1151
+ canvas.axes.set_ylabel(target)
1152
+ # Add a colorbar
1153
+ cbar = canvas.figure.colorbar(canvas.axes.collections[0], ax=canvas.axes)
1154
+ cbar.set_label('Count')
1155
+ else:
1156
+ # For smaller datasets, use a scatter plot with transparency
1157
+ sns.scatterplot(
1158
+ x=feature,
1159
+ y=target,
1160
+ data=working_df,
1161
+ ax=canvas.axes,
1162
+ alpha=0.6
1163
+ )
1164
+ # Add regression line
1165
+ sns.regplot(
1166
+ x=feature,
1167
+ y=target,
1168
+ data=working_df,
1169
+ ax=canvas.axes,
1170
+ scatter=False,
1171
+ line_kws={"color": "red"}
1172
+ )
1173
+ canvas.axes.set_title(f"Scatter Plot: {feature} vs {target}")
1174
+
1175
+ elif feature_is_numeric and not target_is_numeric:
1176
+ # Box plot for numeric vs categorical
1177
+ if target_unique_count <= high_cardinality_threshold * 2:
1178
+ # Standard boxplot for reasonable number of categories
1179
+ order = working_df[target].value_counts().nlargest(high_cardinality_threshold * 2).index
1180
+ sns.boxplot(
1181
+ x=target,
1182
+ y=feature,
1183
+ data=working_df,
1184
+ ax=canvas.axes,
1185
+ order=order
1186
+ )
1187
+ canvas.axes.set_title(f"Box Plot: {feature} by {target}")
1188
+ # Rotate x-axis labels for better readability
1189
+ canvas.axes.set_xticklabels(
1190
+ canvas.axes.get_xticklabels(),
1191
+ rotation=45,
1192
+ ha='right'
1193
+ )
1194
+ else:
1195
+ # For very high cardinality, use a violin plot with limited categories
1196
+ order = working_df[target].value_counts().nlargest(high_cardinality_threshold).index
1197
+ working_df_filtered = working_df[working_df[target].isin(order)]
1198
+ sns.violinplot(
1199
+ x=target,
1200
+ y=feature,
1201
+ data=working_df_filtered,
1202
+ ax=canvas.axes,
1203
+ inner='quartile',
1204
+ cut=0
1205
+ )
1206
+ canvas.axes.set_title(f"Violin Plot: {feature} by Top {len(order)} {target} Categories")
1207
+ canvas.axes.set_xticklabels(
1208
+ canvas.axes.get_xticklabels(),
1209
+ rotation=45,
1210
+ ha='right'
1211
+ )
1212
+
1213
+ elif not feature_is_numeric and target_is_numeric:
1214
+ # Bar plot for categorical vs numeric
1215
+ if feature_unique_count <= high_cardinality_threshold * 2:
1216
+ # Use standard barplot for reasonable number of categories
1217
+ order = working_df[feature].value_counts().nlargest(high_cardinality_threshold * 2).index
1218
+ sns.barplot(
1219
+ x=feature,
1220
+ y=target,
1221
+ data=working_df,
1222
+ ax=canvas.axes,
1223
+ order=order,
1224
+ estimator=np.mean,
1225
+ errorbar=('ci', 95),
1226
+ capsize=0.2
1227
+ )
1228
+ canvas.axes.set_title(f"Bar Plot: Average {target} by {feature}")
1229
+
1230
+ # Add value labels on top of bars
1231
+ for p in canvas.axes.patches:
1232
+ canvas.axes.annotate(
1233
+ f'{p.get_height():.1f}',
1234
+ (p.get_x() + p.get_width() / 2., p.get_height()),
1235
+ ha='center',
1236
+ va='bottom',
1237
+ fontsize=8,
1238
+ rotation=0
1239
+ )
1240
+
1241
+ # Rotate x-axis labels if needed
1242
+ if feature_unique_count > 5:
1243
+ canvas.axes.set_xticklabels(
1244
+ canvas.axes.get_xticklabels(),
1245
+ rotation=45,
1246
+ ha='right'
1247
+ )
1248
+ else:
1249
+ # For high cardinality, use a horizontal bar plot with top N categories
1250
+ top_n = 15 # Show top 15 categories
1251
+ # Calculate mean of target for each feature category
1252
+ grouped = working_df.groupby(feature)[target].agg(['mean', 'count', 'std']).reset_index()
1253
+ # Sort by mean and take top categories
1254
+ top_groups = grouped.nlargest(top_n, 'mean')
1255
+
1256
+ # Sort by mean value for better visualization
1257
+ sns.barplot(
1258
+ y=feature,
1259
+ x='mean',
1260
+ data=top_groups,
1261
+ ax=canvas.axes,
1262
+ orient='h'
1263
+ )
1264
+ canvas.axes.set_title(f"Top {top_n} Categories by Average {target}")
1265
+ canvas.axes.set_xlabel(f"Average {target}")
1266
+
1267
+ # Add count annotations
1268
+ for i, row in enumerate(top_groups.itertuples()):
1269
+ canvas.axes.text(
1270
+ row.mean + 0.1,
1271
+ i,
1272
+ f'n={row.count}',
1273
+ va='center',
1274
+ fontsize=8
1275
+ )
1276
+
1277
+ else:
1278
+ # Both feature and target are categorical
1279
+ if feature_unique_count <= high_cardinality_threshold and target_unique_count <= high_cardinality_threshold:
1280
+ # Heatmap for categorical vs categorical with manageable cardinality
1281
+ crosstab = pd.crosstab(
1282
+ working_df[feature],
1283
+ working_df[target],
1284
+ normalize='index'
1285
+ )
1286
+
1287
+ # Create heatmap with improved readability
1288
+ sns.heatmap(
1289
+ crosstab,
1290
+ annot=True,
1291
+ cmap="YlGnBu",
1292
+ ax=canvas.axes,
1293
+ fmt='.2f',
1294
+ linewidths=0.5,
1295
+ annot_kws={"size": 9 if crosstab.size < 30 else 7}
1296
+ )
1297
+ canvas.axes.set_title(f"Heatmap: {feature} vs {target} (proportions)")
1298
+ else:
1299
+ # For high cardinality in both, show a count plot of top categories
1300
+ feature_top = working_df[feature].value_counts().nlargest(8).index
1301
+ target_top = working_df[target].value_counts().nlargest(5).index
1302
+
1303
+ # Filter data to only include top categories
1304
+ filtered_df = working_df[
1305
+ working_df[feature].isin(feature_top) &
1306
+ working_df[target].isin(target_top)
1307
+ ]
1308
+
1309
+ # Create a grouped count plot
1310
+ sns.countplot(
1311
+ x=feature,
1312
+ hue=target,
1313
+ data=filtered_df,
1314
+ ax=canvas.axes
1315
+ )
1316
+ canvas.axes.set_title(f"Count Plot: Top {len(feature_top)} {feature} by Top {len(target_top)} {target}")
1317
+
1318
+ # Rotate x-axis labels
1319
+ canvas.axes.set_xticklabels(
1320
+ canvas.axes.get_xticklabels(),
1321
+ rotation=45,
1322
+ ha='right'
1323
+ )
1324
+
1325
+ # Move legend to a better position
1326
+ canvas.axes.legend(title=target, bbox_to_anchor=(1.05, 1), loc='upper left')
1327
+
1328
+ # Add informational text about data reduction if applicable
1329
+ if (not feature_is_numeric and feature_unique_count > high_cardinality_threshold) or \
1330
+ (not target_is_numeric and target_unique_count > high_cardinality_threshold):
1331
+ canvas.figure.text(
1332
+ 0.5, 0.01,
1333
+ f"Note: Visualization simplified to show top categories only. Original data has {feature_unique_count} unique {feature} values and {target_unique_count} unique {target} values.",
1334
+ ha='center',
1335
+ fontsize=8,
1336
+ style='italic'
1337
+ )
1338
+
1339
+ # Adjust layout and draw
1340
+ canvas.figure.tight_layout()
1341
+ canvas.draw()
1342
+
1343
+ # Add a close button
1344
+ close_button = QPushButton("Close")
1345
+ close_button.clicked.connect(dialog.accept)
1346
+ layout.addWidget(close_button)
1347
+
1348
+ # Show the dialog
1349
+ dialog.exec()
1350
+
1351
+ def change_sort(self, sort_key):
1352
+ """Change the sort order of the results"""
1353
+ if self.importance_df is None:
1354
+ return
1355
+
1356
+ # Update button states
1357
+ if sort_key == 'importance_value':
1358
+ self.importance_sort_btn.setChecked(True)
1359
+ self.correlation_sort_btn.setChecked(False)
1360
+ else:
1361
+ self.importance_sort_btn.setChecked(False)
1362
+ self.correlation_sort_btn.setChecked(True)
1363
+
1364
+ # Store the current sort key
1365
+ self.current_sort = sort_key
1366
+
1367
+ # Re-sort the dataframe
1368
+ self.importance_df = self.importance_df.sort_values(by=sort_key, ascending=False)
1369
+
1370
+ # Reset rendering of the table
1371
+ self.importance_table.clearContents()
1372
+ self.importance_table.setRowCount(len(self.importance_df))
1373
+ self.current_row = 0
1374
+
1375
+ # Start incremental rendering with the new sort order
1376
+ if self.render_timer and self.render_timer.isActive():
1377
+ self.render_timer.stop()
1378
+ self.render_timer = QTimer()
1379
+ self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
1380
+ self.render_timer.start(10) # Update every 10ms
1381
+
1382
+ # Main application class
1383
+ class ColumnProfilerApp(QMainWindow):
1384
+ # Global application-wide cache to prevent redundant computations
1385
+ global_cache = {}
1386
+
1387
+ def __init__(self, df):
1388
+ super().__init__()
1389
+
1390
+ # Store reference to data
1391
+ self.df = df
1392
+
1393
+ # Initialize cache for results
1394
+ self.result_cache = {}
1395
+
1396
+ # Initialize thread variable
1397
+ self.worker_thread = None
1398
+
1399
+ # Variables for incremental rendering
1400
+ self.importance_df = None
1401
+ self.current_row = 0
1402
+ self.render_timer = None
1403
+
1404
+ # Current sort key
1405
+ self.current_sort = 'importance_value'
1406
+
1407
+ # Set window properties
1408
+ self.setWindowTitle("Column Profiler")
1409
+ self.setMinimumSize(900, 600)
1410
+
1411
+ # Create central widget and main layout
1412
+ central_widget = QWidget()
1413
+ main_layout = QVBoxLayout(central_widget)
1414
+
1415
+ # Create top control panel
1416
+ control_panel = QWidget()
1417
+ control_layout = QHBoxLayout(control_panel)
1418
+
1419
+ # Column selector
1420
+ self.column_selector = QComboBox()
1421
+ self.column_selector.addItems([col for col in df.columns])
1422
+ control_layout.addWidget(QLabel("Select Column to Analyze:"))
1423
+ control_layout.addWidget(self.column_selector)
1424
+
1425
+ # Analyze button
1426
+ self.analyze_button = QPushButton("Analyze")
1427
+ self.analyze_button.clicked.connect(self.analyze_column)
1428
+ control_layout.addWidget(self.analyze_button)
1429
+
1430
+ # Progress indicators
1431
+ self.progress_bar = QProgressBar()
1432
+ self.progress_bar.setRange(0, 100)
1433
+ self.progress_bar.hide()
1434
+ self.progress_label = QLabel()
1435
+ self.progress_label.hide()
1436
+
1437
+ # Cancel button
1438
+ self.cancel_button = QPushButton("Cancel")
1439
+ self.cancel_button.clicked.connect(self.cancel_analysis)
1440
+ self.cancel_button.hide()
1441
+
1442
+ control_layout.addWidget(self.progress_bar)
1443
+ control_layout.addWidget(self.progress_label)
1444
+ control_layout.addWidget(self.cancel_button)
1445
+
1446
+ # Add control panel to main layout
1447
+ main_layout.addWidget(control_panel)
1448
+
1449
+ # Add sorting control
1450
+ sort_panel = QWidget()
1451
+ sort_layout = QHBoxLayout(sort_panel)
1452
+ sort_layout.setContentsMargins(0, 0, 0, 0)
1453
+
1454
+ # Add sort label
1455
+ sort_layout.addWidget(QLabel("Sort by:"))
1456
+
1457
+ # Add sort buttons
1458
+ self.importance_sort_btn = QPushButton("Importance")
1459
+ self.importance_sort_btn.setCheckable(True)
1460
+ self.importance_sort_btn.setChecked(True) # Default sort
1461
+ self.importance_sort_btn.clicked.connect(lambda: self.change_sort('importance_value'))
1462
+
1463
+ self.correlation_sort_btn = QPushButton("Correlation")
1464
+ self.correlation_sort_btn.setCheckable(True)
1465
+ self.correlation_sort_btn.clicked.connect(lambda: self.change_sort('correlation'))
1466
+
1467
+ sort_layout.addWidget(self.importance_sort_btn)
1468
+ sort_layout.addWidget(self.correlation_sort_btn)
1469
+ sort_layout.addStretch()
1470
+
1471
+ # Add buttons to layout
1472
+ main_layout.addWidget(sort_panel)
1473
+
1474
+ # Add a splitter for results area
1475
+ results_splitter = QSplitter(Qt.Orientation.Vertical)
1476
+
1477
+ # Create table for showing importance values
1478
+ self.importance_table = QTableWidget()
1479
+ self.importance_table.setColumnCount(3)
1480
+ self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
1481
+ self.importance_table.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.Stretch)
1482
+ self.importance_table.cellDoubleClicked.connect(self.show_relationship_visualization)
1483
+ results_splitter.addWidget(self.importance_table)
1484
+
1485
+ # Add instruction label for double-click functionality
1486
+ instruction_label = QLabel("Double-click on any feature to view detailed relationship visualization with the target column")
1487
+ instruction_label.setStyleSheet("color: #666; font-style: italic;")
1488
+ instruction_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
1489
+ main_layout.addWidget(instruction_label)
1490
+
1491
+ # Create matplotlib canvas for the chart
1492
+ self.chart_view = MatplotlibCanvas(width=8, height=5, dpi=100)
1493
+ results_splitter.addWidget(self.chart_view)
1494
+
1495
+ # Set initial splitter sizes
1496
+ results_splitter.setSizes([300, 300])
1497
+
1498
+ # Add the splitter to the main layout
1499
+ main_layout.addWidget(results_splitter)
1500
+
1501
+ # Set the central widget
1502
+ self.setCentralWidget(central_widget)
1503
+
1504
+ def analyze_column(self):
1505
+ if self.df is None or self.column_selector.currentText() == "":
1506
+ return
1507
+
1508
+ # Cancel any existing worker thread
1509
+ if self.worker_thread and self.worker_thread.isRunning():
1510
+ # Signal the thread to cancel
1511
+ self.worker_thread.cancel()
1512
+
1513
+ try:
1514
+ # Disconnect all signals to avoid callbacks during termination
1515
+ self.worker_thread.progress.disconnect()
1516
+ self.worker_thread.result.disconnect()
1517
+ self.worker_thread.error.disconnect()
1518
+ self.worker_thread.finished.disconnect()
1519
+ except Exception:
1520
+ pass # Already disconnected
1521
+
1522
+ # Terminate thread properly
1523
+ self.worker_thread.terminate()
1524
+ self.worker_thread.wait(1000) # Wait up to 1 second
1525
+ self.worker_thread = None # Clear reference
1526
+
1527
+ target_column = self.column_selector.currentText()
1528
+
1529
+ # Check in-memory cache first (fastest)
1530
+ if target_column in self.result_cache:
1531
+ self.handle_results(self.result_cache[target_column])
1532
+ return
1533
+
1534
+ # Check global application-wide cache second (still fast)
1535
+ global_key = get_cache_key(self.df, target_column)
1536
+ if global_key in ColumnProfilerApp.global_cache:
1537
+ self.result_cache[target_column] = ColumnProfilerApp.global_cache[global_key]
1538
+ self.handle_results(self.result_cache[target_column])
1539
+ return
1540
+
1541
+ # Disk cache will be checked in the worker thread
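+ # NOTE (editorial): lookups therefore cascade through three cache tiers,
+ # cheapest first: the per-window dict (self.result_cache, keyed by column
+ # name), the class-level dict shared by all windows (keyed by
+ # get_cache_key(df, column) so different DataFrames cannot collide), and
+ # finally the on-disk pickle cache. A minimal sketch of the pattern, with
+ # illustrative names only:
+ # def lookup(key):
+ #     for tier in (memory_cache, shared_cache, disk_cache):
+ #         value = tier.get(key)
+ #         if value is not None:
+ #             return value
+ #     return None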
1542
+
1543
+ # Disable the analyze button while processing
1544
+ self.analyze_button.setEnabled(False)
1545
+
1546
+ # Show progress indicators
1547
+ self.progress_bar.setValue(0)
1548
+ self.progress_bar.show()
1549
+ self.progress_label.setText("Starting analysis...")
1550
+ self.progress_label.show()
1551
+ self.cancel_button.show()
1552
+
1553
+ # Create and start the worker thread
1554
+ self.worker_thread = ExplainerThread(self.df, target_column)
1555
+ self.worker_thread.progress.connect(self.update_progress)
1556
+ self.worker_thread.result.connect(self.cache_and_display_results)
1557
+ self.worker_thread.error.connect(self.handle_error)
1558
+ self.worker_thread.finished.connect(self.on_analysis_finished)
1559
+ self.worker_thread.start()
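+
+ # NOTE (editorial): this is the standard QThread worker pattern - the thread
+ # emits progress/result/error/finished signals and Qt delivers them as queued
+ # calls on the GUI thread, so no widget is ever touched from the worker.
+ # A minimal sketch of such a worker (illustrative names, not this module's
+ # ExplainerThread):
+ # class Worker(QThread):
+ #     progress = pyqtSignal(int, str)
+ #     def run(self):
+ #         self.progress.emit(50, "halfway there")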
1560
+
1561
+ def update_progress(self, value, message):
1562
+ self.progress_bar.setValue(value)
1563
+ self.progress_label.setText(message)
1564
+
1565
+ def cache_and_display_results(self, importance_df):
1566
+ # Cache the results
1567
+ target_column = self.column_selector.currentText()
1568
+ self.result_cache[target_column] = importance_df
1569
+
1570
+ # Also cache in the global application cache
1571
+ global_key = get_cache_key(self.df, target_column)
1572
+ ColumnProfilerApp.global_cache[global_key] = importance_df
1573
+
1574
+ # Display the results
1575
+ self.handle_results(importance_df)
1576
+
1577
+ def on_analysis_finished(self):
1578
+ """Handle cleanup when analysis is finished (either completed or cancelled)"""
1579
+ self.analyze_button.setEnabled(True)
1580
+ self.cancel_button.hide()
1581
+
1582
+ def handle_results(self, importance_df):
1583
+ # Hide progress indicators
1584
+ self.progress_bar.hide()
1585
+ self.progress_label.hide()
1586
+ self.cancel_button.hide()
1587
+
1588
+ # Update importance table to include correlation column
1589
+ self.importance_table.setColumnCount(3)
1590
+ self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
1591
+ self.importance_table.setRowCount(len(importance_df))
1592
+
1593
+ # Using a timer for incremental updates
1594
+ self.importance_df = importance_df # Store for incremental rendering
1595
+ self.current_row = 0
1596
+ self.render_timer = QTimer()
1597
+ self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
1598
+ self.render_timer.start(10) # Update every 10ms
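+
+ # NOTE (editorial): drawing 10 rows every 10 ms fills the table at roughly
+ # 1,000 rows per second while leaving the event loop free between batches,
+ # so the window stays responsive for wide result sets; render_next_batch
+ # stops the timer once every row has been drawn.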
1599
+
1600
+ def render_next_batch(self, batch_size):
1601
+ try:
1602
+ if self.current_row >= len(self.importance_df):
1603
+ # All rows rendered, now render the chart and stop the timer
1604
+ self.render_chart()
1605
+ self.render_timer.stop()
1606
+ return
1607
+
1608
+ # Render a batch of rows
1609
+ end_row = min(self.current_row + batch_size, len(self.importance_df))
1610
+ for row in range(self.current_row, end_row):
1611
+ try:
1612
+ # Check if row exists in dataframe to prevent index errors
1613
+ if row < len(self.importance_df):
1614
+ feature = self.importance_df.iloc[row]['feature']
1615
+ importance_value = self.importance_df.iloc[row]['importance_value']
1616
+
1617
+ # Add correlation if available
1618
+ correlation = self.importance_df.iloc[row].get('correlation', None)
1619
+ if correlation is not None:
1620
+ self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
1621
+ self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
1622
+ self.importance_table.setItem(row, 2, QTableWidgetItem(str(round(correlation, 4))))
1623
+ else:
1624
+ self.importance_table.setItem(row, 0, QTableWidgetItem(str(feature)))
1625
+ self.importance_table.setItem(row, 1, QTableWidgetItem(str(round(importance_value, 4))))
1626
+ else:
1627
+ # Handle out of range index
1628
+ print(f"Warning: Row {row} is out of range (max: {len(self.importance_df)-1})")
1629
+ self.importance_table.setItem(row, 0, QTableWidgetItem("Error"))
1630
+ self.importance_table.setItem(row, 1, QTableWidgetItem("Out of range"))
1631
+ self.importance_table.setItem(row, 2, QTableWidgetItem("N/A"))
1632
+ except (IndexError, KeyError) as e:
1633
+ # Enhanced error reporting for index and key errors
1634
+ import traceback
1635
+ trace = traceback.format_exc()
1636
+ error_msg = f"Error rendering row {row}: {e.__class__.__name__}: {e}\n{trace}"
1637
+ print(error_msg)
1638
+
1639
+ # Handle missing data in the dataframe gracefully
1640
+ self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
1641
+ self.importance_table.setItem(row, 1, QTableWidgetItem(f"{str(e)[:20]}"))
1642
+ self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
1643
+ except Exception as e:
1644
+ # Catch any other exceptions
1645
+ print(f"Unexpected error rendering row {row}: {e.__class__.__name__}: {e}")
1646
+ self.importance_table.setItem(row, 0, QTableWidgetItem(f"Error: {e.__class__.__name__}"))
1647
+ self.importance_table.setItem(row, 1, QTableWidgetItem("See console for details"))
1648
+ self.importance_table.setItem(row, 2, QTableWidgetItem("Error"))
1649
+
1650
+ self.current_row = end_row
1651
+ QApplication.processEvents() # Allow UI to update
1652
+ except Exception as e:
1653
+ # Catch any exceptions in the rendering loop itself
1654
+ import traceback
1655
+ trace = traceback.format_exc()
1656
+ error_msg = f"Error in render_next_batch: {e.__class__.__name__}: {e}\n{trace}"
1657
+ print(error_msg)
1658
+
1659
+ # Try to stop the timer to prevent further errors
1660
+ try:
1661
+ if self.render_timer and self.render_timer.isActive():
1662
+ self.render_timer.stop()
1663
+ except Exception:
1664
+ pass
1665
+
1666
+ # Show error
1667
+ QMessageBox.critical(self, "Rendering Error",
1668
+ f"Error rendering results: {e.__class__.__name__}: {e}")
1669
+
1670
+ def render_chart(self):
1671
+ # Create horizontal bar chart
1672
+ try:
1673
+ if self.importance_df is None or len(self.importance_df) == 0:
1674
+ # No data to render
1675
+ self.chart_view.axes.clear()
1676
+ self.chart_view.axes.text(0.5, 0.5, "No data available for chart",
1677
+ ha='center', va='center', fontsize=12, color='gray')
1678
+ self.chart_view.axes.set_axis_off()
1679
+ self.chart_view.draw()
1680
+ return
1681
+
1682
+ self.chart_view.axes.clear()
1683
+
1684
+ # Get a sorted copy based on current sort key
1685
+ plot_df = self.importance_df.sort_values(by=self.current_sort, ascending=False).head(20).copy()
1686
+
1687
+ # Verify we have data before proceeding
1688
+ if len(plot_df) == 0:
1689
+ self.chart_view.axes.text(0.5, 0.5, "No features found with importance values",
1690
+ ha='center', va='center', fontsize=12, color='gray')
1691
+ self.chart_view.axes.set_axis_off()
1692
+ self.chart_view.draw()
1693
+ return
1694
+
1695
+ # Check required columns exist
1696
+ required_columns = ['feature', 'importance_value']
1697
+ missing_columns = [col for col in required_columns if col not in plot_df.columns]
1698
+ if missing_columns:
1699
+ error_msg = f"Missing required columns: {', '.join(missing_columns)}"
1700
+ self.chart_view.axes.text(0.5, 0.5, error_msg,
1701
+ ha='center', va='center', fontsize=12, color='red')
1702
+ self.chart_view.axes.set_axis_off()
1703
+ self.chart_view.draw()
1704
+ print(f"Chart rendering error: {error_msg}")
1705
+ return
1706
+
1707
+ # Truncate long feature names for better display
1708
+ max_feature_length = 30
1709
+ plot_df['display_feature'] = plot_df['feature'].apply(
1710
+ lambda x: (str(x)[:max_feature_length] + '...') if len(str(x)) > max_feature_length else str(x)
1711
+ )
1712
+
1713
+ # Reverse order for better display (highest at top)
1714
+ plot_df = plot_df.iloc[::-1].reset_index(drop=True)
1715
+
1716
+ # Create a figure with two subplots side by side
1717
+ self.chart_view.figure.clear()
1718
+ gs = self.chart_view.figure.add_gridspec(1, 2, width_ratios=[3, 2])
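+ # NOTE (editorial): a 1x2 GridSpec with width_ratios=[3, 2] gives the
+ # importance bars 60% of the figure width and the correlation panel 40%;
+ # the sharey=ax1 used for the second subplot below keeps the feature rows
+ # aligned across both panels.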
1719
+
1720
+ # First subplot for importance
1721
+ ax1 = self.chart_view.figure.add_subplot(gs[0, 0])
1722
+
1723
+ # Create a colormap for better visualization
1724
+ cmap = plt.cm.Blues
1725
+ colors = cmap(np.linspace(0.4, 0.8, len(plot_df)))
1726
+
1727
+ # Plot with custom colors
1728
+ bars = ax1.barh(
1729
+ plot_df['display_feature'],
1730
+ plot_df['importance_value'],
1731
+ color=colors,
1732
+ height=0.7, # Thinner bars for more spacing
1733
+ alpha=0.8
1734
+ )
1735
+
1736
+ # Add values at the end of bars
1737
+ for bar in bars:
1738
+ width = bar.get_width()
1739
+ ax1.text(
1740
+ width * 1.05,
1741
+ bar.get_y() + bar.get_height()/2,
1742
+ f'{width:.2f}',
1743
+ va='center',
1744
+ fontsize=9,
1745
+ fontweight='bold'
1746
+ )
1747
+
1748
+ # Add grid for better readability
1749
+ ax1.grid(True, axis='x', linestyle='--', alpha=0.3)
1750
+
1751
+ # Remove unnecessary spines
1752
+ for spine in ['top', 'right']:
1753
+ ax1.spines[spine].set_visible(False)
1754
+
1755
+ # Make labels more readable
1756
+ ax1.tick_params(axis='y', labelsize=9)
1757
+
1758
+ # Set title and labels
1759
+ ax1.set_title(f'Feature Importance for {self.column_selector.currentText()}')
1760
+ ax1.set_xlabel('Importance Value')
1761
+
1762
+ # Add a note about the sorting order
1763
+ sort_label = "Sorted by: " + ("Importance" if self.current_sort == 'importance_value' else "Correlation")
1764
+
1765
+ # Second subplot for correlation if available
1766
+ if 'correlation' in plot_df.columns:
1767
+ ax2 = self.chart_view.figure.add_subplot(gs[0, 1], sharey=ax1)
1768
+
1769
+ # Create a colormap for correlation - use a different color
1770
+ cmap_corr = plt.cm.Reds
1771
+ colors_corr = cmap_corr(np.linspace(0.4, 0.8, len(plot_df)))
1772
+
1773
+ # Plot correlation bars
1774
+ corr_bars = ax2.barh(
1775
+ plot_df['display_feature'],
1776
+ plot_df['correlation'],
1777
+ color=colors_corr,
1778
+ height=0.7,
1779
+ alpha=0.8
1780
+ )
1781
+
1782
+ # Add values at the end of correlation bars
1783
+ for bar in corr_bars:
1784
+ width = bar.get_width()
1785
+ ax2.text(
1786
+ width * 1.05,
1787
+ bar.get_y() + bar.get_height()/2,
1788
+ f'{width:.2f}',
1789
+ va='center',
1790
+ fontsize=9,
1791
+ fontweight='bold'
1792
+ )
1793
+
1794
+ # Add grid and styling
1795
+ ax2.grid(True, axis='x', linestyle='--', alpha=0.3)
1796
+ ax2.set_title('Absolute Correlation')
1797
+ ax2.set_xlabel('Correlation Value')
1798
+
1799
+ # Hide y-axis labels since they're shared with the first plot
1800
+ ax2.tick_params(axis='y', labelleft=False)  # set_yticklabels([]) would also blank the shared ax1 labels
1801
+
1802
+ # Remove unnecessary spines
1803
+ for spine in ['top', 'right']:
1804
+ ax2.spines[spine].set_visible(False)
1805
+
1806
+ # Add a note about the current sort order
1807
+ self.chart_view.figure.text(0.5, 0.01, sort_label, ha='center', fontsize=9, style='italic')
1808
+
1809
+ # Adjust figure size based on number of features
1810
+ feature_count = len(plot_df)
1811
+ self.chart_view.figure.set_figheight(max(5, min(4 + feature_count * 0.3, 12)))
1812
+
1813
+ # Adjust layout and draw
1814
+ self.chart_view.figure.tight_layout(rect=[0, 0.03, 1, 0.97]) # Make room for sort label
1815
+ self.chart_view.draw()
1816
+
1817
+ except IndexError as e:
1818
+ # Special handling for index errors with detailed information
1819
+ import traceback
1820
+ import inspect
1821
+
1822
+ # Get stack trace information
1823
+ trace = traceback.format_exc()
1824
+
1825
+ # Try to get line and context information
1826
+ try:
1827
+ frame = inspect.trace()[-1]
1828
+ frame_info = inspect.getframeinfo(frame[0])
1829
+ filename = frame_info.filename
1830
+ lineno = frame_info.lineno
1831
+ function = frame_info.function
1832
+ code_context = frame_info.code_context[0].strip() if frame_info.code_context else "Unknown code context"
1833
+
1834
+ # Detailed error message
1835
+ detail_msg = f"IndexError at line {lineno} in {function}: {str(e)}\nCode: {code_context}"
1836
+ print(f"Chart rendering error: {detail_msg}\n{trace}")
1837
+
1838
+ # Display error in chart
1839
+ self.chart_view.axes.clear()
1840
+ self.chart_view.axes.text(0.5, 0.5,
1841
+ f"Index Error in chart rendering:\n{str(e)}\nAt line {lineno}: {code_context}",
1842
+ ha='center', va='center', fontsize=12, color='red',
1843
+ wrap=True)
1844
+ self.chart_view.axes.set_axis_off()
1845
+ self.chart_view.draw()
1846
+ except Exception as inner_e:
1847
+ # Fallback if the detailed error reporting fails
1848
+ print(f"Error getting detailed error info: {inner_e}")
1849
+ print(f"Original error: {e}\n{trace}")
1850
+
1851
+ self.chart_view.axes.clear()
1852
+ self.chart_view.axes.text(0.5, 0.5, f"Index Error: {str(e)}",
1853
+ ha='center', va='center', fontsize=12, color='red')
1854
+ self.chart_view.axes.set_axis_off()
1855
+ self.chart_view.draw()
1856
+ except Exception as e:
1857
+ # Recover gracefully from any chart rendering errors with detailed information
1858
+ import traceback
1859
+ trace = traceback.format_exc()
1860
+ error_msg = f"Error rendering chart: {e.__class__.__name__}: {str(e)}"
1861
+ print(f"{error_msg}\n{trace}")
1862
+
1863
+ self.chart_view.axes.clear()
1864
+ self.chart_view.axes.text(0.5, 0.5, error_msg,
1865
+ ha='center', va='center', fontsize=12, color='red',
1866
+ wrap=True)
1867
+ self.chart_view.axes.set_axis_off()
1868
+ self.chart_view.draw()
1869
+
1870
+ def handle_error(self, error_message):
1871
+ """Handle errors during analysis"""
1872
+ # Hide progress indicators
1873
+ self.progress_bar.hide()
1874
+ self.progress_label.hide()
1875
+ self.cancel_button.hide()
1876
+
1877
+ # Re-enable analyze button
1878
+ self.analyze_button.setEnabled(True)
1879
+
1880
+ # Print error to console for debugging
1881
+ print(f"Error in column profiler: {error_message}")
1882
+
1883
+ # Show error message with more details
1884
+ msg_box = QMessageBox(self)
1885
+ msg_box.setIcon(QMessageBox.Icon.Critical)
1886
+ msg_box.setWindowTitle("Error")
1887
+ msg_box.setText("An error occurred during analysis")
1888
+ msg_box.setDetailedText(error_message)
1889
+ msg_box.setStandardButtons(QMessageBox.StandardButton.Ok)
1890
+ msg_box.exec()
1891
+
1892
+ # Show a message in the UI as well
1893
+ self.importance_table.setRowCount(1)
1894
+ self.importance_table.setColumnCount(3)
1895
+ self.importance_table.setHorizontalHeaderLabels(["Feature", "Importance", "Abs. Correlation"])
1896
+ self.importance_table.setItem(0, 0, QTableWidgetItem(f"Error: {error_message.split('\n')[0]}"))
1897
+ self.importance_table.setItem(0, 1, QTableWidgetItem(""))
1898
+ self.importance_table.setItem(0, 2, QTableWidgetItem(""))
1899
+ self.importance_table.resizeColumnsToContents()
1900
+
1901
+ # Update the chart to show error
1902
+ self.chart_view.axes.clear()
1903
+ self.chart_view.axes.text(0.5, 0.5, f"Error calculating importance:\n{error_message.split('\n')[0]}",
1904
+ ha='center', va='center', fontsize=12, color='red',
1905
+ wrap=True)
1906
+ self.chart_view.axes.set_axis_off()
1907
+ self.chart_view.draw()
1908
+
1909
+ def closeEvent(self, event):
1910
+ """Clean up when the window is closed"""
1911
+ # Stop any running timer
1912
+ if self.render_timer and self.render_timer.isActive():
1913
+ self.render_timer.stop()
1914
+
1915
+ # Clean up any background threads
1916
+ if self.worker_thread and self.worker_thread.isRunning():
1917
+ # Disconnect all signals to avoid callbacks during termination
1918
+ try:
1919
+ self.worker_thread.progress.disconnect()
1920
+ self.worker_thread.result.disconnect()
1921
+ self.worker_thread.error.disconnect()
1922
+ self.worker_thread.finished.disconnect()
1923
+ except Exception:
1924
+ pass # Already disconnected
1925
+
1926
+ # Terminate thread properly
1927
+ self.worker_thread.terminate()
1928
+ self.worker_thread.wait(1000) # Wait up to 1 second
1929
+
1930
+ # Clear references to prevent thread issues
1931
+ self.worker_thread = None
1932
+
1933
+ # Clean up memory
1934
+ self.result_cache.clear()
1935
+
1936
+ # Accept the close event
1937
+ event.accept()
1938
+
1939
+ # Suggest garbage collection
1940
+ gc.collect()
1941
+
1942
+ def cancel_analysis(self):
1943
+ """Cancel the current analysis"""
1944
+ if self.worker_thread and self.worker_thread.isRunning():
1945
+ # Signal the thread to cancel first
1946
+ self.worker_thread.cancel()
1947
+
1948
+ # Disconnect all signals to avoid callbacks during termination
1949
+ try:
1950
+ self.worker_thread.progress.disconnect()
1951
+ self.worker_thread.result.disconnect()
1952
+ self.worker_thread.error.disconnect()
1953
+ self.worker_thread.finished.disconnect()
1954
+ except Exception:
1955
+ pass # Already disconnected
1956
+
1957
+ # Terminate thread properly
1958
+ self.worker_thread.terminate()
1959
+ self.worker_thread.wait(1000) # Wait up to 1 second
1960
+
1961
+ # Clear reference
1962
+ self.worker_thread = None
1963
+
1964
+ # Update UI
1965
+ self.progress_bar.hide()
1966
+ self.progress_label.setText("Analysis cancelled")
1967
+ self.progress_label.show()
1968
+ self.cancel_button.hide()
1969
+ self.analyze_button.setEnabled(True)
1970
+
1971
+ # Hide the progress label after 2 seconds
1972
+ QTimer.singleShot(2000, self.progress_label.hide)
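+
+ # NOTE (editorial): QTimer.singleShot schedules a single deferred call on the
+ # event loop and needs no cleanup, which makes it a convenient way to clear
+ # transient status text without keeping a timer object alive.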
1973
+
1974
+ def show_relationship_visualization(self, row, column):
1975
+ """Show visualization of relationship between selected feature and target column"""
1976
+ if self.importance_df is None or row < 0 or row >= len(self.importance_df):
1977
+ return
1978
+
1979
+ # Get the feature name and target column
1980
+ try:
1981
+ feature = self.importance_df.iloc[row]['feature']
1982
+ target = self.column_selector.currentText()
1983
+
1984
+ # Verify both columns exist in the dataframe
1985
+ if feature not in self.df.columns:
1986
+ QMessageBox.warning(self, "Column Not Found",
1987
+ f"Feature column '{feature}' not found in the dataframe")
1988
+ return
1989
+
1990
+ if target not in self.df.columns:
1991
+ QMessageBox.warning(self, "Column Not Found",
1992
+ f"Target column '{target}' not found in the dataframe")
1993
+ return
1994
+ except Exception as e:
1995
+ QMessageBox.critical(self, "Error", f"Error getting column data: {str(e)}")
1996
+ return
1997
+
1998
+ # Create a dialog to show the visualization
1999
+ dialog = QDialog(self)
2000
+ dialog.setWindowTitle(f"Relationship: {feature} vs {target}")
2001
+ dialog.resize(900, 700)
2002
+
2003
+ # Create layout
2004
+ layout = QVBoxLayout(dialog)
2005
+
2006
+ # Create canvas for the plot
2007
+ canvas = MatplotlibCanvas(width=8, height=6, dpi=100)
2008
+ layout.addWidget(canvas)
2009
+
2010
+ # Determine the data types
2011
+ feature_is_numeric = pd.api.types.is_numeric_dtype(self.df[feature])
2012
+ target_is_numeric = pd.api.types.is_numeric_dtype(self.df[target])
2013
+
2014
+ # Get unique counts to determine if we have high cardinality
2015
+ feature_unique_count = self.df[feature].nunique()
2016
+ target_unique_count = self.df[target].nunique()
2017
+
2018
+ # Define high cardinality threshold
2019
+ high_cardinality_threshold = 10
2020
+
2021
+ # Clear the figure
2022
+ canvas.axes.clear()
2023
+
2024
+ # Create a working copy of the dataframe
2025
+ working_df = self.df.copy()
2026
+
2027
+ # Prepare data for high cardinality columns
2028
+ if not feature_is_numeric and feature_unique_count > high_cardinality_threshold:
2029
+ # Get the top N categories by frequency
2030
+ top_categories = self.df[feature].value_counts().nlargest(high_cardinality_threshold).index.tolist()
2031
+ # Create "Other" category for remaining values
2032
+ working_df[feature] = working_df[feature].apply(lambda x: x if x in top_categories else 'Other')
2033
+
2034
+ if not target_is_numeric and target_unique_count > high_cardinality_threshold:
2035
+ top_categories = self.df[target].value_counts().nlargest(high_cardinality_threshold).index.tolist()
2036
+ working_df[target] = working_df[target].apply(lambda x: x if x in top_categories else 'Other')
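+
+ # NOTE (editorial): collapsing everything outside the top-N categories into
+ # an 'Other' bucket keeps the categorical plots legible. An equivalent
+ # vectorized form (illustrative, not what runs above) would be:
+ # working_df[col] = working_df[col].where(
+ #     working_df[col].isin(top_categories), 'Other')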
2037
+
2038
+ # Create appropriate visualization based on data types and cardinality
2039
+ if feature_is_numeric and target_is_numeric:
2040
+ # Scatter plot for numeric vs numeric
2041
+ # Use hexbin for large datasets to avoid overplotting
2042
+ if len(working_df) > 100:
2043
+ hb = canvas.axes.hexbin(
2044
+ working_df[feature],
2045
+ working_df[target],
2046
+ gridsize=25,
2047
+ cmap='Blues',
2048
+ mincnt=1
2049
+ )
2050
+ canvas.axes.set_title(f"Hexbin Density Plot: {feature} vs {target}")
2051
+ canvas.axes.set_xlabel(feature)
2052
+ canvas.axes.set_ylabel(target)
2053
+ # Add a colorbar
2054
+ cbar = canvas.figure.colorbar(hb, ax=canvas.axes)
2055
+ cbar.set_label('Count')
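+ # NOTE (editorial): above ~100 points a plain scatter overplots badly, so a
+ # hexbin is used instead: each hexagonal cell aggregates the points that fall
+ # inside it, and the colorbar maps cell colour back to a point count.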
2056
+ else:
2057
+ # For smaller datasets, use a scatter plot with transparency
2058
+ sns.scatterplot(
2059
+ x=feature,
2060
+ y=target,
2061
+ data=working_df,
2062
+ ax=canvas.axes,
2063
+ alpha=0.6
2064
+ )
2065
+ # Add regression line
2066
+ sns.regplot(
2067
+ x=feature,
2068
+ y=target,
2069
+ data=working_df,
2070
+ ax=canvas.axes,
2071
+ scatter=False,
2072
+ line_kws={"color": "red"}
2073
+ )
2074
+ canvas.axes.set_title(f"Scatter Plot: {feature} vs {target}")
2075
+
2076
+ elif feature_is_numeric and not target_is_numeric:
2077
+ # Box plot for numeric vs categorical
2078
+ if target_unique_count <= high_cardinality_threshold * 2:
2079
+ # Standard boxplot for reasonable number of categories
2080
+ order = working_df[target].value_counts().nlargest(high_cardinality_threshold * 2).index
2081
+ sns.boxplot(
2082
+ x=target,
2083
+ y=feature,
2084
+ data=working_df,
2085
+ ax=canvas.axes,
2086
+ order=order
2087
+ )
2088
+ canvas.axes.set_title(f"Box Plot: {feature} by {target}")
2089
+ # Rotate x-axis labels for better readability
2090
+ canvas.axes.set_xticklabels(
2091
+ canvas.axes.get_xticklabels(),
2092
+ rotation=45,
2093
+ ha='right'
2094
+ )
2095
+ else:
2096
+ # For very high cardinality, use a violin plot with limited categories
2097
+ order = working_df[target].value_counts().nlargest(high_cardinality_threshold).index
2098
+ working_df_filtered = working_df[working_df[target].isin(order)]
2099
+ sns.violinplot(
2100
+ x=target,
2101
+ y=feature,
2102
+ data=working_df_filtered,
2103
+ ax=canvas.axes,
2104
+ inner='quartile',
2105
+ cut=0
2106
+ )
2107
+ canvas.axes.set_title(f"Violin Plot: {feature} by Top {len(order)} {target} Categories")
2108
+ canvas.axes.set_xticklabels(
2109
+ canvas.axes.get_xticklabels(),
2110
+ rotation=45,
2111
+ ha='right'
2112
+ )
2113
+
2114
+ elif not feature_is_numeric and target_is_numeric:
2115
+ # Bar plot for categorical vs numeric
2116
+ if feature_unique_count <= high_cardinality_threshold * 2:
2117
+ # Use standard barplot for reasonable number of categories
2118
+ order = working_df[feature].value_counts().nlargest(high_cardinality_threshold * 2).index
2119
+ sns.barplot(
2120
+ x=feature,
2121
+ y=target,
2122
+ data=working_df,
2123
+ ax=canvas.axes,
2124
+ order=order,
2125
+ estimator=np.mean,
2126
+ errorbar=('ci', 95),
2127
+ capsize=0.2
2128
+ )
2129
+ canvas.axes.set_title(f"Bar Plot: Average {target} by {feature}")
2130
+
2131
+ # Add value labels on top of bars
2132
+ for p in canvas.axes.patches:
2133
+ canvas.axes.annotate(
2134
+ f'{p.get_height():.1f}',
2135
+ (p.get_x() + p.get_width() / 2., p.get_height()),
2136
+ ha='center',
2137
+ va='bottom',
2138
+ fontsize=8,
2139
+ rotation=0
2140
+ )
2141
+
2142
+ # Rotate x-axis labels if needed
2143
+ if feature_unique_count > 5:
2144
+ canvas.axes.set_xticklabels(
2145
+ canvas.axes.get_xticklabels(),
2146
+ rotation=45,
2147
+ ha='right'
2148
+ )
2149
+ else:
2150
+ # For high cardinality, use a horizontal bar plot with top N categories
2151
+ top_n = 15 # Show top 15 categories
2152
+ # Calculate mean of target for each feature category
2153
+ grouped = working_df.groupby(feature)[target].agg(['mean', 'count', 'std']).reset_index()
2154
+ # Sort by mean and take top categories
2155
+ top_groups = grouped.nlargest(top_n, 'mean')
2156
+
2157
+ # nlargest already returns the categories ordered by mean, highest first
2158
+ sns.barplot(
2159
+ y=feature,
2160
+ x='mean',
2161
+ data=top_groups,
2162
+ ax=canvas.axes,
2163
+ orient='h'
2164
+ )
2165
+ canvas.axes.set_title(f"Top {top_n} Categories by Average {target}")
2166
+ canvas.axes.set_xlabel(f"Average {target}")
2167
+
2168
+ # Add count annotations
2169
+ for i, row in enumerate(top_groups.itertuples()):
2170
+ canvas.axes.text(
2171
+ row.mean + 0.1,
2172
+ i,
2173
+ f'n={row.count}',
2174
+ va='center',
2175
+ fontsize=8
2176
+ )
2177
+
2178
+ else:
2179
+ # Both feature and target are categorical
2180
+ if feature_unique_count <= high_cardinality_threshold and target_unique_count <= high_cardinality_threshold:
2181
+ # Heatmap for categorical vs categorical with manageable cardinality
2182
+ crosstab = pd.crosstab(
2183
+ working_df[feature],
2184
+ working_df[target],
2185
+ normalize='index'
2186
+ )
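+
+ # NOTE (editorial): normalize='index' makes every row of the crosstab sum to
+ # 1, so each heatmap cell reads as "the share of this feature category that
+ # falls in this target category" rather than a raw count. For example, with
+ # c1 = ['a', 'a', 'b'] and c2 = ['x', 'y', 'y'],
+ # pd.crosstab(c1, c2, normalize='index') gives row 'a' -> [0.5, 0.5].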
2187
+
2188
+ # Create heatmap with improved readability
2189
+ sns.heatmap(
2190
+ crosstab,
2191
+ annot=True,
2192
+ cmap="YlGnBu",
2193
+ ax=canvas.axes,
2194
+ fmt='.2f',
2195
+ linewidths=0.5,
2196
+ annot_kws={"size": 9 if crosstab.size < 30 else 7}
2197
+ )
2198
+ canvas.axes.set_title(f"Heatmap: {feature} vs {target} (proportions)")
2199
+ else:
2200
+ # For high cardinality in both, show a count plot of top categories
2201
+ feature_top = working_df[feature].value_counts().nlargest(8).index
2202
+ target_top = working_df[target].value_counts().nlargest(5).index
2203
+
2204
+ # Filter data to only include top categories
2205
+ filtered_df = working_df[
2206
+ working_df[feature].isin(feature_top) &
2207
+ working_df[target].isin(target_top)
2208
+ ]
2209
+
2210
+ # Create a grouped count plot
2211
+ sns.countplot(
2212
+ x=feature,
2213
+ hue=target,
2214
+ data=filtered_df,
2215
+ ax=canvas.axes
2216
+ )
2217
+ canvas.axes.set_title(f"Count Plot: Top {len(feature_top)} {feature} by Top {len(target_top)} {target}")
2218
+
2219
+ # Rotate x-axis labels
2220
+ canvas.axes.set_xticklabels(
2221
+ canvas.axes.get_xticklabels(),
2222
+ rotation=45,
2223
+ ha='right'
2224
+ )
2225
+
2226
+ # Move legend to a better position
2227
+ canvas.axes.legend(title=target, bbox_to_anchor=(1.05, 1), loc='upper left')
2228
+
2229
+ # Add informational text about data reduction if applicable
2230
+ if (not feature_is_numeric and feature_unique_count > high_cardinality_threshold) or \
2231
+ (not target_is_numeric and target_unique_count > high_cardinality_threshold):
2232
+ canvas.figure.text(
2233
+ 0.5, 0.01,
2234
+ f"Note: Visualization simplified to show top categories only. Original data has {feature_unique_count} unique {feature} values and {target_unique_count} unique {target} values.",
2235
+ ha='center',
2236
+ fontsize=8,
2237
+ style='italic'
2238
+ )
2239
+
2240
+ # Adjust layout and draw
2241
+ canvas.figure.tight_layout()
2242
+ canvas.draw()
2243
+
2244
+ # Add a close button
2245
+ close_button = QPushButton("Close")
2246
+ close_button.clicked.connect(dialog.accept)
2247
+ layout.addWidget(close_button)
2248
+
2249
+ # Show the dialog
2250
+ dialog.exec()
2251
+
2252
+ def change_sort(self, sort_key):
2253
+ """Change the sort order of the results"""
2254
+ if self.importance_df is None:
2255
+ return
2256
+
2257
+ # Update button states
2258
+ if sort_key == 'importance_value':
2259
+ self.importance_sort_btn.setChecked(True)
2260
+ self.correlation_sort_btn.setChecked(False)
2261
+ else:
2262
+ self.importance_sort_btn.setChecked(False)
2263
+ self.correlation_sort_btn.setChecked(True)
2264
+
2265
+ # Store the current sort key
2266
+ self.current_sort = sort_key
2267
+
2268
+ # Re-sort the dataframe
2269
+ self.importance_df = self.importance_df.sort_values(by=sort_key, ascending=False)
2270
+
2271
+ # Reset rendering of the table
2272
+ self.importance_table.clearContents()
2273
+ self.importance_table.setRowCount(len(self.importance_df))
2274
+ self.current_row = 0
2275
+
2276
+ # Start incremental rendering with the new sort order
2277
+ if self.render_timer and self.render_timer.isActive():
2278
+ self.render_timer.stop()
2279
+ self.render_timer = QTimer()
2280
+ self.render_timer.timeout.connect(lambda: self.render_next_batch(10))
2281
+ self.render_timer.start(10) # Update every 10ms
2282
+
2283
+ # Custom matplotlib canvas for embedding in Qt
2284
+ class MatplotlibCanvas(FigureCanvasQTAgg):
2285
+ def __init__(self, width=5, height=4, dpi=100):
2286
+ self.figure = Figure(figsize=(width, height), dpi=dpi)
2287
+ self.axes = self.figure.add_subplot(111)
2288
+ super().__init__(self.figure)
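+
+ # NOTE (editorial): FigureCanvasQTAgg is both a matplotlib canvas and a
+ # QWidget, which is what allows the figure to be dropped straight into Qt
+ # layouts (e.g. layout.addWidget(canvas) above); calling canvas.draw()
+ # re-renders the Agg buffer and repaints the widget.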
2289
+
2290
+ def visualize_profile(df: pd.DataFrame, column: str = None):
2291
+ """
2292
+ Launch a PyQt6 UI for visualizing column importance.
2293
+
2294
+ Args:
2295
+ df: DataFrame containing the data
2296
+ column: Optional target column to analyze immediately
2297
+ """
2298
+ try:
2299
+ # Verify df is a valid DataFrame
2300
+ if not isinstance(df, pd.DataFrame):
2301
+ raise ValueError("Input must be a pandas DataFrame")
2302
+
2303
+ # Verify df has data
2304
+ if len(df) == 0:
2305
+ raise ValueError("DataFrame is empty, cannot analyze")
2306
+
2307
+ # Verify columns exist
2308
+ if column is not None and column not in df.columns:
2309
+ raise ValueError(f"Column '{column}' not found in the DataFrame")
2310
+
2311
+ # Check if dataset is too small for meaningful analysis
2312
+ row_count = len(df)
2313
+ if row_count <= 5:
2314
+ print(f"WARNING: Dataset only has {row_count} rows. Feature importance analysis requires more data for meaningful results.")
2315
+ if QApplication.instance():
2316
+ QMessageBox.warning(None, "Insufficient Data",
2317
+ f"The dataset only contains {row_count} rows. Feature importance analysis requires more data for meaningful results.")
2318
+
2319
+ # For large datasets, sample 500 rows so the analysis stays fast while remaining representative
2320
+ elif row_count > 500:
2321
+ print(f"Sampling 500 rows from dataset ({row_count:,} total rows)")
2322
+ df = df.sample(n=500, random_state=42)
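+ # NOTE (editorial): the fixed random_state makes the sample deterministic,
+ # so repeated runs on the same DataFrame produce the same cache key and can
+ # reuse earlier results instead of re-fitting on a different subset.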
2323
+
2324
+ # Check if we're already in a Qt application
2325
+ existing_app = QApplication.instance()
2326
+ standalone_mode = existing_app is None
2327
+
2328
+ # Create app if needed
2329
+ if standalone_mode:
2330
+ app = QApplication(sys.argv)
2331
+ else:
2332
+ app = existing_app
2333
+
2334
+ if standalone_mode:
+ app.setStyle('Fusion') # Modern look; applied only in standalone mode so an embedding app's style is untouched
2335
+
2336
+ # Set modern dark theme (only in standalone mode to avoid affecting parent app)
2337
+ if standalone_mode:
2338
+ palette = QPalette()
2339
+ palette.setColor(QPalette.ColorRole.Window, QColor(53, 53, 53))
2340
+ palette.setColor(QPalette.ColorRole.WindowText, Qt.GlobalColor.white)
2341
+ palette.setColor(QPalette.ColorRole.Base, QColor(25, 25, 25))
2342
+ palette.setColor(QPalette.ColorRole.AlternateBase, QColor(53, 53, 53))
2343
+ palette.setColor(QPalette.ColorRole.ToolTipBase, Qt.GlobalColor.white)
2344
+ palette.setColor(QPalette.ColorRole.ToolTipText, Qt.GlobalColor.white)
2345
+ palette.setColor(QPalette.ColorRole.Text, Qt.GlobalColor.white)
2346
+ palette.setColor(QPalette.ColorRole.Button, QColor(53, 53, 53))
2347
+ palette.setColor(QPalette.ColorRole.ButtonText, Qt.GlobalColor.white)
2348
+ palette.setColor(QPalette.ColorRole.BrightText, Qt.GlobalColor.red)
2349
+ palette.setColor(QPalette.ColorRole.Link, QColor(42, 130, 218))
2350
+ palette.setColor(QPalette.ColorRole.Highlight, QColor(42, 130, 218))
2351
+ palette.setColor(QPalette.ColorRole.HighlightedText, Qt.GlobalColor.black)
2352
+ app.setPalette(palette)
2353
+
2354
+ window = ColumnProfilerApp(df)
2355
+ window.setAttribute(Qt.WidgetAttribute.WA_DeleteOnClose) # Ensure cleanup on close
2356
+ window.show()
2357
+
2358
+ # Add tooltip to explain double-click functionality
2359
+ window.importance_table.setToolTip("Double-click on a feature to visualize its relationship with the target column")
2360
+
2361
+ # If a specific column is provided, analyze it immediately
2362
+ if column is not None and column in df.columns:
2363
+ window.column_selector.setCurrentText(column)
2364
+ # Wrap the analysis in a try/except to prevent crashes
2365
+ def safe_analyze():
2366
+ try:
2367
+ window.analyze_column()
2368
+ except Exception as e:
2369
+ print(f"Error during column analysis: {e}")
2370
+ import traceback
2371
+ traceback.print_exc()
2372
+ QMessageBox.critical(window, "Analysis Error",
2373
+ f"Error analyzing column:\n\n{str(e)}")
2374
+
2375
+ QTimer.singleShot(100, safe_analyze) # Use timer to avoid immediate thread issues
2376
+
2377
+ # Set a watchdog timer to cancel analysis if it takes too long (30 seconds)
2378
+ def check_progress():
2379
+ if window.worker_thread and window.worker_thread.isRunning():
2380
+ # If still running after 30 seconds, cancel the operation
2381
+ QMessageBox.warning(window, "Analysis Timeout",
2382
+ "The analysis is taking longer than expected. It will be canceled to prevent hanging.")
2383
+ try:
2384
+ window.cancel_analysis()
2385
+ except Exception as e:
2386
+ print(f"Error canceling analysis: {e}")
2387
+
2388
+ QTimer.singleShot(30000, check_progress) # 30 seconds timeout
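+
+ # NOTE (editorial): the watchdog fires exactly once, 30 seconds after the
+ # window opens, and cancels only if a worker is still running at that moment;
+ # an analysis the user launches later is unaffected unless it happens to span
+ # the 30-second mark.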
2389
+
2390
+ # Only enter event loop in standalone mode
2391
+ if standalone_mode:
2392
+ sys.exit(app.exec())
2393
+ else:
2394
+ # Return the window for parent app to track
2395
+ return window
2396
+ except Exception as e:
2397
+ # Handle any exceptions to prevent crashes
2398
+ print(f"Error in visualize_profile: {e}")
2399
+ import traceback
2400
+ traceback.print_exc()
2401
+
2402
+ # Show error to user
2403
+ if QApplication.instance():
2404
+ QMessageBox.critical(None, "Profile Error", f"Error creating column profile:\n\n{str(e)}")
2405
+ return None
2406
+
2407
+ def test_profile():
2408
+ """
2409
+ Test the profile and visualization functions with sample data.
2410
+ """
2411
+ # Create a sample DataFrame with 40 columns
2412
+ np.random.seed(42)
2413
+ n = 1000
2414
+
2415
+ # Generate core sample data with known relationships
2416
+ age = np.random.normal(35, 10, n).astype(int)
2417
+ experience = age - np.random.randint(18, 25, n) # experience correlates with age
2418
+ experience = np.maximum(0, experience) # no negative experience
2419
+
2420
+ salary = 30000 + 2000 * experience + np.random.normal(0, 10000, n)
2421
+
2422
+ departments = np.random.choice(['Engineering', 'Marketing', 'Sales', 'HR', 'Finance'], n)
2423
+ education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n,
2424
+ p=[0.2, 0.5, 0.2, 0.1])
2425
+
2426
+ performance = np.random.normal(0, 1, n)
2427
+ performance += 0.5 * (education == 'Master') + 0.8 * (education == 'PhD') # education affects performance
2428
+ performance += 0.01 * experience # experience slightly affects performance
2429
+ performance = (performance - performance.min()) / (performance.max() - performance.min()) * 5 # scale to 0-5
2430
+
2431
+ # Create the base DataFrame
2432
+ data = {
2433
+ 'Age': age,
2434
+ 'Experience': experience,
2435
+ 'Department': departments,
2436
+ 'Education': education,
2437
+ 'Performance': performance,
2438
+ 'Salary': salary
2439
+ }
2440
+
2441
+ # Generate additional numeric columns
2442
+ for i in range(1, 15):
2443
+ # Create some columns with relationship to salary
2444
+ if i <= 5:
2445
+ data[f'Metric_{i}'] = salary * (0.01 * i) + np.random.normal(0, 5000, n)
2446
+ # Create columns with relationship to age
2447
+ elif i <= 10:
2448
+ data[f'Metric_{i}'] = age * (i-5) + np.random.normal(0, 10, n)
2449
+ # Create random columns
2450
+ else:
2451
+ data[f'Metric_{i}'] = np.random.normal(100, 50, n)
2452
+
2453
+ # Generate additional categorical columns
2454
+ categories = [
2455
+ ['A', 'B', 'C', 'D'],
2456
+ ['Low', 'Medium', 'High'],
2457
+ ['North', 'South', 'East', 'West'],
2458
+ ['Type1', 'Type2', 'Type3'],
2459
+ ['Yes', 'No', 'Maybe'],
2460
+ ['Red', 'Green', 'Blue', 'Yellow'],
2461
+ ['Small', 'Medium', 'Large']
2462
+ ]
2463
+
2464
+ for i in range(1, 10):
2465
+ # Pick a category list
2466
+ cat_list = categories[i % len(categories)]
2467
+ # Generate random categorical column
2468
+ data[f'Category_{i}'] = np.random.choice(cat_list, n)
2469
+
2470
+ # Generate date and time related columns
2471
+ base_date = np.datetime64('2020-01-01')
2472
+
2473
+ # Instead of datetime objects, convert to days since base date (numeric values)
2474
+ hire_days = np.array([365 * (35 - a) + np.random.randint(0, 30) for a in age])
2475
+ data['Hire_Days_Ago'] = hire_days
2476
+
2477
+ promotion_days = np.array([np.random.randint(0, 1000) for _ in range(n)])
2478
+ data['Last_Promotion_Days_Ago'] = promotion_days
2479
+
2480
+ review_days = np.array([np.random.randint(1000, 1200) for _ in range(n)])
2481
+ data['Next_Review_In_Days'] = review_days
2482
+
2483
+ # For reference, also store the actual dates as strings instead of datetime64
2484
+ data['Hire_Date_Str'] = [str(base_date + np.timedelta64(int(days), 'D')) for days in hire_days]
2485
+ data['Last_Promotion_Date_Str'] = [str(base_date + np.timedelta64(int(days), 'D')) for days in promotion_days]
2486
+ data['Review_Date_Str'] = [str(base_date + np.timedelta64(int(days), 'D')) for days in review_days]
2487
+
2488
+ # Binary columns
2489
+ data['IsManager'] = np.random.choice([0, 1], n, p=[0.8, 0.2])
2490
+ data['RemoteWorker'] = np.random.choice([0, 1], n)
2491
+ data['HasHealthInsurance'] = np.random.choice([0, 1], n, p=[0.1, 0.9])
2492
+ data['HasRetirementPlan'] = np.random.choice([0, 1], n, p=[0.15, 0.85])
2493
+
2494
+ # Columns with missing values
2495
+ data['OptionalMetric_1'] = np.random.normal(50, 10, n)
2496
+ data['OptionalMetric_1'][np.random.choice([True, False], n, p=[0.2, 0.8])] = np.nan
2497
+
2498
+ data['OptionalMetric_2'] = np.random.normal(100, 20, n)
2499
+ data['OptionalMetric_2'][np.random.choice([True, False], n, p=[0.3, 0.7])] = np.nan
2500
+
2501
+ data['OptionalCategory'] = np.random.choice(['Option1', 'Option2', 'Option3', None], n, p=[0.3, 0.3, 0.3, 0.1])
2502
+
2503
+ # High cardinality column (like an ID)
2504
+ data['ID'] = [f"ID_{i:06d}" for i in range(n)]
2505
+
2506
+ # Create the DataFrame with 40 columns
2507
+ df = pd.DataFrame(data)
2508
+
2509
+ print(f"Created sample DataFrame with {len(df.columns)} columns and {len(df)} rows")
2510
+ print("Columns:", ', '.join(df.columns))
2511
+ print("Launching PyQt6 Column Profiler application...")
2512
+ visualize_profile(df, 'Salary') # Start with Salary analysis
2513
+
2514
+ if __name__ == "__main__":
2515
+ test_profile()