sqlshell-0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54):
  1. sqlshell/__init__.py +84 -0
  2. sqlshell/__main__.py +4926 -0
  3. sqlshell/ai_autocomplete.py +392 -0
  4. sqlshell/ai_settings_dialog.py +337 -0
  5. sqlshell/context_suggester.py +768 -0
  6. sqlshell/create_test_data.py +152 -0
  7. sqlshell/data/create_test_data.py +137 -0
  8. sqlshell/db/__init__.py +6 -0
  9. sqlshell/db/database_manager.py +1318 -0
  10. sqlshell/db/export_manager.py +188 -0
  11. sqlshell/editor.py +1166 -0
  12. sqlshell/editor_integration.py +127 -0
  13. sqlshell/execution_handler.py +421 -0
  14. sqlshell/menus.py +262 -0
  15. sqlshell/notification_manager.py +370 -0
  16. sqlshell/query_tab.py +904 -0
  17. sqlshell/resources/__init__.py +1 -0
  18. sqlshell/resources/icon.png +0 -0
  19. sqlshell/resources/logo_large.png +0 -0
  20. sqlshell/resources/logo_medium.png +0 -0
  21. sqlshell/resources/logo_small.png +0 -0
  22. sqlshell/resources/splash_screen.gif +0 -0
  23. sqlshell/space_invaders.py +501 -0
  24. sqlshell/splash_screen.py +405 -0
  25. sqlshell/sqlshell/__init__.py +5 -0
  26. sqlshell/sqlshell/create_test_data.py +118 -0
  27. sqlshell/sqlshell/create_test_databases.py +96 -0
  28. sqlshell/sqlshell_demo.png +0 -0
  29. sqlshell/styles.py +257 -0
  30. sqlshell/suggester_integration.py +330 -0
  31. sqlshell/syntax_highlighter.py +124 -0
  32. sqlshell/table_list.py +996 -0
  33. sqlshell/ui/__init__.py +6 -0
  34. sqlshell/ui/bar_chart_delegate.py +49 -0
  35. sqlshell/ui/filter_header.py +469 -0
  36. sqlshell/utils/__init__.py +16 -0
  37. sqlshell/utils/profile_cn2.py +1661 -0
  38. sqlshell/utils/profile_column.py +2635 -0
  39. sqlshell/utils/profile_distributions.py +616 -0
  40. sqlshell/utils/profile_entropy.py +347 -0
  41. sqlshell/utils/profile_foreign_keys.py +779 -0
  42. sqlshell/utils/profile_keys.py +2834 -0
  43. sqlshell/utils/profile_ohe.py +934 -0
  44. sqlshell/utils/profile_ohe_advanced.py +754 -0
  45. sqlshell/utils/profile_ohe_comparison.py +237 -0
  46. sqlshell/utils/profile_prediction.py +926 -0
  47. sqlshell/utils/profile_similarity.py +876 -0
  48. sqlshell/utils/search_in_df.py +90 -0
  49. sqlshell/widgets.py +400 -0
  50. sqlshell-0.4.4.dist-info/METADATA +441 -0
  51. sqlshell-0.4.4.dist-info/RECORD +54 -0
  52. sqlshell-0.4.4.dist-info/WHEEL +5 -0
  53. sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
  54. sqlshell-0.4.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,876 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import warnings
4
+ warnings.filterwarnings('ignore')
5
+
6
+ # Try to import optional dependencies
7
+ try:
8
+ import matplotlib
9
+ try:
10
+ matplotlib.use('qtagg') # Set the backend before importing pyplot
11
+ except ImportError:
12
+ matplotlib.use('Agg') # Fall back to headless backend for CI/testing
13
+ import matplotlib.pyplot as plt
14
+ from matplotlib.figure import Figure
15
+ from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg
16
+ MATPLOTLIB_AVAILABLE = True
17
+ except ImportError:
18
+ MATPLOTLIB_AVAILABLE = False
19
+ print("Warning: matplotlib not available, visualizations will be limited")
20
+
21
+ try:
22
+ from PyQt6.QtCore import QObject, pyqtSignal, Qt
23
+ from PyQt6.QtWidgets import (
24
+ QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
25
+ QTableView, QHeaderView, QLabel, QFrame, QScrollArea, QTabWidget,
26
+ QComboBox, QPushButton, QSplitter, QMessageBox
27
+ )
28
+ from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QBrush
29
+ PYQT6_AVAILABLE = True
30
+ except ImportError:
31
+ PYQT6_AVAILABLE = False
32
+ print("Warning: PyQt6 not available, using basic QObject substitute")
33
+
34
+ # Create a basic substitute for QObject when PyQt6 is not available
35
+ class QObject:
36
+ def __init__(self):
37
+ pass
38
+
39
+ class pyqtSignal:
40
+ def __init__(self, *args):
41
+ pass
42
+ def emit(self, *args):
43
+ pass
44
+
45
+ try:
46
+ import seaborn as sns
47
+ SEABORN_AVAILABLE = True
48
+ except ImportError:
49
+ SEABORN_AVAILABLE = False
50
+ print("Warning: seaborn not available")
51
+
52
+ try:
53
+ from scipy.spatial.distance import euclidean, pdist, squareform
54
+ from scipy.stats import zscore
55
+ SCIPY_AVAILABLE = True
56
+ except ImportError:
57
+ SCIPY_AVAILABLE = False
58
+ print("Warning: scipy not available, using numpy alternatives")
59
+
60
+ try:
61
+ from sklearn.preprocessing import StandardScaler
62
+ from sklearn.decomposition import PCA
63
+ SKLEARN_AVAILABLE = True
64
+ except ImportError:
65
+ SKLEARN_AVAILABLE = False
66
+ print("Warning: sklearn not available, PCA analysis will be limited")
67
+
68
+
69
class SimilarityProfiler(QObject):
    """Analyze similarity between rows and between columns of a DataFrame.

    The analysis works on the numerical columns only. It produces per-column
    z-scores, pairwise euclidean distance matrices (rows and columns), the
    closest / farthest pairs from each matrix, and a PCA summary.

    Progress is reported through ``progress_updated(percent, message)``.
    """

    progress_updated = pyqtSignal(int, str)  # Signal for progress reporting

    def __init__(self):
        super().__init__()
        self.similarity_results = {}
        self.z_scores = None
        # Declared for callers that inspect it; nothing in this class assigns it.
        self.distance_matrix = None
        self.numerical_columns = []
        # Populated by profile(); declared here so the attribute always exists.
        self.original_df = None

    def profile(self, df):
        """
        Perform similarity analysis on the dataframe

        Args:
            df (pd.DataFrame): Input dataframe

        Returns:
            dict: Dictionary containing similarity analysis results, or a
                dict with a single ``"error"`` key when the input is empty
                or has no usable numerical column.
        """
        self.progress_updated.emit(10, "Starting similarity analysis...")

        if df is None or df.empty:
            return {"error": "Empty or invalid dataframe"}

        # Store original dataframe
        self.original_df = df.copy()

        # Keep numerical columns only. Columns that are entirely NaN cannot
        # be mean-imputed below and would turn every row distance into NaN,
        # so they are excluded from the analysis.
        numerical_df = df.select_dtypes(include=[np.number]).copy()
        numerical_df = numerical_df.dropna(axis=1, how='all')
        self.numerical_columns = numerical_df.columns.tolist()

        if not self.numerical_columns:
            return {"error": "No numerical columns found for similarity analysis"}

        self.progress_updated.emit(20, "Computing z-scores...")

        # Handle missing values by imputing the column mean
        numerical_df = numerical_df.fillna(numerical_df.mean())

        self.z_scores = self._compute_z_scores(numerical_df)

        self.progress_updated.emit(40, "Computing distance matrices...")

        # Euclidean distance between rows and between columns (features)
        row_distances = self._calculate_row_distances(numerical_df)
        col_distances = self._calculate_column_distances(numerical_df)

        self.progress_updated.emit(60, "Analyzing similarity patterns...")

        # Find most similar and dissimilar pairs
        similar_rows, dissimilar_rows = self._find_extreme_pairs(row_distances, 'rows')
        similar_cols, dissimilar_cols = self._find_extreme_pairs(col_distances, 'columns')

        self.progress_updated.emit(80, "Computing cluster analysis...")

        # Perform basic clustering analysis
        cluster_info = self._analyze_clusters(numerical_df)

        self.progress_updated.emit(90, "Finalizing results...")

        # Store results
        self.similarity_results = {
            'z_scores': self.z_scores,
            'row_distances': row_distances,
            'column_distances': col_distances,
            'similar_rows': similar_rows,
            'dissimilar_rows': dissimilar_rows,
            'similar_columns': similar_cols,
            'dissimilar_columns': dissimilar_cols,
            'cluster_info': cluster_info,
            'numerical_columns': self.numerical_columns,
            'original_shape': df.shape,
            'processed_shape': numerical_df.shape
        }

        self.progress_updated.emit(100, "Similarity analysis complete!")

        return self.similarity_results

    @staticmethod
    def _compute_z_scores(numerical_df):
        """Return per-column z-scores for an already-imputed numerical frame."""
        if SCIPY_AVAILABLE:
            z_scores = numerical_df.apply(zscore, nan_policy='omit')
        else:
            # Population std (ddof=0) so the fallback agrees with
            # scipy.stats.zscore, whose default is also ddof=0.
            z_scores = (numerical_df - numerical_df.mean()) / numerical_df.std(ddof=0)

        # A constant column has zero variance, so the division above yields
        # NaN. Every value equals the mean, hence its z-score is 0.
        constant = (numerical_df.nunique() <= 1).to_numpy()
        if constant.any():
            z_scores.loc[:, constant] = 0.0
        return z_scores

    @staticmethod
    def _standardize(frame):
        """Column-wise standardisation used when sklearn/scipy are missing.

        Uses the population std and replaces a zero std by 1 so that
        constant columns become all-zero instead of NaN.
        """
        values = frame.to_numpy(dtype=float)
        centered = values - values.mean(axis=0)
        scale = values.std(axis=0)
        scale[scale == 0] = 1.0
        return centered / scale

    @staticmethod
    def _pairwise_euclidean(values):
        """Return the symmetric euclidean distance matrix of the rows of ``values``."""
        n_items = len(values)
        matrix = np.zeros((n_items, n_items))
        # One vectorised step per row instead of one Python-level call per pair.
        for i in range(n_items - 1):
            dist = np.sqrt(((values[i + 1:] - values[i]) ** 2).sum(axis=1))
            matrix[i, i + 1:] = dist
            matrix[i + 1:, i] = dist
        return matrix

    def _standardized_distances(self, frame):
        """Standardise ``frame`` column-wise and return distances between its rows.

        The result is a square DataFrame labelled with ``frame.index`` on
        both axes.
        """
        if SKLEARN_AVAILABLE and SCIPY_AVAILABLE:
            scaled = StandardScaler().fit_transform(frame)
            matrix = squareform(pdist(scaled, metric='euclidean'))
        else:
            matrix = self._pairwise_euclidean(self._standardize(frame))
        return pd.DataFrame(matrix, index=frame.index, columns=frame.index)

    def _calculate_row_distances(self, df):
        """Calculate euclidean distances between all pairs of rows.

        Returns an empty DataFrame (after printing the error) on failure.
        """
        try:
            return self._standardized_distances(df)
        except Exception as e:
            # Best effort: the rest of the profile is still useful without it.
            print(f"Error calculating row distances: {e}")
            return pd.DataFrame()

    def _calculate_column_distances(self, df):
        """Calculate euclidean distances between all pairs of columns.

        The frame is transposed so columns become observations; note that
        the standardisation is then applied to the transposed frame, i.e.
        each original row is scaled across the feature columns.

        Returns an empty DataFrame (after printing the error) on failure.
        """
        try:
            return self._standardized_distances(df.T)
        except Exception as e:
            print(f"Error calculating column distances: {e}")
            return pd.DataFrame()

    def _find_extreme_pairs(self, distance_matrix, pair_type='rows', top_n=5):
        """Find most similar and dissimilar pairs from distance matrix.

        Args:
            distance_matrix (pd.DataFrame): Square distance matrix.
            pair_type (str): Unused; kept for interface compatibility.
            top_n (int): Number of pairs to return for each extreme.

        Returns:
            tuple: ``(similar_pairs, dissimilar_pairs)``, each a list of
                ``(label_a, label_b)`` tuples.
        """
        if distance_matrix.empty:
            return [], []

        # Keep the strict upper triangle (no duplicates, no self-comparisons)
        mask = np.triu(np.ones(distance_matrix.shape, dtype=bool), k=1)

        # Drop the masked cells explicitly rather than relying on stack()
        # discarding NaN, which depends on the stack implementation in use.
        flat_distances = distance_matrix.where(mask).stack().dropna()

        if flat_distances.empty:
            return [], []

        similar_pairs = flat_distances.nsmallest(top_n).index.tolist()
        dissimilar_pairs = flat_distances.nlargest(top_n).index.tolist()

        return similar_pairs, dissimilar_pairs

    def _analyze_clusters(self, df):
        """Perform basic clustering analysis using PCA.

        Returns a dict with the PCA projection and variance figures, or a
        dict with an ``"error"`` key when PCA cannot be run.
        """
        try:
            if df.shape[1] < 2:
                return {"error": "Need at least 2 numerical columns for clustering"}

            if df.shape[0] < 2:
                return {"error": "Need at least 2 rows for clustering"}

            if not SKLEARN_AVAILABLE:
                return {"error": "sklearn not available for PCA analysis"}

            # Standardize data
            scaled_data = StandardScaler().fit_transform(df)

            # At most 3 components, and never more than PCA can extract
            # from the available rows/columns.
            n_components = min(3, df.shape[0], df.shape[1])
            pca = PCA(n_components=n_components)
            pca_result = pca.fit_transform(scaled_data)

            explained_variance = pca.explained_variance_ratio_

            return {
                'pca_components': pca_result,
                'explained_variance': explained_variance,
                'cumulative_variance': np.cumsum(explained_variance),
                'n_components': n_components,
                'feature_importance': pca.components_
            }
        except Exception as e:
            return {"error": f"Clustering analysis failed: {e}"}
287
+
288
+
289
def visualize_profile(df, profiler_results=None, force_text_mode=False, show_window=True):
    """
    Visualize the similarity profiling results

    Args:
        df (pd.DataFrame): Original dataframe
        profiler_results (dict): Results from SimilarityProfiler.profile();
            computed from ``df`` when omitted
        force_text_mode (bool): Force text mode even if GUI is available
        show_window (bool): Whether to show the window (for standalone usage).
            When the process runs as a non-interactive script this also
            starts the Qt event loop and blocks until it exits.

    Returns:
        QWidget or dict: Widget containing the visualization, or the results
            dict when the results carry an error or text mode is used
    """
    # If no results provided, run the profiler
    if profiler_results is None:
        profiler_results = SimilarityProfiler().profile(df)

    if "error" in profiler_results:
        print(f"Error: {profiler_results['error']}")
        return profiler_results

    # Text mode: print a summary and hand the raw results back
    if force_text_mode or not PYQT6_AVAILABLE:
        if not PYQT6_AVAILABLE:
            print("PyQt6 not available - providing text summary:")
        else:
            print("Text mode requested - providing text summary:")
        _print_text_summary(profiler_results)
        return profiler_results

    # Ensure QApplication exists (for standalone usage)
    app = QApplication.instance()
    if app is None:
        app = QApplication([])
        # Keep a reference beyond this call so the application object stays
        # alive for the returned widget.
        # NOTE(review): PyQt may destroy a QApplication whose Python wrapper
        # is garbage-collected - confirm.
        visualize_profile._standalone_app = app

    main_widget = QWidget()
    main_layout = QVBoxLayout()
    tab_widget = QTabWidget()

    def add_tab(tab, title):
        # The builders return None when a required GUI/plot library is missing.
        if tab is not None:
            tab_widget.addTab(tab, title)

    has_z_scores = _has_nonempty_frame(profiler_results, 'z_scores')

    # Tab 1: Z-scores heatmap
    if has_z_scores:
        add_tab(_create_zscore_visualization(profiler_results['z_scores']),
                "Z-Scores Heatmap")

    # Tab 2: Row similarity matrix
    if _has_nonempty_frame(profiler_results, 'row_distances'):
        add_tab(
            _create_distance_visualization(
                profiler_results['row_distances'], "Row Similarity Matrix"),
            "Row Similarities")

    # Tab 3: Column similarity matrix
    if _has_nonempty_frame(profiler_results, 'column_distances'):
        add_tab(
            _create_distance_visualization(
                profiler_results['column_distances'], "Column Similarity Matrix"),
            "Column Similarities")

    # Tab 4: PCA visualization. Also built when the PCA step reported an
    # error, so the error text produced by _create_pca_visualization is shown
    # instead of the tab silently disappearing.
    cluster_info = profiler_results.get('cluster_info')
    if cluster_info and ('pca_components' in cluster_info or 'error' in cluster_info):
        add_tab(_create_pca_visualization(cluster_info), "PCA Analysis")

    # Tab 5: Data Preview with unusual rows highlighted
    if has_z_scores:
        add_tab(_create_data_preview_tab(df, profiler_results), "Data Preview")

    # Tab 6: Summary statistics
    add_tab(_create_summary_tab(profiler_results), "Summary")

    main_layout.addWidget(tab_widget)
    main_widget.setLayout(main_layout)

    # Set window properties
    main_widget.setWindowTitle("Similarity Analysis Results")
    main_widget.resize(1000, 700)

    # Show the window if requested (for standalone usage)
    if show_window:
        main_widget.show()

        import sys
        import __main__

        # Only start the event loop when running as a script: __main__ has a
        # __file__ and there is no interactive prompt (sys.ps1).
        if hasattr(__main__, '__file__') and not hasattr(sys, 'ps1'):
            try:
                app.exec()
            except RuntimeError:
                # Event loop might already be running, that's okay
                pass

    return main_widget


def _has_nonempty_frame(results, key):
    """Return True when ``results[key]`` is present, not None and not empty."""
    frame = results.get(key)
    return frame is not None and not frame.empty
404
+
405
+
406
+ def _print_text_summary(results):
407
+ """Print a text summary when GUI is not available"""
408
+ print("\n" + "="*50)
409
+ print("SIMILARITY ANALYSIS SUMMARY")
410
+ print("="*50)
411
+
412
+ print(f"Dataset shape: {results.get('original_shape', 'N/A')}")
413
+ print(f"Numerical columns: {len(results.get('numerical_columns', []))}")
414
+
415
+ if 'similar_rows' in results and results['similar_rows']:
416
+ print("\nMost similar row pairs:")
417
+ for i, pair in enumerate(results['similar_rows'][:3]):
418
+ print(f" {i+1}. Row {pair[0]} ↔ Row {pair[1]}")
419
+
420
+ if 'similar_columns' in results and results['similar_columns']:
421
+ print("\nMost similar column pairs:")
422
+ for i, pair in enumerate(results['similar_columns'][:3]):
423
+ print(f" {i+1}. {pair[0]} ↔ {pair[1]}")
424
+
425
+ if 'cluster_info' in results and 'explained_variance' in results['cluster_info']:
426
+ cluster_info = results['cluster_info']
427
+ print(f"\nPCA Analysis:")
428
+ print(f" Components: {cluster_info['n_components']}")
429
+ print(f" Total variance explained: {cluster_info['cumulative_variance'][-1]:.1%}")
430
+
431
+ print("="*50)
432
+
433
+
434
def _create_zscore_visualization(z_scores):
    """Build a widget showing the z-score table as a heatmap.

    Returns None when either PyQt6 or matplotlib is unavailable. Tick labels
    are drawn only for axes with at most 20 entries; larger axes get a label
    stating how many entries they hold.
    """
    if not (PYQT6_AVAILABLE and MATPLOTLIB_AVAILABLE):
        return None

    figure = Figure(figsize=(12, 8))
    canvas = FigureCanvasQTAgg(figure)
    axes = figure.add_subplot(111)

    # Colour scale is clamped to [-3, 3] standard deviations
    heatmap = axes.imshow(z_scores.values, cmap='RdBu_r', aspect='auto', vmin=-3, vmax=3)

    axes.set_title('Z-Scores Heatmap\n(Blue: Below average, Red: Above average)', fontsize=14)
    axes.set_xlabel('Columns')
    axes.set_ylabel('Rows')

    column_count = len(z_scores.columns)
    if column_count > 20:
        axes.set_xticks([])
        axes.set_xlabel(f'Columns (showing {column_count} columns)')
    else:
        axes.set_xticks(range(column_count))
        axes.set_xticklabels(z_scores.columns, rotation=45, ha='right')

    row_count = len(z_scores.index)
    if row_count > 20:
        axes.set_yticks([])
        axes.set_ylabel(f'Rows (showing {row_count} rows)')
    else:
        axes.set_yticks(range(row_count))
        axes.set_yticklabels(z_scores.index)

    colorbar = figure.colorbar(heatmap, ax=axes, shrink=0.8)
    colorbar.set_label('Z-Score', rotation=270, labelpad=15)

    figure.tight_layout()

    # Wrap the canvas in a plain widget so it can be added as a tab
    container = QWidget()
    container_layout = QVBoxLayout()
    container_layout.addWidget(canvas)
    container.setLayout(container_layout)

    return container
481
+
482
+
483
def _create_distance_visualization(distance_matrix, title):
    """Create distance matrix visualization.

    Args:
        distance_matrix (pd.DataFrame): Square matrix of pairwise distances.
        title (str): First line of the plot title.

    Returns:
        QWidget or None: Widget holding the heatmap, or None when PyQt6 or
            matplotlib is unavailable.
    """
    if not PYQT6_AVAILABLE or not MATPLOTLIB_AVAILABLE:
        return None

    widget = QWidget()
    layout = QVBoxLayout()

    # Create matplotlib figure
    fig = Figure(figsize=(10, 8))
    canvas = FigureCanvasQTAgg(fig)

    ax = fig.add_subplot(111)

    # 'viridis' runs from dark (low values) to light (high values), so small
    # distances - i.e. high similarity - are drawn dark, as the title states.
    # The reversed map ('viridis_r') would show the opposite.
    im = ax.imshow(distance_matrix.values, cmap='viridis', aspect='auto')

    # Set labels
    ax.set_title(f'{title}\n(Darker colors indicate higher similarity)', fontsize=14)

    # Tick labels are only legible for small matrices
    if len(distance_matrix.index) <= 15:
        ax.set_xticks(range(len(distance_matrix.columns)))
        ax.set_xticklabels(distance_matrix.columns, rotation=45, ha='right')
        ax.set_yticks(range(len(distance_matrix.index)))
        ax.set_yticklabels(distance_matrix.index)
    else:
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlabel(f'Showing {len(distance_matrix.columns)} items')
        ax.set_ylabel(f'Showing {len(distance_matrix.index)} items')

    # Add colorbar
    cbar = fig.colorbar(im, ax=ax, shrink=0.8)
    cbar.set_label('Euclidean Distance', rotation=270, labelpad=15)

    fig.tight_layout()

    layout.addWidget(canvas)
    widget.setLayout(layout)

    return widget
525
+
526
+
527
def _create_pca_visualization(cluster_info):
    """Create PCA visualization.

    Args:
        cluster_info (dict): Either the PCA result dict (keys
            ``pca_components``, ``explained_variance``, ``n_components``) or
            a dict with an ``error`` key.

    Returns:
        QWidget or None: A widget with the plots, or with the error text when
            ``cluster_info`` carries an error. None when PyQt6 is missing, or
            when plots are needed and matplotlib is missing.
    """
    if not PYQT6_AVAILABLE:
        return None

    # The error message needs only Qt widgets, so it is shown even when
    # matplotlib is not installed.
    if 'error' in cluster_info:
        widget = QWidget()
        layout = QVBoxLayout()
        error_label = QLabel(f"PCA Error: {cluster_info['error']}")
        error_label.setStyleSheet("color: red;")
        layout.addWidget(error_label)
        widget.setLayout(layout)
        return widget

    if not MATPLOTLIB_AVAILABLE:
        return None

    widget = QWidget()
    layout = QVBoxLayout()

    # Create matplotlib figure
    fig = Figure(figsize=(12, 8))
    canvas = FigureCanvasQTAgg(fig)

    pca_components = cluster_info['pca_components']
    explained_variance = cluster_info['explained_variance']
    n_components = cluster_info['n_components']

    if n_components >= 2:
        # Scatter of the first two components, coloured by row position
        ax1 = fig.add_subplot(121)
        scatter = ax1.scatter(pca_components[:, 0], pca_components[:, 1],
                              c=range(len(pca_components)), cmap='viridis', alpha=0.7)
        ax1.set_xlabel(f'PC1 ({explained_variance[0]:.1%} variance)')
        ax1.set_ylabel(f'PC2 ({explained_variance[1]:.1%} variance)')
        ax1.set_title('PCA: First Two Components')
        ax1.grid(True, alpha=0.3)

        cbar = fig.colorbar(scatter, ax=ax1, shrink=0.8)
        cbar.set_label('Row Index')

        # The variance plot shares the figure with the scatter plot
        ax2 = fig.add_subplot(122)
    else:
        # Only one component: the variance plot takes the whole figure
        ax2 = fig.add_subplot(111)

    # Per-component variance as bars, cumulative variance as a line
    component_numbers = range(1, len(explained_variance) + 1)
    ax2.bar(component_numbers, explained_variance, alpha=0.7)
    ax2.plot(component_numbers, np.cumsum(explained_variance),
             'ro-', linewidth=2, markersize=6)
    ax2.set_xlabel('Principal Component')
    ax2.set_ylabel('Variance Explained')
    ax2.set_title('PCA Variance Explained')
    ax2.grid(True, alpha=0.3)
    ax2.set_xticks(component_numbers)

    fig.tight_layout()

    layout.addWidget(canvas)
    widget.setLayout(layout)

    return widget
585
+
586
+
587
def _create_summary_tab(results):
    """Create summary statistics tab.

    Args:
        results (dict): Results from SimilarityProfiler.profile().

    Returns:
        QWidget or None: Scrollable widget with dataset information, up to
            three of the most similar row and column pairs, and the PCA
            totals when present. None when PyQt6 is unavailable.
    """
    if not PYQT6_AVAILABLE:
        return None

    from html import escape

    def as_markup(value):
        # Labels come from the user's data and are placed inside rich text;
        # escape them so characters such as '<' or '&' are shown literally.
        return escape(str(value), quote=False)

    widget = QWidget()
    layout = QVBoxLayout()

    # Create scroll area for the summary
    scroll = QScrollArea()
    scroll_widget = QWidget()
    scroll_layout = QVBoxLayout()

    def add_section(markup):
        label = QLabel(markup)
        label.setWordWrap(True)
        scroll_layout.addWidget(label)

    # Basic information
    info_label = QLabel("<h3>Similarity Analysis Summary</h3>")
    info_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
    scroll_layout.addWidget(info_label)

    # Dataset info
    add_section(
        "<b>Dataset Information:</b><br>"
        f"• Original shape: {as_markup(results.get('original_shape', 'N/A'))}<br>"
        f"• Processed shape: {as_markup(results.get('processed_shape', 'N/A'))}<br>"
        f"• Numerical columns analyzed: {len(results.get('numerical_columns', []))}<br>"
    )

    # Similar pairs information
    similar_rows = results.get('similar_rows')
    if similar_rows:
        add_section(
            "<b>Most Similar Row Pairs:</b><br>"
            + "".join(
                f"• Row {as_markup(pair[0])} ↔ Row {as_markup(pair[1])}<br>"
                for pair in similar_rows[:3]
            )
        )

    similar_columns = results.get('similar_columns')
    if similar_columns:
        add_section(
            "<b>Most Similar Column Pairs:</b><br>"
            + "".join(
                f"• {as_markup(pair[0])} ↔ {as_markup(pair[1])}<br>"
                for pair in similar_columns[:3]
            )
        )

    # PCA information
    if 'cluster_info' in results and 'explained_variance' in results['cluster_info']:
        cluster_info = results['cluster_info']
        add_section(
            "<b>PCA Analysis:</b><br>"
            f"• Components: {cluster_info['n_components']}<br>"
            f"• Total variance explained: {cluster_info['cumulative_variance'][-1]:.1%}<br>"
            f"• First component: {cluster_info['explained_variance'][0]:.1%}<br>"
        )

    scroll_widget.setLayout(scroll_layout)
    scroll.setWidget(scroll_widget)
    scroll.setWidgetResizable(True)

    layout.addWidget(scroll)
    widget.setLayout(layout)

    return widget
658
+
659
+
660
def _create_data_preview_tab(original_df, results):
    """Create data preview tab with unusual rows highlighted.

    Rows are ranked by an "unusualness" score - the sum of the absolute
    z-scores of the row - and the 50 highest-scoring rows are shown in a
    table whose cells are coloured by score / z-score.

    Args:
        original_df (pd.DataFrame): The dataframe that was profiled.
        results (dict): Profiling results; ``results['z_scores']`` is read
            and is taken to have one row per row of ``original_df``, in the
            same order (this is how SimilarityProfiler.profile builds it).

    Returns:
        QWidget or None: The tab widget, or None when PyQt6 is unavailable.
    """
    if not PYQT6_AVAILABLE:
        return None

    from html import escape

    widget = QWidget()
    layout = QVBoxLayout()

    # Create header label
    header_label = QLabel("<h3>Data Preview - Unusual Rows Highlighted</h3>")
    header_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
    layout.addWidget(header_label)

    # Unusualness = sum of absolute z-scores of the row
    z_scores = results['z_scores']
    unusualness_scores = z_scores.abs().sum(axis=1)

    # Rank rows by position rather than by index label, so dataframes with
    # duplicate index labels are handled too. Most unusual first.
    order = np.argsort(-unusualness_scores.to_numpy(dtype=float), kind='stable')

    # Limit to top 50 rows for performance
    display_rows = min(50, len(order))
    top_positions = order[:display_rows]

    top_unusualness = unusualness_scores.iloc[top_positions]
    top_z_scores = z_scores.iloc[top_positions]
    display_df = original_df.iloc[top_positions].copy()

    # Score goes in the first column. allow_duplicates keeps this working
    # when the data already has a column with the same name.
    display_df.insert(0, 'Unusualness_Score', top_unusualness.round(2).to_numpy(),
                      allow_duplicates=True)

    # Positional lookup tables for the per-cell z-score colouring
    z_values = top_z_scores.to_numpy(dtype=float)
    z_position = {name: pos for pos, name in enumerate(top_z_scores.columns)}

    red = QColor(255, 100, 100)
    orange = QColor(255, 200, 100)

    # Create table view
    table_view = QTableView()
    model = QStandardItemModel()

    # Set headers
    headers = ['Row Index'] + list(display_df.columns)
    model.setHorizontalHeaderLabels(headers)

    # Populate table with data and coloring
    for row_idx, (orig_idx, row) in enumerate(display_df.iterrows()):
        # Add original row index as first column
        index_item = QStandardItem(str(orig_idx))
        index_item.setBackground(QBrush(QColor(240, 240, 240)))  # Light gray background
        model.setItem(row_idx, 0, index_item)

        # Add data columns
        for col_idx, (col_name, value) in enumerate(row.items()):
            item = QStandardItem(str(value))
            color = None

            if col_idx == 0:
                # The inserted unusualness score column
                unusualness = float(value)
                if unusualness > 6:  # Very unusual
                    color = red
                elif unusualness > 4:  # Unusual
                    color = orange
                elif unusualness > 2:  # Somewhat unusual
                    color = QColor(255, 255, 100)  # Yellow
                else:  # Normal
                    color = QColor(200, 255, 200)  # Light green
            elif col_name in z_position:
                # Data column that has a z-score
                magnitude = abs(z_values[row_idx, z_position[col_name]])
                if magnitude > 3:  # Extreme outlier
                    color = red
                elif magnitude > 2:  # Outlier
                    color = orange
                elif magnitude > 1:  # Somewhat unusual
                    color = QColor(255, 255, 200)  # Light yellow
                # Normal values get no special coloring

            if color is not None:
                item.setBackground(QBrush(color))

            model.setItem(row_idx, col_idx + 1, item)

    table_view.setModel(model)

    # Configure table appearance
    table_view.setAlternatingRowColors(True)
    # Enabling sorting immediately sorts by the header's current sort
    # indicator. Clear the indicator first so the rows stay in
    # "most unusual first" order until the user clicks a header.
    table_view.horizontalHeader().setSortIndicator(-1, Qt.SortOrder.AscendingOrder)
    table_view.setSortingEnabled(True)
    table_view.horizontalHeader().setStretchLastSection(True)
    table_view.resizeColumnsToContents()

    # Create info panel
    info_text = (
        "<b>Data Preview Information:</b><br>"
        f"• Showing top {display_rows} most unusual rows (out of {len(original_df)})<br>"
        "• Rows sorted by unusualness score (sum of absolute z-scores)<br>"
        "• <span style='background-color: #ff6464; padding: 2px;'>Red</span>: "
        "Extreme values (|z-score| &gt; 3 or unusualness &gt; 6)<br>"
        "• <span style='background-color: #ffc864; padding: 2px;'>Orange</span>: "
        "Outliers (|z-score| &gt; 2 or unusualness &gt; 4)<br>"
        "• <span style='background-color: #ffff64; padding: 2px;'>Yellow</span>: "
        "Somewhat unusual (|z-score| &gt; 1 or unusualness &gt; 2)<br>"
        "• <span style='background-color: #c8ffc8; padding: 2px;'>Light Green</span>: "
        "Normal unusualness score<br>"
        "• White: Normal values<br><br>"
        "<b>Most Unusual Rows:</b><br>"
    )

    # Add top 5 most unusual rows info; index labels are escaped because
    # they are user data placed inside rich text.
    for label, score in list(top_unusualness.items())[:5]:
        info_text += f"• Row {escape(str(label), quote=False)}: unusualness score {score:.2f}<br>"

    info_label = QLabel(info_text)
    info_label.setWordWrap(True)
    info_label.setMaximumHeight(200)
    info_label.setStyleSheet("QLabel { background-color: #f0f0f0; padding: 10px; border: 1px solid #ccc; }")

    # Add widgets to layout
    layout.addWidget(info_label)
    layout.addWidget(table_view)

    widget.setLayout(layout)
    return widget
778
+
779
+
780
def demo_similarity_analysis():
    """
    Demo function to showcase the similarity analysis capabilities
    Creates a sample dataset and demonstrates both analysis and visualization

    Returns:
        tuple: ``(sample_df, results, widget)`` - the generated dataframe,
            the profiler results and the object returned by
            ``visualize_profile(..., show_window=False)``.
    """
    print("Running Similarity Analysis Demo...")

    # Local generator: reproducible without reseeding NumPy's global RNG
    rng = np.random.default_rng(42)
    n_rows = 80

    # Create sample data for testing
    sample_df = pd.DataFrame({
        'revenue': rng.normal(1000, 200, n_rows),
        'marketing_cost': rng.normal(500, 100, n_rows),
        'customer_satisfaction': rng.normal(4.0, 0.5, n_rows),
        'product_sales': rng.normal(150, 30, n_rows)
    })

    # Create some correlations
    sample_df['product_sales'] = sample_df['revenue'] * 0.15 + rng.normal(0, 20, n_rows)
    sample_df['marketing_cost'] = sample_df['revenue'] * 0.4 + rng.normal(0, 50, n_rows)

    # Add some similar rows for testing similarity detection. The noise is
    # scaled by each column's own spread: a fixed absolute noise would dwarf
    # small-scale columns such as customer_satisfaction and turn the copies
    # into outliers instead of near-duplicates.
    column_spread = sample_df.std().to_numpy()
    base_row = sample_df.iloc[10].to_numpy()
    n_columns = sample_df.shape[1]
    sample_df.iloc[40] = base_row + rng.normal(0, 0.05, n_columns) * column_spread
    sample_df.iloc[41] = base_row + rng.normal(0, 0.10, n_columns) * column_spread

    print(f"Created sample dataset: {sample_df.shape}")
    print(f"Columns: {list(sample_df.columns)}")

    # Test the profiler
    profiler = SimilarityProfiler()
    results = profiler.profile(sample_df)

    print("\nAnalysis Results:")
    print(f"✓ Analyzed {len(results.get('numerical_columns', []))} numerical columns")
    print(f"✓ Dataset shape: {results.get('original_shape', 'N/A')}")
    print(f"✓ Found {len(results.get('similar_rows', []))} similar row pairs")
    print(f"✓ Found {len(results.get('similar_columns', []))} similar column pairs")

    # Show most similar pairs
    if results.get('similar_rows'):
        print("\nMost similar rows:")
        for rank, pair in enumerate(results['similar_rows'][:3], start=1):
            distance = results['row_distances'].loc[pair[0], pair[1]]
            print(f" {rank}. Row {pair[0]} ↔ Row {pair[1]} (distance: {distance:.3f})")

    if results.get('similar_columns'):
        print("\nMost similar columns:")
        for rank, pair in enumerate(results['similar_columns'][:3], start=1):
            distance = results['column_distances'].loc[pair[0], pair[1]]
            print(f" {rank}. {pair[0]} ↔ {pair[1]} (distance: {distance:.3f})")

    # Demonstrate visualization
    print("\nCreating visualization...")
    print("Available visualization tabs:")
    print(" 1. Z-Scores Heatmap - Shows standardized values")
    print(" 2. Row Similarities - Distance matrix between rows")
    print(" 3. Column Similarities - Distance matrix between columns")
    print(" 4. PCA Analysis - Principal component analysis")
    print(" 5. Data Preview - Dataframe with unusual rows highlighted")
    print(" 6. Summary - Text summary of results")

    # For SQLShell integration (widget only)
    widget = visualize_profile(sample_df, results, show_window=False)
    print(f"✓ Created widget for SQLShell: {type(widget)}")

    # Show the actual visualization window for demo
    print("\n🎯 Opening visualization window...")
    print(" Close the window to continue or press Ctrl+C to exit")

    # This will show the actual GUI window with all tabs
    visualize_profile(sample_df, results, show_window=True)

    return sample_df, results, widget
852
+
853
+
854
# Script entry point: run the demo and print usage instructions
if __name__ == "__main__":
    print("="*60)
    print("SIMILARITY PROFILER DEMO")
    print("="*60)

    try:
        df, results, widget = demo_similarity_analysis()
    except Exception as e:
        print(f"Demo failed: {e}")
        import traceback
        traceback.print_exc()
        # Report the failure through the exit status as well as on screen
        raise SystemExit(1)
    else:
        print("\n" + "="*60)
        print("DEMO COMPLETED SUCCESSFULLY!")
        print("="*60)
        print("\nTo use in your code:")
        print("1. from sqlshell.utils.profile_similarity import SimilarityProfiler, visualize_profile")
        print("2. profiler = SimilarityProfiler()")
        print("3. results = profiler.profile(your_dataframe)")
        # Pass the computed results so the dataframe is not profiled twice
        print("4. widget = visualize_profile(your_dataframe, results, show_window=False) # For SQLShell")
        print("5. visualize_profile(your_dataframe, results, show_window=True) # For standalone")