sqlshell 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,876 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import warnings
|
|
4
|
+
warnings.filterwarnings('ignore')
|
|
5
|
+
|
|
6
|
+
# Try to import optional dependencies
|
|
7
|
+
try:
|
|
8
|
+
import matplotlib
|
|
9
|
+
try:
|
|
10
|
+
matplotlib.use('qtagg') # Set the backend before importing pyplot
|
|
11
|
+
except ImportError:
|
|
12
|
+
matplotlib.use('Agg') # Fall back to headless backend for CI/testing
|
|
13
|
+
import matplotlib.pyplot as plt
|
|
14
|
+
from matplotlib.figure import Figure
|
|
15
|
+
from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg
|
|
16
|
+
MATPLOTLIB_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
MATPLOTLIB_AVAILABLE = False
|
|
19
|
+
print("Warning: matplotlib not available, visualizations will be limited")
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
from PyQt6.QtCore import QObject, pyqtSignal, Qt
|
|
23
|
+
from PyQt6.QtWidgets import (
|
|
24
|
+
QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
|
|
25
|
+
QTableView, QHeaderView, QLabel, QFrame, QScrollArea, QTabWidget,
|
|
26
|
+
QComboBox, QPushButton, QSplitter, QMessageBox
|
|
27
|
+
)
|
|
28
|
+
from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QBrush
|
|
29
|
+
PYQT6_AVAILABLE = True
|
|
30
|
+
except ImportError:
|
|
31
|
+
PYQT6_AVAILABLE = False
|
|
32
|
+
print("Warning: PyQt6 not available, using basic QObject substitute")
|
|
33
|
+
|
|
34
|
+
# Create a basic substitute for QObject when PyQt6 is not available
|
|
35
|
+
class QObject:
|
|
36
|
+
def __init__(self):
|
|
37
|
+
pass
|
|
38
|
+
|
|
39
|
+
class pyqtSignal:
|
|
40
|
+
def __init__(self, *args):
|
|
41
|
+
pass
|
|
42
|
+
def emit(self, *args):
|
|
43
|
+
pass
|
|
44
|
+
|
|
45
|
+
try:
|
|
46
|
+
import seaborn as sns
|
|
47
|
+
SEABORN_AVAILABLE = True
|
|
48
|
+
except ImportError:
|
|
49
|
+
SEABORN_AVAILABLE = False
|
|
50
|
+
print("Warning: seaborn not available")
|
|
51
|
+
|
|
52
|
+
try:
|
|
53
|
+
from scipy.spatial.distance import euclidean, pdist, squareform
|
|
54
|
+
from scipy.stats import zscore
|
|
55
|
+
SCIPY_AVAILABLE = True
|
|
56
|
+
except ImportError:
|
|
57
|
+
SCIPY_AVAILABLE = False
|
|
58
|
+
print("Warning: scipy not available, using numpy alternatives")
|
|
59
|
+
|
|
60
|
+
try:
|
|
61
|
+
from sklearn.preprocessing import StandardScaler
|
|
62
|
+
from sklearn.decomposition import PCA
|
|
63
|
+
SKLEARN_AVAILABLE = True
|
|
64
|
+
except ImportError:
|
|
65
|
+
SKLEARN_AVAILABLE = False
|
|
66
|
+
print("Warning: sklearn not available, PCA analysis will be limited")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class SimilarityProfiler(QObject):
    """Analyze similarity between rows and columns using z-scores and euclidean distance.

    Only numerical columns are analyzed. Missing values are replaced by the
    column mean; columns without a single observed value are left out because
    they cannot be imputed and would propagate NaN into the distances and the
    PCA input.
    """

    progress_updated = pyqtSignal(int, str)  # Signal for progress reporting: (percent, message)

    def __init__(self):
        super().__init__()
        self.similarity_results = {}
        self.z_scores = None
        self.distance_matrix = None
        self.numerical_columns = []
        # Copy of the last dataframe passed to profile(); None until then.
        self.original_df = None

    def profile(self, df):
        """
        Perform similarity analysis on the dataframe

        Args:
            df (pd.DataFrame): Input dataframe

        Returns:
            dict: Dictionary containing similarity analysis results with the keys
                'z_scores', 'row_distances', 'column_distances', 'similar_rows',
                'dissimilar_rows', 'similar_columns', 'dissimilar_columns',
                'cluster_info', 'numerical_columns', 'original_shape' and
                'processed_shape'; or a dict with a single 'error' key when the
                dataframe is empty/None or has no usable numerical column.
        """
        self.progress_updated.emit(10, "Starting similarity analysis...")

        if df is None or df.empty:
            return {"error": "Empty or invalid dataframe"}

        # Store original dataframe
        self.original_df = df.copy()

        # Identify numerical columns. A column made only of NaN has no mean to
        # impute with, so it is excluded instead of contaminating the results.
        numerical_df = df.select_dtypes(include=[np.number]).dropna(axis=1, how='all')
        self.numerical_columns = numerical_df.columns.tolist()

        if not self.numerical_columns:
            return {"error": "No numerical columns found for similarity analysis"}

        self.progress_updated.emit(20, "Computing z-scores...")

        # Handle missing values
        numerical_df = numerical_df.fillna(numerical_df.mean())

        # Calculate z-scores
        if SCIPY_AVAILABLE:
            self.z_scores = numerical_df.apply(zscore, nan_policy='omit')
        else:
            # Fallback to manual z-score calculation. ddof=0 (population std)
            # matches the default used by scipy.stats.zscore.
            self.z_scores = (numerical_df - numerical_df.mean()) / numerical_df.std(ddof=0)

        self.progress_updated.emit(40, "Computing distance matrices...")

        # Calculate euclidean distance between rows
        row_distances = self._calculate_row_distances(numerical_df)

        # Calculate euclidean distance between columns (features)
        col_distances = self._calculate_column_distances(numerical_df)

        self.progress_updated.emit(60, "Analyzing similarity patterns...")

        # Find most similar and dissimilar pairs
        similar_rows, dissimilar_rows = self._find_extreme_pairs(row_distances, 'rows')
        similar_cols, dissimilar_cols = self._find_extreme_pairs(col_distances, 'columns')

        self.progress_updated.emit(80, "Computing cluster analysis...")

        # Perform basic clustering analysis
        cluster_info = self._analyze_clusters(numerical_df)

        self.progress_updated.emit(90, "Finalizing results...")

        # Store results
        self.similarity_results = {
            'z_scores': self.z_scores,
            'row_distances': row_distances,
            'column_distances': col_distances,
            'similar_rows': similar_rows,
            'dissimilar_rows': dissimilar_rows,
            'similar_columns': similar_cols,
            'dissimilar_columns': dissimilar_cols,
            'cluster_info': cluster_info,
            'numerical_columns': self.numerical_columns,
            'original_shape': df.shape,
            'processed_shape': numerical_df.shape,
        }

        self.progress_updated.emit(100, "Similarity analysis complete!")

        return self.similarity_results

    @staticmethod
    def _standardize(frame):
        """Scale every column of *frame* to zero mean and unit population variance.

        Uses sklearn's StandardScaler when available; otherwise an equivalent
        numpy computation in which zero-variance columns are divided by 1
        instead of 0, so they become all zeros rather than NaN.
        """
        if SKLEARN_AVAILABLE:
            return np.asarray(StandardScaler().fit_transform(frame))

        values = frame.to_numpy(dtype=float)
        scale = values.std(axis=0)
        scale[scale == 0] = 1.0
        return (values - values.mean(axis=0)) / scale

    @staticmethod
    def _pairwise_euclidean(values):
        """Return the square matrix of euclidean distances between the rows of *values*."""
        if SCIPY_AVAILABLE:
            return squareform(pdist(values, metric='euclidean'))

        # Fallback: one vectorised pass per row fills the row/column pair of
        # the symmetric matrix; the diagonal stays 0.
        count = len(values)
        matrix = np.zeros((count, count))
        for i in range(count - 1):
            gaps = np.sqrt(((values[i + 1:] - values[i]) ** 2).sum(axis=1))
            matrix[i, i + 1:] = gaps
            matrix[i + 1:, i] = gaps
        return matrix

    def _calculate_row_distances(self, df):
        """Calculate euclidean distances between all pairs of rows.

        Columns are standardized first so every feature has the same weight.
        Returns a square DataFrame labelled by ``df.index`` on both axes, or
        an empty DataFrame if the computation fails (best effort; the error
        is printed).
        """
        try:
            scaled_data = self._standardize(df)
            return pd.DataFrame(
                self._pairwise_euclidean(scaled_data),
                index=df.index,
                columns=df.index,
            )
        except Exception as e:
            print(f"Error calculating row distances: {e}")
            return pd.DataFrame()

    def _calculate_column_distances(self, df):
        """Calculate euclidean distances between all pairs of columns.

        Returns a square DataFrame labelled by ``df.columns`` on both axes, or
        an empty DataFrame if the computation fails (best effort; the error
        is printed).
        """
        try:
            # Transpose to treat columns as observations
            df_transposed = df.T

            # NOTE(review): standardizing the transposed frame scales each
            # ORIGINAL ROW across the features rather than each feature across
            # the rows. Kept as-is to preserve existing results; confirm
            # whether per-feature scaling was intended.
            scaled_data = self._standardize(df_transposed)
            return pd.DataFrame(
                self._pairwise_euclidean(scaled_data),
                index=df.columns,
                columns=df.columns,
            )
        except Exception as e:
            print(f"Error calculating column distances: {e}")
            return pd.DataFrame()

    def _find_extreme_pairs(self, distance_matrix, pair_type='rows'):
        """Find most similar and dissimilar pairs from distance matrix.

        Args:
            distance_matrix (pd.DataFrame): Square, symmetric distance matrix.
            pair_type (str): Unused; kept for interface compatibility.

        Returns:
            tuple: (similar_pairs, dissimilar_pairs), each a list of up to five
                (label, label) tuples ordered from most to least extreme.
        """
        if distance_matrix.empty:
            return [], []

        # Get upper triangle (avoid duplicates and self-comparisons)
        mask = np.triu(np.ones(distance_matrix.shape, dtype=bool), k=1)
        distances = distance_matrix.where(mask)

        # Flatten and get valid distances. The masked-out cells are NaN; drop
        # them explicitly rather than relying on stack() doing so implicitly.
        flat_distances = distances.stack().dropna()

        if len(flat_distances) == 0:
            return [], []

        # Find most similar (smallest distance) and dissimilar (largest distance)
        similar_pairs = flat_distances.nsmallest(5).index.tolist()
        dissimilar_pairs = flat_distances.nlargest(5).index.tolist()

        return similar_pairs, dissimilar_pairs

    def _analyze_clusters(self, df):
        """Perform basic clustering analysis using PCA.

        Returns:
            dict: 'pca_components', 'explained_variance', 'cumulative_variance',
                'n_components' and 'feature_importance' on success, otherwise a
                dict with a single 'error' key.
        """
        try:
            if df.shape[1] < 2:
                return {"error": "Need at least 2 numerical columns for clustering"}

            if not SKLEARN_AVAILABLE:
                return {"error": "sklearn not available for PCA analysis"}

            if df.shape[0] < 2:
                return {"error": "Need at least 2 rows for clustering"}

            # Standardize data
            scaled_data = StandardScaler().fit_transform(df)

            # Apply PCA. Use max 3 components, but never more than the number
            # of rows or columns, which PCA rejects.
            n_components = min(3, df.shape[0], df.shape[1])
            pca = PCA(n_components=n_components)
            pca_result = pca.fit_transform(scaled_data)

            # Calculate explained variance
            explained_variance = pca.explained_variance_ratio_

            return {
                'pca_components': pca_result,
                'explained_variance': explained_variance,
                'cumulative_variance': np.cumsum(explained_variance),
                'n_components': n_components,
                'feature_importance': pca.components_,
            }
        except Exception as e:
            return {"error": f"Clustering analysis failed: {e}"}
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def visualize_profile(df, profiler_results=None, force_text_mode=False, show_window=True):
    """
    Build the tabbed view of a similarity profile (or a text summary).

    Args:
        df (pd.DataFrame): Original dataframe
        profiler_results (dict): Results from SimilarityProfiler.profile();
            computed from ``df`` when omitted
        force_text_mode (bool): Print a text summary even if the GUI is available
        show_window (bool): Show the widget (and, when run as a script, start
            the Qt event loop)

    Returns:
        QWidget or dict: The widget holding the tabs, or the results dict when
        the results contain an error or text mode is used.
    """
    # Run the profiler ourselves when the caller did not supply results
    if profiler_results is None:
        profiler_results = SimilarityProfiler().profile(df)

    if "error" in profiler_results:
        print(f"Error: {profiler_results['error']}")
        return profiler_results

    # Text mode: either requested explicitly or forced by a missing PyQt6
    if force_text_mode or not PYQT6_AVAILABLE:
        banner = (
            "Text mode requested - providing text summary:"
            if PYQT6_AVAILABLE
            else "PyQt6 not available - providing text summary:"
        )
        print(banner)
        _print_text_summary(profiler_results)
        return profiler_results

    # A QApplication must exist before any widget is created. Inside SQLShell
    # one already exists; standalone callers get a fresh one.
    app = QApplication.instance()
    if app is None:
        app = QApplication([])

    main_widget = QWidget()
    main_layout = QVBoxLayout()
    tab_widget = QTabWidget()

    def has_frame(key):
        # True when the results hold a non-empty dataframe under `key`
        return key in profiler_results and not profiler_results[key].empty

    # (tab title, should-build predicate, builder) in display order. Both
    # callables are evaluated lazily, one tab at a time.
    tab_specs = (
        (
            "Z-Scores Heatmap",
            lambda: has_frame('z_scores'),
            lambda: _create_zscore_visualization(profiler_results['z_scores']),
        ),
        (
            "Row Similarities",
            lambda: has_frame('row_distances'),
            lambda: _create_distance_visualization(
                profiler_results['row_distances'], "Row Similarity Matrix"
            ),
        ),
        (
            "Column Similarities",
            lambda: has_frame('column_distances'),
            lambda: _create_distance_visualization(
                profiler_results['column_distances'], "Column Similarity Matrix"
            ),
        ),
        (
            "PCA Analysis",
            lambda: (
                'cluster_info' in profiler_results
                and 'pca_components' in profiler_results['cluster_info']
            ),
            lambda: _create_pca_visualization(profiler_results['cluster_info']),
        ),
        (
            "Data Preview",
            lambda: has_frame('z_scores'),
            lambda: _create_data_preview_tab(df, profiler_results),
        ),
        (
            "Summary",
            lambda: True,
            lambda: _create_summary_tab(profiler_results),
        ),
    )

    for title, is_wanted, build in tab_specs:
        if not is_wanted():
            continue
        page = build()
        if page:
            tab_widget.addTab(page, title)

    main_layout.addWidget(tab_widget)
    main_widget.setLayout(main_layout)

    main_widget.setWindowTitle("Similarity Analysis Results")
    main_widget.resize(1000, 700)

    if show_window:
        main_widget.show()

        app = QApplication.instance()
        if app is not None:
            import sys
            import __main__

            # Only block on the event loop when executed as a script: the
            # main module has a file and there is no interactive prompt.
            launched_as_script = hasattr(__main__, '__file__') and not hasattr(sys, 'ps1')
            if launched_as_script:
                try:
                    app.exec()
                except RuntimeError:
                    # Event loop might already be running, that's okay
                    pass

    return main_widget
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _print_text_summary(results):
|
|
407
|
+
"""Print a text summary when GUI is not available"""
|
|
408
|
+
print("\n" + "="*50)
|
|
409
|
+
print("SIMILARITY ANALYSIS SUMMARY")
|
|
410
|
+
print("="*50)
|
|
411
|
+
|
|
412
|
+
print(f"Dataset shape: {results.get('original_shape', 'N/A')}")
|
|
413
|
+
print(f"Numerical columns: {len(results.get('numerical_columns', []))}")
|
|
414
|
+
|
|
415
|
+
if 'similar_rows' in results and results['similar_rows']:
|
|
416
|
+
print("\nMost similar row pairs:")
|
|
417
|
+
for i, pair in enumerate(results['similar_rows'][:3]):
|
|
418
|
+
print(f" {i+1}. Row {pair[0]} ↔ Row {pair[1]}")
|
|
419
|
+
|
|
420
|
+
if 'similar_columns' in results and results['similar_columns']:
|
|
421
|
+
print("\nMost similar column pairs:")
|
|
422
|
+
for i, pair in enumerate(results['similar_columns'][:3]):
|
|
423
|
+
print(f" {i+1}. {pair[0]} ↔ {pair[1]}")
|
|
424
|
+
|
|
425
|
+
if 'cluster_info' in results and 'explained_variance' in results['cluster_info']:
|
|
426
|
+
cluster_info = results['cluster_info']
|
|
427
|
+
print(f"\nPCA Analysis:")
|
|
428
|
+
print(f" Components: {cluster_info['n_components']}")
|
|
429
|
+
print(f" Total variance explained: {cluster_info['cumulative_variance'][-1]:.1%}")
|
|
430
|
+
|
|
431
|
+
print("="*50)
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def _create_zscore_visualization(z_scores):
    """Build a widget showing the z-scores as a heatmap.

    Args:
        z_scores (pd.DataFrame): Z-scores, rows by numerical columns.

    Returns:
        QWidget or None: Widget wrapping the matplotlib canvas, or None when
        PyQt6 or matplotlib is unavailable.
    """
    if not (PYQT6_AVAILABLE and MATPLOTLIB_AVAILABLE):
        return None

    container = QWidget()
    box = QVBoxLayout()

    # Matplotlib figure embedded through a Qt canvas
    figure = Figure(figsize=(12, 8))
    canvas = FigureCanvasQTAgg(figure)
    axes = figure.add_subplot(111)

    # The colour scale is pinned to [-3, 3] on a diverging map
    heatmap = axes.imshow(z_scores.values, cmap='RdBu_r', aspect='auto', vmin=-3, vmax=3)

    axes.set_title('Z-Scores Heatmap\n(Blue: Below average, Red: Above average)', fontsize=14)
    axes.set_xlabel('Columns')
    axes.set_ylabel('Rows')

    # Individual tick labels are only drawn for up to 20 entries per axis;
    # beyond that the axis label carries the count instead.
    column_count = len(z_scores.columns)
    if column_count > 20:
        axes.set_xticks([])
        axes.set_xlabel(f'Columns (showing {column_count} columns)')
    else:
        axes.set_xticks(range(column_count))
        axes.set_xticklabels(z_scores.columns, rotation=45, ha='right')

    row_count = len(z_scores.index)
    if row_count > 20:
        axes.set_yticks([])
        axes.set_ylabel(f'Rows (showing {row_count} rows)')
    else:
        axes.set_yticks(range(row_count))
        axes.set_yticklabels(z_scores.index)

    colorbar = figure.colorbar(heatmap, ax=axes, shrink=0.8)
    colorbar.set_label('Z-Score', rotation=270, labelpad=15)

    figure.tight_layout()

    box.addWidget(canvas)
    container.setLayout(box)

    return container
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def _create_distance_visualization(distance_matrix, title):
    """Create distance matrix visualization.

    Args:
        distance_matrix (pd.DataFrame): Square matrix of pairwise distances.
        title (str): First line of the plot title.

    Returns:
        QWidget or None: Widget wrapping the matplotlib canvas, or None when
        PyQt6 or matplotlib is unavailable.
    """
    if not PYQT6_AVAILABLE or not MATPLOTLIB_AVAILABLE:
        return None

    widget = QWidget()
    layout = QVBoxLayout()

    # Create matplotlib figure
    fig = Figure(figsize=(10, 8))
    canvas = FigureCanvasQTAgg(fig)

    ax = fig.add_subplot(111)

    # Create heatmap. 'viridis' maps low values to dark colours, so small
    # distances (high similarity) come out darker, as the title promises.
    # The reversed map ('viridis_r') would show them as the lightest cells.
    im = ax.imshow(distance_matrix.values, cmap='viridis', aspect='auto')

    # Set labels
    ax.set_title(f'{title}\n(Darker colors indicate higher similarity)', fontsize=14)

    # Set ticks: label individual items only while they stay readable
    if len(distance_matrix.index) <= 15:
        ax.set_xticks(range(len(distance_matrix.columns)))
        ax.set_xticklabels(distance_matrix.columns, rotation=45, ha='right')
        ax.set_yticks(range(len(distance_matrix.index)))
        ax.set_yticklabels(distance_matrix.index)
    else:
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlabel(f'Showing {len(distance_matrix.columns)} items')
        ax.set_ylabel(f'Showing {len(distance_matrix.index)} items')

    # Add colorbar
    cbar = fig.colorbar(im, ax=ax, shrink=0.8)
    cbar.set_label('Euclidean Distance', rotation=270, labelpad=15)

    fig.tight_layout()

    layout.addWidget(canvas)
    widget.setLayout(layout)

    return widget
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def _create_pca_visualization(cluster_info):
    """Build a widget with the PCA scatter plot and the explained-variance chart.

    Args:
        cluster_info (dict): Either a dict with an 'error' message, or one with
            'pca_components', 'explained_variance' and 'n_components'.

    Returns:
        QWidget or None: Widget with the plots (or a red error label when
        ``cluster_info`` carries an error), or None when PyQt6 or matplotlib
        is unavailable.
    """
    if not (PYQT6_AVAILABLE and MATPLOTLIB_AVAILABLE):
        return None

    container = QWidget()
    box = QVBoxLayout()

    # A failed analysis is reported as a label instead of a plot
    if 'error' in cluster_info:
        message = QLabel(f"PCA Error: {cluster_info['error']}")
        message.setStyleSheet("color: red;")
        box.addWidget(message)
        container.setLayout(box)
        return container

    figure = Figure(figsize=(12, 8))
    canvas = FigureCanvasQTAgg(figure)

    projected = cluster_info['pca_components']
    variance_ratio = cluster_info['explained_variance']
    has_two_components = cluster_info['n_components'] >= 2

    if has_two_components:
        # Left panel: first two components, points coloured by their position
        scatter_axes = figure.add_subplot(121)
        points = scatter_axes.scatter(
            projected[:, 0],
            projected[:, 1],
            c=range(len(projected)),
            cmap='viridis',
            alpha=0.7,
        )
        scatter_axes.set_xlabel(f'PC1 ({variance_ratio[0]:.1%} variance)')
        scatter_axes.set_ylabel(f'PC2 ({variance_ratio[1]:.1%} variance)')
        scatter_axes.set_title('PCA: First Two Components')
        scatter_axes.grid(True, alpha=0.3)

        colorbar = figure.colorbar(points, ax=scatter_axes, shrink=0.8)
        colorbar.set_label('Row Index')

    # The variance chart takes the right half next to the scatter plot, or the
    # whole figure when there is no scatter plot.
    variance_axes = figure.add_subplot(122 if has_two_components else 111)

    component_numbers = range(1, len(variance_ratio) + 1)
    variance_axes.bar(component_numbers, variance_ratio, alpha=0.7)
    variance_axes.plot(
        component_numbers,
        np.cumsum(variance_ratio),
        'ro-',
        linewidth=2,
        markersize=6,
    )
    variance_axes.set_xlabel('Principal Component')
    variance_axes.set_ylabel('Variance Explained')
    variance_axes.set_title('PCA Variance Explained')
    variance_axes.grid(True, alpha=0.3)
    variance_axes.set_xticks(component_numbers)

    figure.tight_layout()

    box.addWidget(canvas)
    container.setLayout(box)

    return container
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def _create_summary_tab(results):
    """Build the scrollable tab holding the rich-text summary of the analysis.

    Args:
        results (dict): Results dict as produced by the profiler. Missing keys
            are tolerated; optional sections are omitted when their data is
            absent or empty.

    Returns:
        QWidget or None: The summary widget, or None when PyQt6 is unavailable.
    """
    if not PYQT6_AVAILABLE:
        return None

    container = QWidget()
    outer_box = QVBoxLayout()

    # All labels live inside a scroll area so long summaries stay reachable
    scroll_area = QScrollArea()
    content = QWidget()
    content_box = QVBoxLayout()

    def add_wrapped_label(markup):
        # Append a word-wrapped label carrying `markup` to the content box
        label = QLabel(markup)
        label.setWordWrap(True)
        content_box.addWidget(label)

    heading = QLabel("<h3>Similarity Analysis Summary</h3>")
    heading.setAlignment(Qt.AlignmentFlag.AlignCenter)
    content_box.addWidget(heading)

    # Dataset section
    add_wrapped_label(
        "\n<b>Dataset Information:</b><br>\n"
        f"• Original shape: {results.get('original_shape', 'N/A')}<br>\n"
        f"• Processed shape: {results.get('processed_shape', 'N/A')}<br>\n"
        f"• Numerical columns analyzed: {len(results.get('numerical_columns', []))}<br>\n"
    )

    # Closest row pairs (at most three)
    row_pairs = results.get('similar_rows')
    if row_pairs:
        add_wrapped_label(
            "<b>Most Similar Row Pairs:</b><br>"
            + "".join(f"• Row {pair[0]} ↔ Row {pair[1]}<br>" for pair in row_pairs[:3])
        )

    # Closest column pairs (at most three)
    column_pairs = results.get('similar_columns')
    if column_pairs:
        add_wrapped_label(
            "<b>Most Similar Column Pairs:</b><br>"
            + "".join(f"• {pair[0]} ↔ {pair[1]}<br>" for pair in column_pairs[:3])
        )

    # PCA section, only when variance figures are present
    if 'cluster_info' in results and 'explained_variance' in results['cluster_info']:
        pca = results['cluster_info']
        add_wrapped_label(
            "\n<b>PCA Analysis:</b><br>\n"
            f"• Components: {pca['n_components']}<br>\n"
            f"• Total variance explained: {pca['cumulative_variance'][-1]:.1%}<br>\n"
            f"• First component: {pca['explained_variance'][0]:.1%}<br>\n"
        )

    content.setLayout(content_box)
    scroll_area.setWidget(content)
    scroll_area.setWidgetResizable(True)

    outer_box.addWidget(scroll_area)
    container.setLayout(outer_box)

    return container
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
def _preview_sort_key(value):
    """Return the value a preview-table cell should be ordered by.

    Real numbers (Python or NumPy scalars) are returned as ``float`` so the
    model can compare them numerically instead of as text. NaN yields ``None``
    so the caller can leave the cell without a sort key. Everything else,
    including booleans, falls back to its display text.
    """
    if isinstance(value, (bool, np.bool_)):
        return str(value)
    if isinstance(value, (int, float, np.integer, np.floating)):
        number = float(value)
        # NaN is the only float that is not equal to itself.
        return None if number != number else number
    return str(value)


def _create_data_preview_tab(original_df, results):
    """Create data preview tab with unusual rows highlighted

    Rows are ranked by an "unusualness" score (the sum of the absolute
    z-scores of the row) and the 50 highest-scoring rows are shown in a
    sortable table, most unusual first. Cells are colour-coded by z-score and
    the score column by its own thresholds.

    Args:
        original_df: DataFrame to preview. Every index label present in
            ``results['z_scores']`` is looked up in it with ``.loc``.
        results: Mapping holding a ``'z_scores'`` DataFrame. Only columns that
            appear in it receive z-score colouring.

    Returns:
        The populated QWidget, or None when PyQt6 is not available.
    """
    if not PYQT6_AVAILABLE:
        return None

    widget = QWidget()
    layout = QVBoxLayout()

    # Create header label
    header_label = QLabel("<h3>Data Preview - Unusual Rows Highlighted</h3>")
    header_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
    layout.addWidget(header_label)

    # Calculate unusualness as the sum of absolute z-scores for each row
    z_scores = results['z_scores']
    unusualness_scores = z_scores.abs().sum(axis=1)

    # Order everything by unusualness (most unusual first)
    sorted_indices = unusualness_scores.sort_values(ascending=False).index
    sorted_z_scores = z_scores.loc[sorted_indices]
    sorted_unusualness = unusualness_scores.loc[sorted_indices]

    # Add unusualness score as first column
    display_df = original_df.loc[sorted_indices].copy()
    display_df.insert(0, 'Unusualness_Score', sorted_unusualness.round(2))

    # Limit to top 50 rows for performance
    display_rows = min(50, len(display_df))
    display_df = display_df.head(display_rows)
    sorted_z_scores = sorted_z_scores.head(display_rows)

    # Create table view
    table_view = QTableView()
    model = QStandardItemModel()

    # Sort on a dedicated role so numeric columns are ordered as numbers while
    # the display text stays exactly str(value).
    sort_role = Qt.ItemDataRole.UserRole
    model.setSortRole(sort_role)

    # Set headers
    headers = ['Row Index'] + list(display_df.columns)
    model.setHorizontalHeaderLabels(headers)

    # Populate table with data and coloring
    for row_idx, (orig_idx, row) in enumerate(display_df.iterrows()):
        # z-scores of this row, fetched once instead of once per cell
        z_row = sorted_z_scores.iloc[row_idx]

        # Add original row index as first column
        index_item = QStandardItem(str(orig_idx))
        index_item.setBackground(QBrush(QColor(240, 240, 240)))  # Light gray background
        index_key = _preview_sort_key(orig_idx)
        if index_key is not None:
            index_item.setData(index_key, sort_role)
        model.setItem(row_idx, 0, index_item)

        # Add data columns
        for col_idx, (col_name, value) in enumerate(row.items()):
            item = QStandardItem(str(value))
            sort_key = _preview_sort_key(value)
            if sort_key is not None:
                item.setData(sort_key, sort_role)

            if col_name == 'Unusualness_Score':
                # Color unusualness score column
                unusualness = float(value)
                if unusualness > 6:  # Very unusual
                    item.setBackground(QBrush(QColor(255, 100, 100)))  # Red
                elif unusualness > 4:  # Unusual
                    item.setBackground(QBrush(QColor(255, 200, 100)))  # Orange
                elif unusualness > 2:  # Somewhat unusual
                    item.setBackground(QBrush(QColor(255, 255, 100)))  # Yellow
                else:  # Normal
                    item.setBackground(QBrush(QColor(200, 255, 200)))  # Light green
            elif col_name in sorted_z_scores.columns:
                # Color data columns based on z-scores
                z_score = z_row[col_name]

                if abs(z_score) > 3:  # Extreme outlier
                    item.setBackground(QBrush(QColor(255, 100, 100)))  # Red
                elif abs(z_score) > 2:  # Outlier
                    item.setBackground(QBrush(QColor(255, 200, 100)))  # Orange
                elif abs(z_score) > 1:  # Somewhat unusual
                    item.setBackground(QBrush(QColor(255, 255, 200)))  # Light yellow
                # Normal values get no special coloring

            model.setItem(row_idx, col_idx + 1, item)

    table_view.setModel(model)

    # Configure table appearance
    table_view.setAlternatingRowColors(True)
    table_view.setSortingEnabled(True)
    # Enabling sorting makes the view sort at once by the header's current
    # sort indicator, which would discard the ordering computed above.
    # Column 1 is 'Unusualness_Score' (column 0 is the row index), so sort on
    # it explicitly to keep the most unusual rows at the top.
    table_view.sortByColumn(1, Qt.SortOrder.DescendingOrder)
    table_view.horizontalHeader().setStretchLastSection(True)
    table_view.resizeColumnsToContents()

    # Create info panel
    info_text = f"""
    <b>Data Preview Information:</b><br>
    • Showing top {display_rows} most unusual rows (out of {len(original_df)})<br>
    • Rows sorted by unusualness score (sum of absolute z-scores)<br>
    • <span style='background-color: #ff6464; padding: 2px;'>Red</span>: Extreme values (|z-score| > 3 or unusualness > 6)<br>
    • <span style='background-color: #ffc864; padding: 2px;'>Orange</span>: Outliers (|z-score| > 2 or unusualness > 4)<br>
    • <span style='background-color: #ffff64; padding: 2px;'>Yellow</span>: Somewhat unusual (|z-score| > 1 or unusualness > 2)<br>
    • <span style='background-color: #c8ffc8; padding: 2px;'>Light Green</span>: Normal unusualness score<br>
    • White: Normal values<br><br>
    <b>Most Unusual Rows:</b><br>
    """

    # Add top 5 most unusual rows info
    for i in range(min(5, len(sorted_unusualness))):
        row_idx = sorted_unusualness.index[i]
        score = sorted_unusualness.iloc[i]
        info_text += f"• Row {row_idx}: unusualness score {score:.2f}<br>"

    info_label = QLabel(info_text)
    info_label.setWordWrap(True)
    info_label.setMaximumHeight(200)
    info_label.setStyleSheet("QLabel { background-color: #f0f0f0; padding: 10px; border: 1px solid #ccc; }")

    # Add widgets to layout
    layout.addWidget(info_label)
    layout.addWidget(table_view)

    widget.setLayout(layout)
    return widget
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
def demo_similarity_analysis():
    """
    Run the similarity profiler end to end on a synthetic dataset.

    A seeded 80-row, four-column frame is generated, two columns are made to
    depend on revenue, and rows 40 and 41 are overwritten with noisy copies of
    row 10. The frame is profiled with SimilarityProfiler, a summary is
    printed, the embeddable widget is built, and finally the visualization
    window is opened.

    Returns:
        Tuple of (sample_df, results, widget).
    """
    print("Running Similarity Analysis Demo...")

    # Create sample data for testing (fixed seed; draw order matters)
    np.random.seed(42)
    revenue = np.random.normal(1000, 200, 80)
    marketing_cost = np.random.normal(500, 100, 80)
    customer_satisfaction = np.random.normal(4.0, 0.5, 80)
    product_sales = np.random.normal(150, 30, 80)
    sample_df = pd.DataFrame({
        'revenue': revenue,
        'marketing_cost': marketing_cost,
        'customer_satisfaction': customer_satisfaction,
        'product_sales': product_sales,
    })

    # Create some correlations
    sales_noise = np.random.normal(0, 20, 80)
    sample_df['product_sales'] = sample_df['revenue'] * 0.15 + sales_noise
    marketing_noise = np.random.normal(0, 50, 80)
    sample_df['marketing_cost'] = sample_df['revenue'] * 0.4 + marketing_noise

    # Add some similar rows for testing similarity detection
    for target_row, noise_scale in ((40, 10), (41, 15)):
        sample_df.iloc[target_row] = sample_df.iloc[10] + np.random.normal(0, noise_scale, 4)

    print(f"Created sample dataset: {sample_df.shape}")
    print(f"Columns: {list(sample_df.columns)}")

    # Test the profiler
    results = SimilarityProfiler().profile(sample_df)

    numerical_count = len(results.get('numerical_columns', []))
    row_pair_count = len(results.get('similar_rows', []))
    column_pair_count = len(results.get('similar_columns', []))

    print("\nAnalysis Results:")
    print(f"✓ Analyzed {numerical_count} numerical columns")
    print(f"✓ Dataset shape: {results.get('original_shape', 'N/A')}")
    print(f"✓ Found {row_pair_count} similar row pairs")
    print(f"✓ Found {column_pair_count} similar column pairs")

    # Show most similar pairs
    similar_rows = results.get('similar_rows')
    if similar_rows:
        print("\nMost similar rows:")
        row_distances = results['row_distances']
        for rank, pair in enumerate(similar_rows[:3], start=1):
            first, second = pair[0], pair[1]
            distance = row_distances.loc[first, second]
            print(f" {rank}. Row {first} ↔ Row {second} (distance: {distance:.3f})")

    similar_columns = results.get('similar_columns')
    if similar_columns:
        print("\nMost similar columns:")
        column_distances = results['column_distances']
        for rank, pair in enumerate(similar_columns[:3], start=1):
            first, second = pair[0], pair[1]
            distance = column_distances.loc[first, second]
            print(f" {rank}. {first} ↔ {second} (distance: {distance:.3f})")

    # Demonstrate visualization
    print("\nCreating visualization...")
    print("Available visualization tabs:")
    tab_descriptions = (
        " 1. Z-Scores Heatmap - Shows standardized values",
        " 2. Row Similarities - Distance matrix between rows",
        " 3. Column Similarities - Distance matrix between columns",
        " 4. PCA Analysis - Principal component analysis",
        " 5. Data Preview - Dataframe with unusual rows highlighted",
        " 6. Summary - Text summary of results",
    )
    for description in tab_descriptions:
        print(description)

    # For SQLShell integration (widget only)
    widget = visualize_profile(sample_df, results, show_window=False)
    print(f"✓ Created widget for SQLShell: {type(widget)}")

    # Show the actual visualization window for demo
    print("\n🎯 Opening visualization window...")
    print(" Close the window to continue or press Ctrl+C to exit")

    # This will show the actual GUI window with all tabs
    visualize_profile(sample_df, results, show_window=True)

    return sample_df, results, widget
|
|
852
|
+
|
|
853
|
+
|
|
854
|
+
# Script entry point: run the demo and print usage instructions.
if __name__ == "__main__":
    print("="*60)
    print("SIMILARITY PROFILER DEMO")
    print("="*60)

    try:
        df, results, widget = demo_similarity_analysis()
    except Exception as e:
        # Top-level boundary: report the failure with a full traceback.
        print(f"Demo failed: {e}")
        import traceback
        traceback.print_exc()
    else:
        print("\n" + "="*60)
        print("DEMO COMPLETED SUCCESSFULLY!")
        print("="*60)
        print("\nTo use in your code:")
        print("1. from sqlshell.utils.profile_similarity import SimilarityProfiler, visualize_profile")
        print("2. profiler = SimilarityProfiler()")
        print("3. results = profiler.profile(your_dataframe)")
        # Pass the results from step 3, matching how the demo itself calls
        # visualize_profile(sample_df, results, show_window=...).
        print("4. widget = visualize_profile(your_dataframe, results, show_window=False) # For SQLShell")
        print("5. visualize_profile(your_dataframe, results, show_window=True) # For standalone")
|