sqlshell 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from PyQt6.QtCore import QObject, pyqtSignal, Qt
|
|
4
|
+
from PyQt6.QtWidgets import (
|
|
5
|
+
QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
|
|
6
|
+
QTableView, QHeaderView, QLabel, QFrame, QScrollArea
|
|
7
|
+
)
|
|
8
|
+
from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QPalette, QBrush
|
|
9
|
+
|
|
10
|
+
class EntropyProfiler(QObject):
    """Rank the columns of a DataFrame by their Shannon entropy.

    Entropy is computed per column (in bits). Numeric columns with many
    distinct values are first discretised into equal-width bins. The scores
    are then normalised against the highest-entropy column of the same
    frame, so ``normalized_entropy`` is relative to that frame and is not an
    absolute measure.
    """

    # Emitted as (percent_complete, status_message) while ``profile`` runs.
    progress_updated = pyqtSignal(int, str)

    # A numeric column with more distinct values than this is cut into this
    # many equal-width bins before its entropy is computed.
    _NUMERIC_BINS = 10

    # numpy dtype kinds treated as numeric: signed int, unsigned int, float,
    # complex. Unsigned ints are included so they are binned like other
    # numeric columns.
    _NUMERIC_KINDS = frozenset("iufc")

    # (inclusive lower bound on normalized entropy, label), highest first.
    # Anything below the last bound is labelled "Very Low".
    _IMPORTANCE_LEVELS = ((0.8, "High"), (0.5, "Medium"), (0.3, "Low"))

    _RESULT_COLUMNS = ('column', 'entropy', 'normalized_entropy', 'importance')

    def __init__(self):
        super().__init__()

    def calculate_entropy(self, series):
        """Return the Shannon entropy (base 2) of ``series``.

        Missing values are dropped first. Numeric data with more than
        ``_NUMERIC_BINS`` distinct values is replaced by its equal-width bin
        membership before the probabilities are estimated.

        Args:
            series: pandas Series to analyse.

        Returns:
            The entropy as a non-negative float; 0.0 when no non-null values
            remain.
        """
        values = series.dropna()
        if values.empty:
            return 0.0

        if (values.dtype.kind in self._NUMERIC_KINDS
                and values.nunique() > self._NUMERIC_BINS):
            values = pd.cut(values, bins=self._NUMERIC_BINS)

        probabilities = values.value_counts(normalize=True)
        # Binned (categorical) data also reports empty bins with p == 0.
        # Drop them explicitly: log2(0) is -inf and 0 * -inf is NaN, which
        # would otherwise only work through NaN-skipping in the sum.
        probabilities = probabilities[probabilities > 0]

        entropy = -float((probabilities * np.log2(probabilities)).sum())
        # A single-valued column produces -0.0, which would be displayed as
        # "-0.0000"; clamp it to a plain zero.
        return entropy if entropy > 0.0 else 0.0

    def normalize_entropy(self, entropy_value, max_entropy):
        """Scale ``entropy_value`` by ``max_entropy``.

        Returns 0.0 when ``max_entropy`` is zero so that a frame made only of
        zero-entropy columns does not cause a division by zero.
        """
        if max_entropy == 0:
            return 0.0
        return entropy_value / max_entropy

    @classmethod
    def _importance_label(cls, value):
        """Map a normalized entropy value to its importance label."""
        for lower_bound, label in cls._IMPORTANCE_LEVELS:
            if value >= lower_bound:
                return label
        return "Very Low"

    def profile(self, df):
        """
        Profile a dataframe to identify the most important columns based on entropy.

        ``progress_updated`` is emitted before each column is analysed and
        once more with 100 when the analysis finishes. Columns whose entropy
        cannot be computed are skipped and reported through the ``logging``
        module instead of aborting the whole profile.

        Args:
            df: pandas DataFrame to analyze

        Returns:
            DataFrame with the columns 'column', 'entropy',
            'normalized_entropy' and 'importance', sorted by descending
            normalized entropy. The frame has these columns but no rows when
            ``df`` is empty or no column could be analysed.

        Raises:
            TypeError: if ``df`` is not a pandas DataFrame.
        """
        # Local import keeps this change self-contained within the class.
        import logging

        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        if df.empty:
            return pd.DataFrame(columns=list(self._RESULT_COLUMNS))

        logger = logging.getLogger(__name__)
        results = []
        total_columns = len(df.columns)

        for i, column in enumerate(df.columns):
            self.progress_updated.emit(
                int((i / total_columns) * 100), f"Analyzing column: {column}"
            )
            try:
                entropy_value = self.calculate_entropy(df[column])
            except Exception as exc:
                # Best effort: one unanalysable column (unhashable values,
                # infinities, ...) must not abort the profile, but the skip
                # should be visible to whoever is debugging the result.
                logger.warning(
                    "Skipping column %r during entropy profiling: %s", column, exc
                )
                continue
            results.append({'column': column, 'entropy': entropy_value})

        if not results:
            return pd.DataFrame(columns=list(self._RESULT_COLUMNS))

        result_df = pd.DataFrame(results)

        # Normalise against the highest-entropy column of this frame.
        max_entropy = result_df['entropy'].max()
        result_df['normalized_entropy'] = result_df['entropy'].apply(
            lambda value: self.normalize_entropy(value, max_entropy)
        )

        result_df = result_df.sort_values(by='normalized_entropy', ascending=False)
        result_df['importance'] = result_df['normalized_entropy'].apply(
            self._importance_label
        )

        self.progress_updated.emit(100, "Analysis complete")
        return result_df
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class EntropyVisualization(QMainWindow):
    """Main window that shows an entropy profile as coloured bars and a table.

    The frame passed in is read through the columns 'column', 'entropy',
    'normalized_entropy' and 'importance'.
    """

    # RGB triple used for each importance label, shared by bars and table.
    _PALETTE = {
        "High": (52, 152, 219),      # Blue
        "Medium": (46, 204, 113),    # Green
        "Low": (241, 196, 15),       # Yellow
        "Very Low": (230, 126, 34),  # Orange
    }

    def __init__(self, results_df, parent=None):
        super().__init__(parent)
        self.setWindowTitle("Column Entropy Profile")
        self.resize(800, 600)

        # Everything lives in one vertical layout on the central widget.
        container = QWidget()
        self.setCentralWidget(container)
        root_layout = QVBoxLayout(container)

        heading = QLabel("Column Importance Analysis (Entropy-Based)")
        heading.setAlignment(Qt.AlignmentFlag.AlignCenter)
        heading.setStyleSheet("font-size: 16pt; font-weight: bold; margin: 10px;")
        root_layout.addWidget(heading)

        blurb = QLabel(
            "Columns with higher entropy values contain more information and are likely more important for analysis."
        )
        blurb.setAlignment(Qt.AlignmentFlag.AlignCenter)
        blurb.setWordWrap(True)
        root_layout.addWidget(blurb)

        # Bars first, table underneath.
        self.create_visual_bars(root_layout, results_df)
        self.create_table_view(root_layout, results_df)

    def create_visual_bars(self, layout, df):
        """Add a framed, scrollable list of horizontal bars to ``layout``.

        One bar is drawn per row of ``df``. Its width is the row's normalized
        entropy scaled to a 500-pixel maximum and its colour follows the
        row's importance label (an unknown label raises ``KeyError``).
        """
        full_width = 500

        panel = QFrame()
        panel.setFrameShape(QFrame.Shape.StyledPanel)
        panel.setLineWidth(1)

        scroller = QScrollArea()
        scroller.setWidgetResizable(True)
        scroller.setFrameShape(QFrame.Shape.NoFrame)

        # The scroll area hosts a single widget that stacks the bars.
        stack_host = QWidget()
        stack = QVBoxLayout(stack_host)

        caption = QLabel("Visualization of Column Importance (by Normalized Entropy)")
        caption.setAlignment(Qt.AlignmentFlag.AlignCenter)
        caption.setStyleSheet("font-weight: bold; margin-top: 10px;")
        stack.addWidget(caption)

        for _, record in df.iterrows():
            name = record['column']
            score = record['normalized_entropy']
            level = record['importance']

            entry = QWidget()
            entry_layout = QVBoxLayout(entry)
            entry_layout.setContentsMargins(0, 2, 0, 2)

            # Text line: "<column>: <score> (<importance>)"
            entry_layout.addWidget(QLabel(f"{name}: {score:.3f} ({level})"))

            # The bar itself is a fixed-size frame coloured by style sheet.
            colour_name = QColor(*self._PALETTE[level]).name()
            bar = QFrame()
            bar.setFixedHeight(20)
            bar.setFixedWidth(int(score * full_width))
            bar.setStyleSheet(f"background-color: {colour_name}; border-radius: 2px;")

            # A trailing stretch keeps the bar flush with the left edge.
            aligner = QWidget()
            aligner_layout = QHBoxLayout(aligner)
            aligner_layout.setContentsMargins(0, 0, 0, 0)
            aligner_layout.addWidget(bar)
            aligner_layout.addStretch()

            entry_layout.addWidget(aligner)
            stack.addWidget(entry)

        stack.addStretch()
        scroller.setWidget(stack_host)

        panel_layout = QVBoxLayout(panel)
        panel_layout.addWidget(scroller)
        layout.addWidget(panel)

        # Cap the height once there are more than ten rows so the bars
        # scroll instead of growing further.
        if len(df) > 10:
            scroller.setMaximumHeight(400)

    def create_table_view(self, layout, df):
        """Add a table listing every row of ``df`` to ``layout``.

        Numeric cells are formatted to four decimals and right-aligned. The
        importance cell is centred and tinted by its label; labels other
        than High/Medium/Low get the "Very Low" colour.
        """
        numeric_alignment = Qt.AlignmentFlag.AlignRight | Qt.AlignmentFlag.AlignVCenter
        label_alignment = Qt.AlignmentFlag.AlignCenter | Qt.AlignmentFlag.AlignVCenter
        fallback_rgb = self._PALETTE["Very Low"]

        model = QStandardItemModel()
        model.setHorizontalHeaderLabels(['Column', 'Entropy', 'Normalized Entropy', 'Importance'])

        for _, record in df.iterrows():
            level = record['importance']

            name_cell = QStandardItem(str(record['column']))
            entropy_cell = QStandardItem(f"{record['entropy']:.4f}")
            normalized_cell = QStandardItem(f"{record['normalized_entropy']:.4f}")
            level_cell = QStandardItem(level)

            entropy_cell.setTextAlignment(numeric_alignment)
            normalized_cell.setTextAlignment(numeric_alignment)
            level_cell.setTextAlignment(label_alignment)

            rgb = self._PALETTE.get(level, fallback_rgb)
            level_cell.setBackground(QBrush(QColor(*rgb)))

            model.appendRow([name_cell, entropy_cell, normalized_cell, level_cell])

        table = QTableView()
        table.setModel(model)
        table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
        table.setAlternatingRowColors(True)
        table.setMinimumHeight(200)

        layout.addWidget(table)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# Function interface for simpler usage
|
|
260
|
+
def profile(df):
    """
    Rank the columns of a dataframe by entropy using a throwaway profiler.

    Convenience wrapper that creates an ``EntropyProfiler`` and returns the
    result of its ``profile`` method.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        DataFrame with columns ranked by importance (entropy)
    """
    return EntropyProfiler().profile(df)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def visualize_profile(df):
    """
    Profile a dataframe and open a window showing the result.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        The ``EntropyVisualization`` window, already shown. The caller
        receives the only reference created here, so it should hold on to it
        for as long as the window is needed.
    """
    window = EntropyVisualization(EntropyProfiler().profile(df))
    window.show()
    return window
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def test_profile_entropy():
    """Interactive demo: profile a synthetic dataframe and show the window.

    Prints the profile table to stdout, opens the visualization and then
    runs the Qt event loop via ``app.exec()``.
    """
    import sys

    # Reuse a running QApplication if there is one; otherwise create it.
    app = QApplication.instance()
    if app is None:
        app = QApplication(sys.argv)

    n_rows = 1000
    np.random.seed(42)  # fixed seed so every run produces the same data

    # The random draws below happen in dictionary order; keep that order to
    # keep the generated data stable for the seed above.
    sample_columns = {
        'uniform': np.random.randint(0, 100, size=n_rows),    # uniform integers
        'normal': np.random.normal(50, 10, size=n_rows),      # normally distributed floats
        'binary': np.random.choice([0, 1], size=n_rows),      # only two values
        'constant': np.ones(n_rows),                          # a single value
        'skewed': np.random.exponential(5, size=n_rows),      # skewed distribution
        'categorical': np.random.choice(['A', 'B', 'C'], size=n_rows),  # three labels
        'mixed': np.random.randint(0, 100, size=n_rows) * np.random.choice([0, 1], size=n_rows),  # integers with random zeros
        'datetime': pd.date_range('2020-01-01', periods=n_rows),  # one timestamp per row
        'text': pd.Series(['a', 'b', 'c'] * 334)[:n_rows],    # repeating strings
        'boolean': np.random.choice([True, False], size=n_rows),  # booleans
        # First group of five filler columns
        'dummy1': np.random.randint(0, 100, size=n_rows),
        'dummy2': np.random.normal(50, 10, size=n_rows),
        'dummy3': np.random.choice([0, 1], size=n_rows),
        'dummy4': np.ones(n_rows),
        'dummy5': np.random.exponential(5, size=n_rows),
        # Second group of five filler columns, same generators as the first
        'dummy6': np.random.randint(0, 100, size=n_rows),
        'dummy7': np.random.normal(50, 10, size=n_rows),
        'dummy8': np.random.choice([0, 1], size=n_rows),
        'dummy9': np.ones(n_rows),
        'dummy10': np.random.exponential(5, size=n_rows),
    }
    df = pd.DataFrame(sample_columns)

    # One more three-label column, added after the frame is built.
    df['category'] = np.random.choice(['A', 'B', 'C'], size=n_rows)

    # Textual report
    print("Entropy Profile Results:")
    result = EntropyProfiler().profile(df)
    print(result)

    # Graphical report; the local name holds the reference to the window
    # while the event loop runs.
    vis = visualize_profile(df)

    app.exec()
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
if __name__ == "__main__":
    # Launch the interactive demo only when the file is run as a script,
    # not when it is imported.
    test_profile_entropy()
|