sqlshell 0.1.9__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sqlshell might be problematic. Click here for more details.

@@ -0,0 +1,8 @@
1
+ """
2
+ Utility functions for SQLShell
3
+ """
4
+
5
+ # Import profile_entropy for convenient access
6
+ from sqlshell.utils.profile_entropy import profile, visualize_profile, EntropyProfiler
7
+
8
+ # This __init__ marks the directory as a package and re-exports the entropy profiling helpers imported above
@@ -0,0 +1,347 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from PyQt6.QtCore import QObject, pyqtSignal, Qt
4
+ from PyQt6.QtWidgets import (
5
+ QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
6
+ QTableView, QHeaderView, QLabel, QFrame, QScrollArea
7
+ )
8
+ from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QPalette, QBrush
9
+
10
class EntropyProfiler(QObject):
    """Rank the columns of a DataFrame by the Shannon entropy of their values.

    Signals:
        progress_updated(int, str): percentage complete (0-100) and a status
            message, emitted by ``profile`` while it walks the columns.
    """

    progress_updated = pyqtSignal(int, str)  # (percent complete, status message)

    def __init__(self):
        super().__init__()

    def calculate_entropy(self, series):
        """Calculate the Shannon entropy, in bits, of a series of values.

        Missing values are dropped first. Numeric series (dtype kind ``i``,
        ``f`` or ``c``) with more than 10 distinct values are discretised
        into 10 equal-width bins before the probabilities are computed.

        Args:
            series: pandas Series to analyze.

        Returns:
            The entropy as a non-negative float; 0.0 when the series has no
            non-null values or only one distinct value.
        """
        values = series.dropna()
        if len(values) == 0:
            return 0.0

        # For numerical data with many unique values, bin the data
        if values.dtype.kind in 'ifc' and values.nunique() > 10:
            values = pd.cut(values, bins=10)

        probabilities = values.value_counts(normalize=True)

        # Binned (categorical) data reports empty bins with probability 0.
        # log2(0) is -inf and 0 * -inf is NaN, so drop those entries instead
        # of relying on NaN-skipping in the sum (and triggering a warning).
        probabilities = probabilities[probabilities > 0]

        # Entropy: -sum(p * log2(p))
        entropy = -np.sum(probabilities * np.log2(probabilities))

        # A single-valued series produces -0.0; adding 0.0 turns it into 0.0
        # so it is not displayed with a minus sign.
        return float(entropy) + 0.0

    def normalize_entropy(self, entropy_value, max_entropy):
        """Scale ``entropy_value`` by ``max_entropy``.

        Returns 0.0 when ``max_entropy`` is 0 to avoid dividing by zero.
        """
        if max_entropy == 0:
            return 0.0
        return entropy_value / max_entropy

    @staticmethod
    def _importance_label(value):
        """Map a normalized entropy value to its importance label."""
        if value >= 0.8:
            return "High"
        if value >= 0.5:
            return "Medium"
        if value >= 0.3:
            return "Low"
        return "Very Low"

    def profile(self, df):
        """
        Profile a dataframe to identify the most important columns based on entropy.

        Entropy is normalized against the largest entropy observed among the
        analyzed columns, so the top column always has a normalized value of
        1.0 (unless every column has zero entropy). Columns whose entropy
        cannot be computed are left out of the result; each skip is reported
        through ``progress_updated``.

        Args:
            df: pandas DataFrame to analyze

        Returns:
            DataFrame with the columns 'column', 'entropy',
            'normalized_entropy' and 'importance', sorted by normalized
            entropy in descending order. Empty (with those columns) when
            ``df`` is empty or no column could be analyzed.

        Raises:
            TypeError: if ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        result_columns = ['column', 'entropy', 'normalized_entropy', 'importance']

        if df.empty:
            return pd.DataFrame(columns=result_columns)

        results = []
        total_columns = len(df.columns)

        # Calculate entropy for each column
        for i, column in enumerate(df.columns):
            percent = int((i / total_columns) * 100)
            self.progress_updated.emit(percent, f"Analyzing column: {column}")

            try:
                entropy_value = self.calculate_entropy(df[column])
            except Exception as exc:
                # Best effort: one unanalyzable column must not abort the
                # whole profile, but tell listeners instead of hiding it.
                self.progress_updated.emit(percent, f"Skipped column {column}: {exc}")
                continue

            results.append({
                'column': column,
                'entropy': entropy_value
            })

        result_df = pd.DataFrame(results)

        if result_df.empty:
            return pd.DataFrame(columns=result_columns)

        # Normalize against the largest entropy seen in this dataframe
        max_entropy = result_df['entropy'].max()
        result_df['normalized_entropy'] = result_df['entropy'].apply(
            lambda x: self.normalize_entropy(x, max_entropy)
        )

        # Rank by importance (normalized entropy)
        result_df = result_df.sort_values(by='normalized_entropy', ascending=False)

        result_df['importance'] = result_df['normalized_entropy'].apply(self._importance_label)

        self.progress_updated.emit(100, "Analysis complete")
        return result_df
109
+
110
+
111
class EntropyVisualization(QMainWindow):
    """Window to visualize entropy results.

    Shows one horizontal bar per profiled column followed by a table with the
    same data. ``results_df`` must provide the columns 'column', 'entropy',
    'normalized_entropy' and 'importance'.
    """

    # Single source of truth for the importance colours used by both the bars
    # and the table. Stored as RGB triples; QColor objects are built on demand.
    _IMPORTANCE_RGB = {
        "High": (52, 152, 219),      # Blue
        "Medium": (46, 204, 113),    # Green
        "Low": (241, 196, 15),       # Yellow
        "Very Low": (230, 126, 34),  # Orange
    }
    # Label whose colour is used for any importance value not listed above.
    _FALLBACK_IMPORTANCE = "Very Low"
    # Pixel width of a bar whose normalized entropy is 1.0.
    _MAX_BAR_WIDTH = 500

    def __init__(self, results_df, parent=None):
        """Build the window for ``results_df``.

        Args:
            results_df: DataFrame of entropy results to display.
            parent: optional parent widget.
        """
        super().__init__(parent)
        self.setWindowTitle("Column Entropy Profile")
        self.resize(800, 600)

        # Create central widget and layout
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        layout = QVBoxLayout(central_widget)

        # Add a title
        title = QLabel("Column Importance Analysis (Entropy-Based)")
        title.setAlignment(Qt.AlignmentFlag.AlignCenter)
        title.setStyleSheet("font-size: 16pt; font-weight: bold; margin: 10px;")
        layout.addWidget(title)

        # Add a description
        description = QLabel(
            "Columns with higher entropy values contain more information and are likely more important for analysis."
        )
        description.setAlignment(Qt.AlignmentFlag.AlignCenter)
        description.setWordWrap(True)
        layout.addWidget(description)

        # Create visual bars representation
        self.create_visual_bars(layout, results_df)

        # Create table view
        self.create_table_view(layout, results_df)

    @classmethod
    def _importance_color(cls, importance):
        """Return the QColor for an importance label, using the fallback colour for unknown labels."""
        fallback_rgb = cls._IMPORTANCE_RGB[cls._FALLBACK_IMPORTANCE]
        return QColor(*cls._IMPORTANCE_RGB.get(importance, fallback_rgb))

    def create_visual_bars(self, layout, df):
        """Create horizontal bars representing entropy values.

        Args:
            layout: layout the framed, scrollable bar list is appended to.
            df: entropy results; one bar is drawn per row.
        """
        frame = QFrame()
        frame.setFrameShape(QFrame.Shape.StyledPanel)
        frame.setLineWidth(1)

        # Create a scroll area for the bars
        scroll_area = QScrollArea()
        scroll_area.setWidgetResizable(True)
        scroll_area.setFrameShape(QFrame.Shape.NoFrame)

        # Content widget for the scroll area
        content_widget = QWidget()
        bars_layout = QVBoxLayout(content_widget)

        # Header
        header = QLabel("Visualization of Column Importance (by Normalized Entropy)")
        header.setAlignment(Qt.AlignmentFlag.AlignCenter)
        header.setStyleSheet("font-weight: bold; margin-top: 10px;")
        bars_layout.addWidget(header)

        for _, row in df.iterrows():
            bar_container = QWidget()
            bar_layout = QVBoxLayout(bar_container)
            bar_layout.setContentsMargins(0, 2, 0, 2)

            # Column name and value
            label_text = f"{row['column']}: {row['normalized_entropy']:.3f} ({row['importance']})"
            label = QLabel(label_text)
            bar_layout.addWidget(label)

            # Bar whose width is proportional to the normalized entropy
            bar_width = int(row['normalized_entropy'] * self._MAX_BAR_WIDTH)
            bar_color = self._importance_color(row['importance'])
            bar = QFrame()
            bar.setFixedHeight(20)
            bar.setFixedWidth(bar_width)
            bar.setStyleSheet(f"background-color: {bar_color.name()}; border-radius: 2px;")

            # Container to left-align the bar
            bar_container_inner = QWidget()
            bar_container_layout = QHBoxLayout(bar_container_inner)
            bar_container_layout.setContentsMargins(0, 0, 0, 0)
            bar_container_layout.addWidget(bar)
            bar_container_layout.addStretch()

            bar_layout.addWidget(bar_container_inner)
            bars_layout.addWidget(bar_container)

        bars_layout.addStretch()

        # Set the content widget to the scroll area
        scroll_area.setWidget(content_widget)

        # Add the scroll area to the frame layout
        frame_layout = QVBoxLayout(frame)
        frame_layout.addWidget(scroll_area)

        # Add to main layout
        layout.addWidget(frame)

        # Cap the height so long column lists scroll instead of crowding out the table
        if len(df) > 10:
            scroll_area.setMaximumHeight(400)

    def create_table_view(self, layout, df):
        """Create a table view showing the entropy results.

        Args:
            layout: layout the table is appended to.
            df: entropy results; one table row is added per dataframe row.
        """
        # Create the model
        model = QStandardItemModel()
        model.setHorizontalHeaderLabels(['Column', 'Entropy', 'Normalized Entropy', 'Importance'])

        # Set table data
        for _, row in df.iterrows():
            column_item = QStandardItem(str(row['column']))
            entropy_item = QStandardItem(f"{row['entropy']:.4f}")
            norm_entropy_item = QStandardItem(f"{row['normalized_entropy']:.4f}")
            importance_item = QStandardItem(row['importance'])

            # Numbers are right-aligned, the importance label is centred
            entropy_item.setTextAlignment(Qt.AlignmentFlag.AlignRight | Qt.AlignmentFlag.AlignVCenter)
            norm_entropy_item.setTextAlignment(Qt.AlignmentFlag.AlignRight | Qt.AlignmentFlag.AlignVCenter)
            importance_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter | Qt.AlignmentFlag.AlignVCenter)

            # Same colour scheme as the bars
            importance_item.setBackground(QBrush(self._importance_color(row['importance'])))

            model.appendRow([column_item, entropy_item, norm_entropy_item, importance_item])

        # Create and configure the table view
        table_view = QTableView()
        table_view.setModel(model)
        table_view.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
        table_view.setAlternatingRowColors(True)
        table_view.setMinimumHeight(200)

        layout.addWidget(table_view)
257
+
258
+
259
+ # Function interface for simpler usage
260
def profile(df):
    """
    Rank the columns of a dataframe by entropy without managing a profiler object.

    A fresh EntropyProfiler is created for the call and discarded afterwards.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        DataFrame with columns ranked by importance (entropy)
    """
    return EntropyProfiler().profile(df)
272
+
273
+
274
def visualize_profile(df):
    """
    Create a visual representation of the entropy profile for a dataframe.

    A QApplication must already exist when this is called. The caller should
    keep a reference to the returned window for as long as it is to stay open.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        A PyQt6 window showing the visualization

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
        RuntimeError: if no QApplication has been created yet.
    """
    # Profile first so invalid input is still reported as TypeError.
    profiler = EntropyProfiler()
    results = profiler.profile(df)

    # Qt aborts the whole process when a widget is constructed without an
    # application object; fail with a catchable Python error instead.
    if QApplication.instance() is None:
        raise RuntimeError(
            "visualize_profile() requires a QApplication; create one before calling it"
        )

    vis = EntropyVisualization(results)
    vis.show()
    return vis
289
+
290
+
291
def test_profile_entropy():
    """Interactive demo: profile a synthetic dataframe, print the ranking and show the window.

    Blocks in the Qt event loop until the application exits.
    """
    import sys

    # Reuse the running QApplication when there is one, otherwise start our own
    qt_app = QApplication.instance()
    if qt_app is None:
        qt_app = QApplication(sys.argv)

    # Fixed seed so every run produces the same sample data
    np.random.seed(42)

    rows = 1000

    # Columns are listed in draw order; each one targets a different entropy level
    sample = pd.DataFrame({
        'uniform': np.random.randint(0, 100, size=rows),          # uniform distribution, high entropy
        'normal': np.random.normal(50, 10, size=rows),            # bell curve, medium entropy
        'binary': np.random.choice([0, 1], size=rows),            # two values only, low entropy
        'constant': np.ones(rows),                                # single value, zero entropy
        'skewed': np.random.exponential(5, size=rows),            # skewed distribution
        'categorical': np.random.choice(['A', 'B', 'C'], size=rows),  # categorical data
        'mixed': np.random.randint(0, 100, size=rows) * np.random.choice([0, 1], size=rows),  # half zeroed out
        'datetime': pd.date_range('2020-01-01', periods=rows),    # datetime data
        'text': pd.Series(['a', 'b', 'c'] * 334)[:rows],          # text data
        'boolean': np.random.choice([True, False], size=rows),    # boolean data
        # five extra columns repeating the distributions above
        'dummy1': np.random.randint(0, 100, size=rows),
        'dummy2': np.random.normal(50, 10, size=rows),
        'dummy3': np.random.choice([0, 1], size=rows),
        'dummy4': np.ones(rows),
        'dummy5': np.random.exponential(5, size=rows),
        # and five more of the same
        'dummy6': np.random.randint(0, 100, size=rows),
        'dummy7': np.random.normal(50, 10, size=rows),
        'dummy8': np.random.choice([0, 1], size=rows),
        'dummy9': np.ones(rows),
        'dummy10': np.random.exponential(5, size=rows),
    })

    # One more low-cardinality categorical column, added after construction
    sample['category'] = np.random.choice(['A', 'B', 'C'], size=rows)

    # Print the ranking to the console
    print("Entropy Profile Results:")
    ranking = EntropyProfiler().profile(sample)
    print(ranking)

    # Show the window; the local name keeps it referenced while the loop runs
    window = visualize_profile(sample)

    # Hand control to Qt until the application exits
    qt_app.exec()
344
+
345
+
346
+ # Run the interactive demo only when this file is executed as a script, not when it is imported.
+ if __name__ == "__main__":
347
+ test_profile_entropy()