sqlshell-0.1.8-py3-none-any.whl → sqlshell-0.2.0-py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release.
This version of sqlshell might be problematic.
- sqlshell/LICENSE +21 -0
- sqlshell/MANIFEST.in +6 -0
- sqlshell/README.md +59 -0
- sqlshell/__init__.py +1 -1
- sqlshell/context_suggester.py +765 -0
- sqlshell/create_test_data.py +106 -30
- sqlshell/db/__init__.py +5 -0
- sqlshell/db/database_manager.py +837 -0
- sqlshell/editor.py +610 -52
- sqlshell/main.py +2657 -1164
- sqlshell/menus.py +171 -0
- sqlshell/query_tab.py +201 -0
- sqlshell/resources/create_icon.py +106 -28
- sqlshell/resources/create_splash.py +41 -11
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/splash_screen.py +276 -48
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +275 -0
- sqlshell/table_list.py +907 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +403 -0
- sqlshell/utils/__init__.py +8 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_keys.py +356 -0
- sqlshell-0.2.0.dist-info/METADATA +198 -0
- sqlshell-0.2.0.dist-info/RECORD +41 -0
- {sqlshell-0.1.8.dist-info → sqlshell-0.2.0.dist-info}/WHEEL +1 -1
- sqlshell/setup.py +0 -42
- sqlshell-0.1.8.dist-info/METADATA +0 -120
- sqlshell-0.1.8.dist-info/RECORD +0 -21
- {sqlshell-0.1.8.dist-info → sqlshell-0.2.0.dist-info}/entry_points.txt +0 -0
- {sqlshell-0.1.8.dist-info → sqlshell-0.2.0.dist-info}/top_level.txt +0 -0
sqlshell/utils/profile_entropy.py (new file; this hunk matches the +347 entry in the file list above)

@@ -0,0 +1,347 @@
+import pandas as pd
+import numpy as np
+from PyQt6.QtCore import QObject, pyqtSignal, Qt
+from PyQt6.QtWidgets import (
+    QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
+    QTableView, QHeaderView, QLabel, QFrame, QScrollArea
+)
+from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QBrush
+
+
+class EntropyProfiler(QObject):
+    """Class to calculate entropy of columns in a dataframe"""
+
+    progress_updated = pyqtSignal(int, str)  # Signal for progress reporting
+
+    def __init__(self):
+        super().__init__()
+
+    def calculate_entropy(self, series):
+        """Calculate Shannon entropy for a series of values"""
+        # Handle NaN values by dropping them
+        series = series.dropna()
+
+        if len(series) == 0:
+            return 0.0
+
+        # For numerical data with many unique values, bin the data
+        if series.dtype.kind in 'ifc' and series.nunique() > 10:
+            # Create bins (10 bins by default)
+            series = pd.cut(series, bins=10)
+
+        # Calculate value counts and probabilities
+        value_counts = series.value_counts(normalize=True)
+        # Drop zero-probability entries (e.g. empty bins) so log2(0) cannot yield NaN
+        value_counts = value_counts[value_counts > 0]
+
+        # Calculate entropy: -sum(p * log2(p))
+        entropy = -np.sum(value_counts * np.log2(value_counts))
+        return entropy
+
+    def normalize_entropy(self, entropy_value, max_entropy):
+        """Normalize entropy value to the 0-1 range"""
+        if max_entropy == 0:
+            return 0.0
+        return entropy_value / max_entropy
+
+    def profile(self, df):
+        """
+        Profile a dataframe to identify the most important columns based on entropy.
+
+        Args:
+            df: pandas DataFrame to analyze
+
+        Returns:
+            DataFrame with columns ranked by importance (entropy)
+        """
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError("Input must be a pandas DataFrame")
+
+        if df.empty:
+            return pd.DataFrame(columns=['column', 'entropy', 'normalized_entropy', 'importance'])
+
+        results = []
+        total_columns = len(df.columns)
+
+        # Calculate entropy for each column
+        for i, column in enumerate(df.columns):
+            # Emit progress signal (if connected)
+            self.progress_updated.emit(int((i / total_columns) * 100), f"Analyzing column: {column}")
+
+            try:
+                entropy_value = self.calculate_entropy(df[column])
+                results.append({
+                    'column': column,
+                    'entropy': entropy_value
+                })
+            except Exception:
+                # Skip columns that can't be analyzed
+                continue
+
+        # Create results dataframe
+        result_df = pd.DataFrame(results)
+
+        if result_df.empty:
+            return pd.DataFrame(columns=['column', 'entropy', 'normalized_entropy', 'importance'])
+
+        # Calculate max entropy for normalization
+        max_entropy = result_df['entropy'].max()
+
+        # Add normalized entropy
+        result_df['normalized_entropy'] = result_df['entropy'].apply(
+            lambda x: self.normalize_entropy(x, max_entropy)
+        )
+
+        # Rank by importance (normalized entropy)
+        result_df = result_df.sort_values(by='normalized_entropy', ascending=False)
+
+        # Add importance label
+        def get_importance(value):
+            if value >= 0.8:
+                return "High"
+            elif value >= 0.5:
+                return "Medium"
+            elif value >= 0.3:
+                return "Low"
+            else:
+                return "Very Low"
+
+        result_df['importance'] = result_df['normalized_entropy'].apply(get_importance)
+
+        self.progress_updated.emit(100, "Analysis complete")
+        return result_df
+
+
+class EntropyVisualization(QMainWindow):
+    """Window to visualize entropy results"""
+
+    def __init__(self, results_df, parent=None):
+        super().__init__(parent)
+        self.setWindowTitle("Column Entropy Profile")
+        self.resize(800, 600)
+
+        # Create central widget and layout
+        central_widget = QWidget()
+        self.setCentralWidget(central_widget)
+        layout = QVBoxLayout(central_widget)
+
+        # Add a title
+        title = QLabel("Column Importance Analysis (Entropy-Based)")
+        title.setAlignment(Qt.AlignmentFlag.AlignCenter)
+        title.setStyleSheet("font-size: 16pt; font-weight: bold; margin: 10px;")
+        layout.addWidget(title)
+
+        # Add a description
+        description = QLabel(
+            "Columns with higher entropy values contain more information and are likely more important for analysis."
+        )
+        description.setAlignment(Qt.AlignmentFlag.AlignCenter)
+        description.setWordWrap(True)
+        layout.addWidget(description)
+
+        # Create visual bars representation
+        self.create_visual_bars(layout, results_df)
+
+        # Create table view
+        self.create_table_view(layout, results_df)
+
+    def create_visual_bars(self, layout, df):
+        """Create horizontal bars representing entropy values"""
+        frame = QFrame()
+        frame.setFrameShape(QFrame.Shape.StyledPanel)
+        frame.setLineWidth(1)
+
+        # Create a scroll area for the bars
+        scroll_area = QScrollArea()
+        scroll_area.setWidgetResizable(True)
+        scroll_area.setFrameShape(QFrame.Shape.NoFrame)
+
+        # Content widget for the scroll area
+        content_widget = QWidget()
+        bars_layout = QVBoxLayout(content_widget)
+
+        # Scale for better visualization
+        max_width = 500
+
+        # Color per importance bucket
+        importance_colors = {
+            "High": QColor(52, 152, 219),     # Blue
+            "Medium": QColor(46, 204, 113),   # Green
+            "Low": QColor(241, 196, 15),      # Yellow
+            "Very Low": QColor(230, 126, 34)  # Orange
+        }
+
+        # Header
+        header = QLabel("Visualization of Column Importance (by Normalized Entropy)")
+        header.setAlignment(Qt.AlignmentFlag.AlignCenter)
+        header.setStyleSheet("font-weight: bold; margin-top: 10px;")
+        bars_layout.addWidget(header)
+
+        # Create a bar for each column
+        for _, row in df.iterrows():
+            bar_container = QWidget()
+            bar_layout = QVBoxLayout(bar_container)
+            bar_layout.setContentsMargins(0, 2, 0, 2)
+
+            # Column name and value
+            label_text = f"{row['column']}: {row['normalized_entropy']:.3f} ({row['importance']})"
+            label = QLabel(label_text)
+            bar_layout.addWidget(label)
+
+            # Bar sized by normalized entropy
+            bar_width = int(row['normalized_entropy'] * max_width)
+            bar = QFrame()
+            bar.setFixedHeight(20)
+            bar.setFixedWidth(bar_width)
+            bar.setStyleSheet(f"background-color: {importance_colors[row['importance']].name()}; border-radius: 2px;")
+
+            # Container to left-align the bar
+            bar_container_inner = QWidget()
+            bar_container_layout = QHBoxLayout(bar_container_inner)
+            bar_container_layout.setContentsMargins(0, 0, 0, 0)
+            bar_container_layout.addWidget(bar)
+            bar_container_layout.addStretch()
+
+            bar_layout.addWidget(bar_container_inner)
+            bars_layout.addWidget(bar_container)
+
+        bars_layout.addStretch()
+
+        # Set the content widget to the scroll area
+        scroll_area.setWidget(content_widget)
+
+        # Add the scroll area to the frame layout
+        frame_layout = QVBoxLayout(frame)
+        frame_layout.addWidget(scroll_area)
+
+        # Add to main layout
+        layout.addWidget(frame)
+
+        # Set a reasonable maximum height for the scroll area
+        if len(df) > 10:
+            scroll_area.setMaximumHeight(400)
+
+    def create_table_view(self, layout, df):
+        """Create a table view showing the entropy results"""
+        # Create the model
+        model = QStandardItemModel()
+        model.setHorizontalHeaderLabels(['Column', 'Entropy', 'Normalized Entropy', 'Importance'])
+
+        # Set table data
+        for _, row in df.iterrows():
+            column_item = QStandardItem(str(row['column']))
+            entropy_item = QStandardItem(f"{row['entropy']:.4f}")
+            norm_entropy_item = QStandardItem(f"{row['normalized_entropy']:.4f}")
+            importance_item = QStandardItem(row['importance'])
+
+            # Set alignment
+            entropy_item.setTextAlignment(Qt.AlignmentFlag.AlignRight | Qt.AlignmentFlag.AlignVCenter)
+            norm_entropy_item.setTextAlignment(Qt.AlignmentFlag.AlignRight | Qt.AlignmentFlag.AlignVCenter)
+            importance_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter | Qt.AlignmentFlag.AlignVCenter)
+
+            # Color based on importance
+            if row['importance'] == 'High':
+                importance_item.setBackground(QBrush(QColor(52, 152, 219)))   # Blue
+            elif row['importance'] == 'Medium':
+                importance_item.setBackground(QBrush(QColor(46, 204, 113)))   # Green
+            elif row['importance'] == 'Low':
+                importance_item.setBackground(QBrush(QColor(241, 196, 15)))   # Yellow
+            else:  # Very Low
+                importance_item.setBackground(QBrush(QColor(230, 126, 34)))   # Orange
+
+            model.appendRow([column_item, entropy_item, norm_entropy_item, importance_item])
+
+        # Create and configure the table view
+        table_view = QTableView()
+        table_view.setModel(model)
+        table_view.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
+        table_view.setAlternatingRowColors(True)
+        table_view.setMinimumHeight(200)
+
+        layout.addWidget(table_view)
+
+
+# Function interface for simpler usage
+def profile(df):
+    """
+    Profile a dataframe to identify the most important columns based on entropy.
+
+    Args:
+        df: pandas DataFrame to analyze
+
+    Returns:
+        DataFrame with columns ranked by importance (entropy)
+    """
+    profiler = EntropyProfiler()
+    return profiler.profile(df)
+
+
+def visualize_profile(df):
+    """
+    Create a visual representation of the entropy profile for a dataframe.
+
+    Args:
+        df: pandas DataFrame to analyze
+
+    Returns:
+        A PyQt6 window showing the visualization
+    """
+    profiler = EntropyProfiler()
+    results = profiler.profile(df)
+    vis = EntropyVisualization(results)
+    vis.show()
+    return vis
+
+
+def test_profile_entropy():
+    """Test the entropy profiler with a sample dataframe"""
+    import sys
+
+    # Create a QApplication instance if one doesn't exist
+    app = QApplication.instance()
+    if app is None:
+        app = QApplication(sys.argv)
+
+    # Generate a random dataframe with columns of varying entropy levels
+    np.random.seed(42)  # For reproducibility
+
+    df = pd.DataFrame({
+        'uniform': np.random.randint(0, 100, size=1000),  # High entropy (uniform distribution)
+        'normal': np.random.normal(50, 10, size=1000),    # Medium entropy
+        'binary': np.random.choice([0, 1], size=1000),    # Low entropy (only two values)
+        'constant': np.ones(1000),                        # Zero entropy (same value)
+        'skewed': np.random.exponential(5, size=1000),    # Skewed distribution
+        'categorical': np.random.choice(['A', 'B', 'C'], size=1000),  # Categorical data
+        'mixed': np.random.randint(0, 100, size=1000) * np.random.choice([0, 1], size=1000),  # Mixed data
+        'datetime': pd.date_range('2020-01-01', periods=1000),  # Datetime data
+        'text': pd.Series(['a', 'b', 'c'] * 334)[:1000],  # Text data
+        'boolean': np.random.choice([True, False], size=1000),  # Boolean data
+        # Ten dummy columns repeating the distributions above
+        'dummy1': np.random.randint(0, 100, size=1000),
+        'dummy2': np.random.normal(50, 10, size=1000),
+        'dummy3': np.random.choice([0, 1], size=1000),
+        'dummy4': np.ones(1000),
+        'dummy5': np.random.exponential(5, size=1000),
+        'dummy6': np.random.randint(0, 100, size=1000),
+        'dummy7': np.random.normal(50, 10, size=1000),
+        'dummy8': np.random.choice([0, 1], size=1000),
+        'dummy9': np.ones(1000),
+        'dummy10': np.random.exponential(5, size=1000),
+    })
+
+    # Add a categorical column with few categories
+    df['category'] = np.random.choice(['A', 'B', 'C'], size=1000)
+
+    # Calculate and display profile information
+    print("Entropy Profile Results:")
+    profiler = EntropyProfiler()
+    result = profiler.profile(df)
+    print(result)
+
+    # Visualize the results (keep a reference so the window is not garbage-collected)
+    vis = visualize_profile(df)
+
+    # Start the application event loop
+    app.exec()
+
+
+if __name__ == "__main__":
+    test_profile_entropy()
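For reviewers who want to sanity-check the new entropy module without launching the Qt window, here is a minimal headless sketch. It assumes sqlshell 0.2.0 is installed from this wheel; the import path follows the sqlshell/utils/profile_entropy.py entry in the file list above, and the column names are invented for illustration. As a quick cross-check on the math, a fair binary column has H = -(0.5·log2 0.5 + 0.5·log2 0.5) = 1 bit, so it should rank between the uniform and constant columns:

import numpy as np
import pandas as pd
from sqlshell.utils.profile_entropy import EntropyProfiler

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "uniform": rng.integers(0, 100, 500),  # ~log2(10) bits after 10-way binning
    "coin": rng.integers(0, 2, 500),       # ~1 bit (fair binary column)
    "constant": np.zeros(500),             # 0 bits (single value)
})

profiler = EntropyProfiler()
# The progress signal is emitted synchronously, so no Qt event loop is needed
profiler.progress_updated.connect(lambda pct, msg: print(f"{pct:3d}% {msg}"))
ranking = profiler.profile(df)
print(ranking[["column", "normalized_entropy", "importance"]])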
sqlshell/utils/profile_keys.py (new file; this hunk matches the +356 entry in the file list above)

@@ -0,0 +1,356 @@
+import sys
+import itertools
+import pandas as pd
+import random
+from PyQt6.QtWidgets import (
+    QApplication, QWidget, QVBoxLayout, QLabel, QTableWidget, QTableWidgetItem, QHeaderView, QTabWidget, QMainWindow
+)
+from PyQt6.QtCore import Qt
+
+
+def find_functional_dependencies(df: pd.DataFrame, max_lhs_size: int = 2):
+    """
+    Discover all functional dependencies X -> A in the DataFrame for |X| <= max_lhs_size.
+    Returns a list of tuples (lhs, rhs).
+    """
+    fds = []
+    cols = list(df.columns)
+
+    for size in range(1, max_lhs_size + 1):
+        for lhs in itertools.combinations(cols, size):
+            # Group by the candidate determinant (LHS) once per combination
+            grouped = df.groupby(list(lhs))
+            for rhs in cols:
+                if rhs in lhs:
+                    continue
+                # The FD holds if every LHS group contains a single RHS value
+                distinct_counts = grouped[rhs].nunique(dropna=False)
+                if (distinct_counts <= 1).all():
+                    fds.append((lhs, rhs))
+    return fds
+
+
+def propose_normalized_tables(cols, candidate_keys, fds):
+    """
+    Propose a set of normalized tables based on functional dependencies.
+    Uses a simplified approach to create 3NF tables.
+
+    Parameters:
+    - cols: list of all columns
+    - candidate_keys: list of candidate keys
+    - fds: list of functional dependencies as (lhs, rhs) tuples
+
+    Returns:
+    - List of proposed tables as (table_name, primary_key, attributes) tuples
+    """
+    # Start with a set of all attributes
+    all_attrs = set(cols)
+    proposed_tables = []
+
+    # Group FDs by their determinants (LHS)
+    determinant_groups = {}
+    for lhs, rhs in fds:
+        lhs_key = tuple(sorted(lhs))
+        if lhs_key not in determinant_groups:
+            determinant_groups[lhs_key] = []
+        determinant_groups[lhs_key].append(rhs)
+
+    # Create a table for each determinant group
+    table_counter = 1
+    for lhs, rhs_list in determinant_groups.items():
+        table_attrs = set(lhs) | set(rhs_list)
+        if table_attrs:  # Skip empty tables
+            table_name = f"Table_{table_counter}"
+            primary_key = ", ".join(lhs)
+            attributes = list(table_attrs)
+            proposed_tables.append((table_name, primary_key, attributes))
+            table_counter += 1
+
+    # Create a table for any attributes not covered by an FD,
+    # keyed by a candidate key when one overlaps them
+    used_attrs = set()
+    for _, _, attrs in proposed_tables:
+        used_attrs.update(attrs)
+
+    remaining_attrs = all_attrs - used_attrs
+    if remaining_attrs:
+        # If we have a candidate key, use it for the remaining attributes
+        for key in candidate_keys:
+            key_set = set(key)
+            if key_set & remaining_attrs:  # Key overlaps the remaining attrs
+                table_name = f"Table_{table_counter}"
+                primary_key = ", ".join(key)
+                attributes = list(remaining_attrs | key_set)
+                proposed_tables.append((table_name, primary_key, attributes))
+                break
+        else:  # No suitable candidate key
+            table_name = f"Table_{table_counter}"
+            primary_key = "id (suggested)"
+            attributes = list(remaining_attrs)
+            proposed_tables.append((table_name, primary_key, attributes))
+
+    return proposed_tables
+
+
+def profile(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
+    """
+    Analyze a pandas DataFrame to suggest candidate keys and discover functional dependencies.
+
+    Parameters:
+    - df: pandas.DataFrame to analyze.
+    - max_combination_size: max size of column combos to test for keys.
+    - max_lhs_size: max size of LHS in discovered FDs.
+
+    Returns:
+    - Tuple of (fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables)
+    """
+    n_rows = len(df)
+    cols = list(df.columns)
+
+    # Discover functional dependencies
+    fds = find_functional_dependencies(df, max_lhs_size)
+
+    # Prepare FD results
+    fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]
+
+    # Find keys: column combinations whose values are unique across all rows
+    all_keys = []
+    for size in range(1, max_combination_size + 1):
+        for combo in itertools.combinations(cols, size):
+            unique_count = df.drop_duplicates(subset=combo).shape[0]
+            if unique_count == n_rows:
+                all_keys.append(combo)
+
+    # Distinguish between candidate keys and superkeys
+    candidate_keys = []
+    superkeys = []
+
+    for key in all_keys:
+        is_candidate = True
+        # Check if any proper subset of this key is also a key
+        for i in range(1, len(key)):
+            for subset in itertools.combinations(key, i):
+                if subset in all_keys:
+                    is_candidate = False
+                    break
+            if not is_candidate:
+                break
+
+        if is_candidate:
+            candidate_keys.append(key)
+        else:
+            superkeys.append(key)
+
+    # Prepare results for all combinations (keys and non-keys alike)
+    results = []
+    for size in range(1, max_combination_size + 1):
+        for combo in itertools.combinations(cols, size):
+            unique_count = df.drop_duplicates(subset=combo).shape[0]
+            unique_ratio = unique_count / n_rows
+            is_key = combo in all_keys
+
+            # Use icons for the different key types
+            key_type = ""
+            if combo in candidate_keys:
+                key_type = "★ Candidate Key"  # Star for candidate keys
+            elif combo in superkeys:
+                key_type = "⊃ Superkey"       # Superset symbol for superkeys
+
+            results.append((combo, unique_count, unique_ratio, is_key, key_type))
+
+    # Keys first, then by uniqueness ratio (descending), then by combination size
+    results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
+    key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
+                   for c, u, _, _, k in results]
+
+    # Propose normalized tables
+    normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)
+
+    return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables
+
+
+def visualize_profile(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
+    """
+    Create a visual representation of the key profile for a dataframe.
+
+    Parameters:
+    - df: pandas.DataFrame to analyze.
+    - max_combination_size: max size of column combos to test for keys.
+    - max_lhs_size: max size of LHS in discovered FDs.
+
+    Returns:
+    - QMainWindow: The visualization window
+    """
+    # Get profile results
+    fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables = profile(
+        df, max_combination_size, max_lhs_size
+    )
+
+    # Create main window
+    window = QMainWindow()
+    window.setWindowTitle("Table Profile: Keys & Dependencies")
+    window.resize(900, 700)
+
+    # Create central widget and layout
+    central_widget = QWidget()
+    window.setCentralWidget(central_widget)
+    layout = QVBoxLayout(central_widget)
+
+    # Add header
+    header = QLabel(f"Analyzed {n_rows} rows × {len(cols)} columns; key combos up to size {max_combination_size}, FDs up to LHS size {max_lhs_size}")
+    header.setAlignment(Qt.AlignmentFlag.AlignCenter)
+    header.setStyleSheet("font-size: 14pt; font-weight: bold; margin: 10px;")
+    layout.addWidget(header)
+
+    # Add description
+    description = QLabel(
+        "This profile helps identify candidate keys and functional dependencies in your data. "
+        "★ Candidate keys are minimal combinations of columns that uniquely identify rows. "
+        "⊃ Superkeys are non-minimal column sets that uniquely identify rows. "
+        "Functional dependencies indicate when one column's values determine another's."
+    )
+    description.setAlignment(Qt.AlignmentFlag.AlignCenter)
+    description.setWordWrap(True)
+    description.setStyleSheet("margin-bottom: 10px;")
+    layout.addWidget(description)
+
+    # Add key for icons
+    icons_key = QLabel("Key: ★ = Minimal Candidate Key | ⊃ = Non-minimal Superkey")
+    icons_key.setAlignment(Qt.AlignmentFlag.AlignCenter)
+    icons_key.setStyleSheet("font-style: italic; margin-bottom: 15px;")
+    layout.addWidget(icons_key)
+
+    # Create tabs
+    tabs = QTabWidget()
+
+    # Tab for keys
+    key_tab = QWidget()
+    key_layout = QVBoxLayout()
+
+    key_header = QLabel("Keys (Column Combinations that Uniquely Identify Rows)")
+    key_header.setStyleSheet("font-weight: bold;")
+    key_layout.addWidget(key_header)
+
+    key_table = QTableWidget(len(key_results), 4)
+    key_table.setHorizontalHeaderLabels(["Columns", "Unique Count", "Uniqueness Ratio", "Key Type"])
+    key_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
+    for row, (cols_str, count, ratio, key_type) in enumerate(key_results):
+        key_table.setItem(row, 0, QTableWidgetItem(cols_str))
+        key_table.setItem(row, 1, QTableWidgetItem(str(count)))
+        key_table.setItem(row, 2, QTableWidgetItem(ratio))
+
+        # Create item with appropriate styling
+        type_item = QTableWidgetItem(key_type)
+        if "Candidate Key" in key_type:
+            type_item.setForeground(Qt.GlobalColor.darkGreen)
+        elif "Superkey" in key_type:
+            type_item.setForeground(Qt.GlobalColor.darkBlue)
+        key_table.setItem(row, 3, type_item)
+
+    key_layout.addWidget(key_table)
+    key_tab.setLayout(key_layout)
+    tabs.addTab(key_tab, "Keys")
+
+    # Tab for FDs
+    fd_tab = QWidget()
+    fd_layout = QVBoxLayout()
+
+    fd_header = QLabel("Functional Dependencies (When Values in One Set of Columns Determine Another Column)")
+    fd_header.setStyleSheet("font-weight: bold;")
+    fd_layout.addWidget(fd_header)
+
+    fd_table = QTableWidget(len(fd_results), 2)
+    fd_table.setHorizontalHeaderLabels(["Determinant (LHS)", "Dependent (RHS)"])
+    fd_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
+    for i, (lhs, rhs) in enumerate(fd_results):
+        lhs_item = QTableWidgetItem(lhs)
+        lhs_item.setFlags(lhs_item.flags() ^ Qt.ItemFlag.ItemIsEditable)  # Make read-only
+        fd_table.setItem(i, 0, lhs_item)
+        fd_table.setItem(i, 1, QTableWidgetItem(rhs))
+    fd_layout.addWidget(fd_table)
+    fd_tab.setLayout(fd_layout)
+    tabs.addTab(fd_tab, "Functional Dependencies")
+
+    # Tab for normalized tables
+    norm_tab = QWidget()
+    norm_layout = QVBoxLayout()
+
+    norm_header = QLabel("Proposed Normalized Tables (Based on Functional Dependencies)")
+    norm_header.setStyleSheet("font-weight: bold;")
+    norm_layout.addWidget(norm_header)
+
+    norm_description = QLabel(
+        "These tables represent a proposed normalized schema based on the discovered functional dependencies. "
+        "Each table includes attributes that are functionally dependent on its primary key. "
+        "This is an approximate 3NF decomposition and may need further refinement."
+    )
+    norm_description.setWordWrap(True)
+    norm_description.setStyleSheet("margin-bottom: 10px;")
+    norm_layout.addWidget(norm_description)
+
+    norm_table = QTableWidget(len(normalized_tables), 3)
+    norm_table.setHorizontalHeaderLabels(["Table Name", "Primary Key", "Attributes"])
+    norm_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
+    for i, (table_name, primary_key, attributes) in enumerate(normalized_tables):
+        norm_table.setItem(i, 0, QTableWidgetItem(table_name))
+
+        pk_item = QTableWidgetItem(primary_key)
+        pk_item.setForeground(Qt.GlobalColor.darkGreen)
+        norm_table.setItem(i, 1, pk_item)
+
+        norm_table.setItem(i, 2, QTableWidgetItem(", ".join(attributes)))
+
+    norm_layout.addWidget(norm_table)
+    norm_tab.setLayout(norm_layout)
+    tabs.addTab(norm_tab, "Normalized Tables")
+
+    layout.addWidget(tabs)
+
+    # Show the window
+    window.show()
+    return window
+
+
+def test_profile_keys(test_size=100):
+    """Generate a small customer-product-order dataset and visualize its profile"""
+    # Create customer data
+    customer_ids = list(range(1, 21))  # 20 customers
+    customer_names = ["John", "Jane", "Alice", "Bob", "Charlie", "Diana", "Edward", "Fiona", "George", "Hannah"]
+
+    # Create product data
+    product_names = ["Apple", "Banana", "Orange", "Grape", "Mango", "Strawberry", "Blueberry", "Kiwi", "Pineapple", "Watermelon"]
+
+    # Generate random orders
+    random.seed(42)  # For reproducibility
+    df_data = {
+        "customer_id": [random.choice(customer_ids) for _ in range(test_size)],
+        "customer_name": [customer_names[i % len(customer_names)] for i in range(test_size)],
+        "product_name": [random.choice(product_names) for _ in range(test_size)],
+        "product_group": ["Fruit" for _ in range(test_size)],
+        "order_date": [pd.Timestamp("2021-01-01") + pd.Timedelta(days=random.randint(0, 30)) for _ in range(test_size)],
+        "order_amount": [random.randint(100, 1000) for _ in range(test_size)]
+    }
+
+    # Ensure customer_name is consistently associated with customer_id
+    for i in range(test_size):
+        customer_idx = df_data["customer_id"][i] % len(customer_names)
+        df_data["customer_name"][i] = customer_names[customer_idx]
+
+    df = pd.DataFrame(df_data)
+
+    # Create and show visualization
+    app = QApplication(sys.argv)
+    window = visualize_profile(df, max_combination_size=3, max_lhs_size=2)
+    sys.exit(app.exec())
+
+
+# Only run the test function when the script is executed directly
+if __name__ == "__main__":
+    test_profile_keys()
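And a similar headless sketch for the key and dependency profiler, under the same assumptions (sqlshell 0.2.0 installed from this wheel, import path per the file list above; the toy table is invented for illustration). Here customer_name is functionally dependent on customer_id, and order_id on its own is a candidate key:

import pandas as pd
from sqlshell.utils.profile_keys import find_functional_dependencies, profile

df = pd.DataFrame({
    "order_id": [1, 2, 3, 4],
    "customer_id": [10, 10, 20, 20],
    "customer_name": ["Ann", "Ann", "Bo", "Bo"],  # one name per customer_id
})

fds = find_functional_dependencies(df, max_lhs_size=1)
print(fds)  # includes (('customer_id',), 'customer_name')

fd_results, key_results, *_ = profile(df, max_combination_size=2, max_lhs_size=1)
print(key_results[0])  # ('order_id', 4, '100.00%', '★ Candidate Key')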