sqlshell 0.1.8-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sqlshell might be problematic. Click here for more details.

@@ -0,0 +1,347 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from PyQt6.QtCore import QObject, pyqtSignal, Qt
4
+ from PyQt6.QtWidgets import (
5
+ QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
6
+ QTableView, QHeaderView, QLabel, QFrame, QScrollArea
7
+ )
8
+ from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QPalette, QBrush
9
+
10
class EntropyProfiler(QObject):
    """Rank the columns of a DataFrame by information content (Shannon entropy)."""

    # Emits (percent_complete, status_message) while profile() runs.
    progress_updated = pyqtSignal(int, str)

    def __init__(self):
        super().__init__()

    def calculate_entropy(self, series):
        """Return the Shannon entropy (in bits) of *series*.

        NaN values are dropped first.  Numeric columns ('ifc' = integer,
        float, complex dtype kinds) with more than 10 distinct values are
        discretized into 10 equal-width bins so continuous data does not
        trivially receive maximal entropy.
        """
        series = series.dropna()

        if len(series) == 0:
            return 0.0

        if series.dtype.kind in 'ifc' and series.nunique() > 10:
            series = pd.cut(series, bins=10)

        # Probability of each observed value (or bin).
        value_counts = series.value_counts(normalize=True)

        # BUG FIX: pd.cut returns a Categorical, and value_counts on a
        # Categorical includes empty bins with probability 0.  In numpy,
        # 0 * log2(0) evaluates to NaN, which previously made the whole
        # entropy NaN whenever any bin was empty.  Zero-probability
        # outcomes contribute nothing to Shannon entropy, so drop them.
        value_counts = value_counts[value_counts > 0]

        # Shannon entropy: -sum(p * log2(p))
        return float(-np.sum(value_counts * np.log2(value_counts)))

    def normalize_entropy(self, entropy_value, max_entropy):
        """Scale *entropy_value* into the 0-1 range relative to *max_entropy*.

        Returns 0.0 when *max_entropy* is 0 (e.g. all columns constant).
        """
        if max_entropy == 0:
            return 0.0
        return entropy_value / max_entropy

    def profile(self, df):
        """
        Profile a dataframe to identify the most important columns based on entropy.

        Args:
            df: pandas DataFrame to analyze

        Returns:
            DataFrame with columns 'column', 'entropy', 'normalized_entropy'
            and 'importance', sorted by normalized entropy (descending).

        Raises:
            TypeError: if *df* is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        if df.empty:
            return pd.DataFrame(columns=['column', 'entropy', 'normalized_entropy', 'importance'])

        results = []
        total_columns = len(df.columns)

        for i, column in enumerate(df.columns):
            # Progress signal is a no-op unless a slot is connected.
            self.progress_updated.emit(int((i / total_columns) * 100), f"Analyzing column: {column}")

            try:
                entropy_value = self.calculate_entropy(df[column])
                results.append({
                    'column': column,
                    'entropy': entropy_value
                })
            except Exception:
                # Best-effort: skip columns that cannot be analyzed
                # (e.g. unhashable or mixed-type values).
                continue

        result_df = pd.DataFrame(results)

        if result_df.empty:
            return pd.DataFrame(columns=['column', 'entropy', 'normalized_entropy', 'importance'])

        # Normalize against the highest-entropy column.
        max_entropy = result_df['entropy'].max()
        result_df['normalized_entropy'] = result_df['entropy'].apply(
            lambda x: self.normalize_entropy(x, max_entropy)
        )

        # Most informative columns first.
        result_df = result_df.sort_values(by='normalized_entropy', ascending=False)

        def get_importance(value):
            # Bucket normalized entropy into a human-readable label.
            if value >= 0.8:
                return "High"
            elif value >= 0.5:
                return "Medium"
            elif value >= 0.3:
                return "Low"
            else:
                return "Very Low"

        result_df['importance'] = result_df['normalized_entropy'].apply(get_importance)

        self.progress_updated.emit(100, "Analysis complete")
        return result_df
109
+
110
+
111
class EntropyVisualization(QMainWindow):
    """Window to visualize entropy results.

    Expects *results_df* in the shape produced by EntropyProfiler.profile():
    columns 'column', 'entropy', 'normalized_entropy', 'importance'.
    """

    def __init__(self, results_df, parent=None):
        super().__init__(parent)
        self.setWindowTitle("Column Entropy Profile")
        self.resize(800, 600)

        # Create central widget and layout
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        layout = QVBoxLayout(central_widget)

        # Add a title
        title = QLabel("Column Importance Analysis (Entropy-Based)")
        title.setAlignment(Qt.AlignmentFlag.AlignCenter)
        title.setStyleSheet("font-size: 16pt; font-weight: bold; margin: 10px;")
        layout.addWidget(title)

        # Add a description
        description = QLabel(
            "Columns with higher entropy values contain more information and are likely more important for analysis."
        )
        description.setAlignment(Qt.AlignmentFlag.AlignCenter)
        description.setWordWrap(True)
        layout.addWidget(description)

        # Create visual bars representation
        self.create_visual_bars(layout, results_df)

        # Create table view
        self.create_table_view(layout, results_df)

    def create_visual_bars(self, layout, df):
        """Create horizontal bars representing entropy values.

        One bar per row of *df*; bar width is proportional to
        'normalized_entropy' and bar color encodes the 'importance' label.
        """
        frame = QFrame()
        frame.setFrameShape(QFrame.Shape.StyledPanel)
        frame.setLineWidth(1)

        # Create a scroll area for the bars (tables can have many columns)
        scroll_area = QScrollArea()
        scroll_area.setWidgetResizable(True)
        scroll_area.setFrameShape(QFrame.Shape.NoFrame)

        # Content widget for the scroll area
        content_widget = QWidget()
        bars_layout = QVBoxLayout(content_widget)

        # Pixel width of a bar with normalized entropy 1.0
        max_width = 500

        # Color per importance label (must cover every label produced
        # by EntropyProfiler.profile: High/Medium/Low/Very Low).
        importance_colors = {
            "High": QColor(52, 152, 219),     # Blue
            "Medium": QColor(46, 204, 113),   # Green
            "Low": QColor(241, 196, 15),      # Yellow
            "Very Low": QColor(230, 126, 34)  # Orange
        }

        # Header
        header = QLabel("Visualization of Column Importance (by Normalized Entropy)")
        header.setAlignment(Qt.AlignmentFlag.AlignCenter)
        header.setStyleSheet("font-weight: bold; margin-top: 10px;")
        bars_layout.addWidget(header)

        for _, row in df.iterrows():
            bar_container = QWidget()
            bar_layout = QVBoxLayout(bar_container)
            bar_layout.setContentsMargins(0, 2, 0, 2)

            # Column name and value, e.g. "age: 0.873 (High)"
            label_text = f"{row['column']}: {row['normalized_entropy']:.3f} ({row['importance']})"
            label = QLabel(label_text)
            bar_layout.addWidget(label)

            # The bar itself: a fixed-size colored frame
            bar_width = int(row['normalized_entropy'] * max_width)
            bar = QFrame()
            bar.setFixedHeight(20)
            bar.setFixedWidth(bar_width)
            bar.setStyleSheet(f"background-color: {importance_colors[row['importance']].name()}; border-radius: 2px;")

            # Container with trailing stretch to left-align the bar
            bar_container_inner = QWidget()
            bar_container_layout = QHBoxLayout(bar_container_inner)
            bar_container_layout.setContentsMargins(0, 0, 0, 0)
            bar_container_layout.addWidget(bar)
            bar_container_layout.addStretch()

            bar_layout.addWidget(bar_container_inner)
            bars_layout.addWidget(bar_container)

        bars_layout.addStretch()

        # Set the content widget to the scroll area
        scroll_area.setWidget(content_widget)

        # Add the scroll area to the frame layout
        frame_layout = QVBoxLayout(frame)
        frame_layout.addWidget(scroll_area)

        # Add to main layout
        layout.addWidget(frame)

        # Cap the scroll area height when there are many columns so the
        # table view below stays visible
        if len(df) > 10:
            scroll_area.setMaximumHeight(400)

    def create_table_view(self, layout, df):
        """Create a table view showing the entropy results (one row per column)."""
        # Create the model
        model = QStandardItemModel()
        model.setHorizontalHeaderLabels(['Column', 'Entropy', 'Normalized Entropy', 'Importance'])

        # Set table data
        for index, row in df.iterrows():
            column_item = QStandardItem(str(row['column']))
            entropy_item = QStandardItem(f"{row['entropy']:.4f}")
            norm_entropy_item = QStandardItem(f"{row['normalized_entropy']:.4f}")
            importance_item = QStandardItem(row['importance'])

            # Right-align numbers, center the label
            entropy_item.setTextAlignment(Qt.AlignmentFlag.AlignRight | Qt.AlignmentFlag.AlignVCenter)
            norm_entropy_item.setTextAlignment(Qt.AlignmentFlag.AlignRight | Qt.AlignmentFlag.AlignVCenter)
            importance_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter | Qt.AlignmentFlag.AlignVCenter)

            # Color the importance cell with the same palette as the bars
            if row['importance'] == 'High':
                importance_item.setBackground(QBrush(QColor(52, 152, 219)))  # Blue
            elif row['importance'] == 'Medium':
                importance_item.setBackground(QBrush(QColor(46, 204, 113)))  # Green
            elif row['importance'] == 'Low':
                importance_item.setBackground(QBrush(QColor(241, 196, 15)))  # Yellow
            else:  # Very Low
                importance_item.setBackground(QBrush(QColor(230, 126, 34)))  # Orange

            model.appendRow([column_item, entropy_item, norm_entropy_item, importance_item])

        # Create and configure the table view
        table_view = QTableView()
        table_view.setModel(model)
        table_view.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
        table_view.setAlternatingRowColors(True)
        table_view.setMinimumHeight(200)

        layout.addWidget(table_view)
257
+
258
+
259
+ # Function interface for simpler usage
260
def profile(df):
    """
    Profile a dataframe to identify the most important columns based on entropy.

    Convenience wrapper around :class:`EntropyProfiler`.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        DataFrame with columns ranked by importance (entropy)
    """
    return EntropyProfiler().profile(df)
272
+
273
+
274
def visualize_profile(df):
    """
    Create a visual representation of the entropy profile for a dataframe.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        A PyQt6 window showing the visualization (keep a reference to it,
        otherwise it may be garbage-collected and closed).
    """
    results = EntropyProfiler().profile(df)
    window = EntropyVisualization(results)
    window.show()
    return window
289
+
290
+
291
def test_profile_entropy():
    """Interactive demo: profile a synthetic dataframe and show the window."""
    import sys

    # Create a QApplication instance if one doesn't exist
    app = QApplication.instance()
    if app is None:
        app = QApplication(sys.argv)

    # Generate a random dataframe with some columns with different distributions
    np.random.seed(42)  # For reproducibility

    # Create a dataframe with columns of varying entropy levels
    df = pd.DataFrame({
        'uniform': np.random.randint(0, 100, size=1000),  # High entropy (uniform distribution)
        'normal': np.random.normal(50, 10, size=1000),  # Medium entropy
        'binary': np.random.choice([0, 1], size=1000),  # Low entropy (only two values)
        'constant': np.ones(1000),  # Zero entropy (same value)
        'skewed': np.random.exponential(5, size=1000),  # Skewed distribution
        'categorical': np.random.choice(['A', 'B', 'C'], size=1000),  # Categorical data
        'mixed': np.random.randint(0, 100, size=1000) * np.random.choice([0, 1], size=1000),  # Mixed data
        'datetime': pd.date_range('2020-01-01', periods=1000),  # Datetime data
        'text': pd.Series(['a', 'b', 'c'] * 334)[:1000],  # Text data (1002 values truncated to 1000)
        'boolean': np.random.choice([True, False], size=1000),  # Boolean data
        # five more dummy columns repeating the distributions above
        'dummy1': np.random.randint(0, 100, size=1000),
        'dummy2': np.random.normal(50, 10, size=1000),
        'dummy3': np.random.choice([0, 1], size=1000),
        'dummy4': np.ones(1000),
        'dummy5': np.random.exponential(5, size=1000),
        # five more dummy columns with the same distributions
        'dummy6': np.random.randint(0, 100, size=1000),
        'dummy7': np.random.normal(50, 10, size=1000),
        'dummy8': np.random.choice([0, 1], size=1000),
        'dummy9': np.ones(1000),
        'dummy10': np.random.exponential(5, size=1000),

    })

    # Add a categorical column with few categories
    df['category'] = np.random.choice(['A', 'B', 'C'], size=1000)

    # Calculate and display profile information
    print("Entropy Profile Results:")
    profiler = EntropyProfiler()
    result = profiler.profile(df)
    print(result)

    # Visualize the results (visualize_profile re-runs the profiling)
    vis = visualize_profile(df)

    # Start the application event loop (blocks until the window closes)
    app.exec()
344
+
345
+
346
if __name__ == "__main__":
    # Run the interactive demo only when this module is executed directly.
    test_profile_entropy()
@@ -0,0 +1,356 @@
1
+ import sys
2
+ import itertools
3
+ import pandas as pd
4
+ import random
5
+ from PyQt6.QtWidgets import (
6
+ QApplication, QWidget, QVBoxLayout, QLabel, QTableWidget, QTableWidgetItem, QHeaderView, QTabWidget, QMainWindow
7
+ )
8
+ from PyQt6.QtCore import Qt
9
+
10
+
11
def find_functional_dependencies(df: pd.DataFrame, max_lhs_size: int = 2):
    """
    Discover all functional dependencies X -> A in the DataFrame for |X| <= max_lhs_size.

    A dependency lhs -> rhs holds when, within every group of rows sharing
    the same lhs values, the rhs column has at most one distinct value
    (NaN counted as a value via dropna=False).

    Args:
        df: DataFrame to analyze.
        max_lhs_size: maximum number of columns on the left-hand side.

    Returns:
        A list of (lhs, rhs) tuples, where lhs is a tuple of column names
        and rhs a single column name, ordered by LHS size then column order.
    """
    fds = []
    cols = list(df.columns)

    for size in range(1, max_lhs_size + 1):
        for lhs in itertools.combinations(cols, size):
            # Group once per LHS; every candidate RHS reuses this grouping.
            # NOTE(review): groupby's default dropna=True excludes rows with
            # NaN in the LHS columns from the check — confirm this is intended.
            grouped = df.groupby(list(lhs))
            for rhs in cols:
                if rhs in lhs:
                    continue
                # lhs -> rhs holds iff rhs is constant within every group.
                distinct_counts = grouped[rhs].nunique(dropna=False)
                if (distinct_counts <= 1).all():
                    fds.append((lhs, rhs))
    return fds
34
+
35
+
36
def propose_normalized_tables(cols, candidate_keys, fds):
    """
    Propose a set of normalized tables based on functional dependencies.
    Uses a simplified approach to create 3NF tables.

    Parameters:
    - cols: list of all columns
    - candidate_keys: list of candidate keys (tuples of column names)
    - fds: list of functional dependencies as (lhs, rhs) tuples

    Returns:
    - List of proposed tables as (table_name, primary_key, attributes) tuples,
      where attributes is a sorted list of column names.
    """
    all_attrs = set(cols)
    proposed_tables = []

    # Group FDs by their determinants (LHS); each determinant becomes
    # one table holding everything it determines.
    determinant_groups = {}
    for lhs, rhs in fds:
        determinant_groups.setdefault(tuple(sorted(lhs)), []).append(rhs)

    table_counter = 1
    for lhs, rhs_list in determinant_groups.items():
        table_attrs = set(lhs) | set(rhs_list)
        if table_attrs:  # Skip empty tables
            table_name = f"Table_{table_counter}"
            primary_key = ", ".join(lhs)
            # FIX: sort attributes so the output is deterministic across
            # runs — list(set(...)) ordering depends on PYTHONHASHSEED.
            attributes = sorted(table_attrs)
            proposed_tables.append((table_name, primary_key, attributes))
            table_counter += 1

    # Any attribute not covered by an FD-derived table still needs a home.
    used_attrs = set()
    for _, _, attrs in proposed_tables:
        used_attrs.update(attrs)

    remaining_attrs = all_attrs - used_attrs
    if remaining_attrs:
        # Prefer an existing candidate key that overlaps the leftovers.
        for key in candidate_keys:
            key_set = set(key)
            if key_set & remaining_attrs:
                table_name = f"Table_{table_counter}"
                primary_key = ", ".join(key)
                attributes = sorted(remaining_attrs | key_set)
                proposed_tables.append((table_name, primary_key, attributes))
                break
        else:  # No suitable candidate key: suggest a surrogate id
            table_name = f"Table_{table_counter}"
            primary_key = "id (suggested)"
            attributes = sorted(remaining_attrs)
            proposed_tables.append((table_name, primary_key, attributes))

    return proposed_tables
96
+
97
+
98
def profile(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
    """
    Analyze a pandas DataFrame to suggest candidate keys and discover functional dependencies.

    Parameters:
    - df: pandas.DataFrame to analyze.
    - max_combination_size: max size of column combos to test for keys.
    - max_lhs_size: max size of LHS in discovered FDs.

    Returns:
    - Tuple of (fd_results, key_results, n_rows, cols, max_combination_size,
      max_lhs_size, normalized_tables)
    """
    n_rows = len(df)
    cols = list(df.columns)

    # Discover functional dependencies
    fds = find_functional_dependencies(df, max_lhs_size)
    fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]

    # Count distinct rows for every column combination ONCE.
    # (The original ran drop_duplicates twice per combo in two identical
    # loops.)  dict preserves insertion order, so combo order is kept.
    unique_counts = {}
    for size in range(1, max_combination_size + 1):
        for combo in itertools.combinations(cols, size):
            unique_counts[combo] = df.drop_duplicates(subset=combo).shape[0]

    # A combo is a key when it uniquely identifies every row.
    all_keys = [combo for combo, count in unique_counts.items() if count == n_rows]

    # Distinguish candidate keys (minimal) from superkeys (non-minimal):
    # a key is a candidate key iff no proper subset of it is also a key.
    candidate_keys = []
    superkeys = []
    for key in all_keys:
        is_candidate = not any(
            subset in all_keys
            for i in range(1, len(key))
            for subset in itertools.combinations(key, i)
        )
        if is_candidate:
            candidate_keys.append(key)
        else:
            superkeys.append(key)

    # Build the per-combo result rows (uniqueness + key classification).
    results = []
    for combo, unique_count in unique_counts.items():
        unique_ratio = unique_count / n_rows
        is_key = combo in all_keys

        # Use icons for different key types
        key_type = ""
        if combo in candidate_keys:
            key_type = "★ Candidate Key"  # Star for candidate keys
        elif combo in superkeys:
            key_type = "⊃ Superkey"  # Superset symbol for superkeys

        results.append((combo, unique_count, unique_ratio, is_key, key_type))

    # Keys first, then by descending uniqueness ratio, then smaller combos.
    results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
    key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
                   for c, u, _, _, k in results]

    # Propose normalized tables
    normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)

    return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables
176
+
177
+
178
def visualize_profile(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
    """
    Create a visual representation of the key profile for a dataframe.

    Builds a three-tab window (Keys, Functional Dependencies, Normalized
    Tables) from the results of profile().

    Parameters:
    - df: pandas.DataFrame to analyze.
    - max_combination_size: max size of column combos to test for keys.
    - max_lhs_size: max size of LHS in discovered FDs.

    Returns:
    - QMainWindow: The visualization window (keep a reference so it is
      not garbage-collected).
    """
    # Get profile results
    fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables = profile(
        df, max_combination_size, max_lhs_size
    )

    # Create main window
    window = QMainWindow()
    window.setWindowTitle("Table Profile: Keys & Dependencies")
    window.resize(900, 700)

    # Create central widget and layout
    central_widget = QWidget()
    window.setCentralWidget(central_widget)
    layout = QVBoxLayout(central_widget)

    # Add header summarizing what was analyzed
    header = QLabel(f"Analyzed {n_rows} rows × {len(cols)} columns; key combos up to size {max_combination_size}, FDs up to LHS size {max_lhs_size}")
    header.setAlignment(Qt.AlignmentFlag.AlignCenter)
    header.setStyleSheet("font-size: 14pt; font-weight: bold; margin: 10px;")
    layout.addWidget(header)

    # Add description
    description = QLabel(
        "This profile helps identify candidate keys and functional dependencies in your data. "
        "★ Candidate keys are minimal combinations of columns that uniquely identify rows. "
        "⊃ Superkeys are non-minimal column sets that uniquely identify rows. "
        "Functional dependencies indicate when one column's values determine another's."
    )
    description.setAlignment(Qt.AlignmentFlag.AlignCenter)
    description.setWordWrap(True)
    description.setStyleSheet("margin-bottom: 10px;")
    layout.addWidget(description)

    # Add key for icons
    icons_key = QLabel("Key: ★ = Minimal Candidate Key | ⊃ = Non-minimal Superkey")
    icons_key.setAlignment(Qt.AlignmentFlag.AlignCenter)
    icons_key.setStyleSheet("font-style: italic; margin-bottom: 15px;")
    layout.addWidget(icons_key)

    # Create tabs
    tabs = QTabWidget()

    # Tab 1: candidate keys / superkeys table
    key_tab = QWidget()
    key_layout = QVBoxLayout()

    key_header = QLabel("Keys (Column Combinations that Uniquely Identify Rows)")
    key_header.setStyleSheet("font-weight: bold;")
    key_layout.addWidget(key_header)

    key_table = QTableWidget(len(key_results), 4)
    key_table.setHorizontalHeaderLabels(["Columns", "Unique Count", "Uniqueness Ratio", "Key Type"])
    key_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
    for row, (cols_str, count, ratio, key_type) in enumerate(key_results):
        key_table.setItem(row, 0, QTableWidgetItem(cols_str))
        key_table.setItem(row, 1, QTableWidgetItem(str(count)))
        key_table.setItem(row, 2, QTableWidgetItem(ratio))

        # Color-code the key-type cell (green = candidate, blue = superkey)
        type_item = QTableWidgetItem(key_type)
        if "Candidate Key" in key_type:
            type_item.setForeground(Qt.GlobalColor.darkGreen)
        elif "Superkey" in key_type:
            type_item.setForeground(Qt.GlobalColor.darkBlue)
        key_table.setItem(row, 3, type_item)

    key_layout.addWidget(key_table)
    key_tab.setLayout(key_layout)
    tabs.addTab(key_tab, "Keys")

    # Tab 2: functional dependencies
    fd_tab = QWidget()
    fd_layout = QVBoxLayout()

    fd_header = QLabel("Functional Dependencies (When Values in One Set of Columns Determine Another Column)")
    fd_header.setStyleSheet("font-weight: bold;")
    fd_layout.addWidget(fd_header)

    fd_table = QTableWidget(len(fd_results), 2)
    fd_table.setHorizontalHeaderLabels(["Determinant (LHS)", "Dependent (RHS)"])
    fd_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
    for i, (lhs, rhs) in enumerate(fd_results):
        lhs_item = QTableWidgetItem(lhs)
        # XOR-ing out ItemIsEditable makes the LHS cell read-only
        lhs_item.setFlags(lhs_item.flags() ^ Qt.ItemFlag.ItemIsEditable)
        fd_table.setItem(i, 0, lhs_item)
        fd_table.setItem(i, 1, QTableWidgetItem(rhs))
    fd_layout.addWidget(fd_table)
    fd_tab.setLayout(fd_layout)
    tabs.addTab(fd_tab, "Functional Dependencies")

    # Tab 3: proposed normalized schema
    norm_tab = QWidget()
    norm_layout = QVBoxLayout()

    norm_header = QLabel("Proposed Normalized Tables (Based on Functional Dependencies)")
    norm_header.setStyleSheet("font-weight: bold;")
    norm_layout.addWidget(norm_header)

    norm_description = QLabel(
        "These tables represent a proposed normalized schema based on the discovered functional dependencies. "
        "Each table includes attributes that are functionally dependent on its primary key. "
        "This is an approximate 3NF decomposition and may need further refinement."
    )
    norm_description.setWordWrap(True)
    norm_description.setStyleSheet("margin-bottom: 10px;")
    norm_layout.addWidget(norm_description)

    norm_table = QTableWidget(len(normalized_tables), 3)
    norm_table.setHorizontalHeaderLabels(["Table Name", "Primary Key", "Attributes"])
    norm_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
    for i, (table_name, primary_key, attributes) in enumerate(normalized_tables):
        norm_table.setItem(i, 0, QTableWidgetItem(table_name))

        # Highlight the primary key in green
        pk_item = QTableWidgetItem(primary_key)
        pk_item.setForeground(Qt.GlobalColor.darkGreen)
        norm_table.setItem(i, 1, pk_item)

        norm_table.setItem(i, 2, QTableWidgetItem(", ".join(attributes)))

    norm_layout.addWidget(norm_table)
    norm_tab.setLayout(norm_layout)
    tabs.addTab(norm_tab, "Normalized Tables")

    layout.addWidget(tabs)

    # Show the window
    window.show()
    return window
318
+
319
+
320
def test_profile_keys(test_size=100):
    """Interactive demo: profile a synthetic customer/product/order dataframe.

    Args:
        test_size: number of order rows to generate.
    """
    # Generate a dataframe with some realistic examples of a customer-product-order relationship
    # Create customer data
    customer_ids = list(range(1, 21))  # 20 customers
    customer_names = ["John", "Jane", "Alice", "Bob", "Charlie", "Diana", "Edward", "Fiona", "George", "Hannah"]

    # Create product data
    product_names = ["Apple", "Banana", "Orange", "Grape", "Mango", "Strawberry", "Blueberry", "Kiwi", "Pineapple", "Watermelon"]
    # NOTE(review): product_groups is never used below ("Fruit" is hard-coded
    # in df_data instead) — candidate for removal.
    product_groups = ["Fruit"] * len(product_names)

    # Generate random orders
    random.seed(42)  # For reproducibility
    df_data = {
        "customer_id": [random.choice(customer_ids) for _ in range(test_size)],
        # NOTE(review): these initial customer_name values are overwritten by
        # the consistency loop below; only the list length matters here.
        "customer_name": [customer_names[i % len(customer_names)] for i in range(test_size)],
        "product_name": [random.choice(product_names) for _ in range(test_size)],
        "product_group": ["Fruit" for _ in range(test_size)],
        "order_date": [pd.Timestamp("2021-01-01") + pd.Timedelta(days=random.randint(0, 30)) for _ in range(test_size)],
        "order_amount": [random.randint(100, 1000) for _ in range(test_size)]
    }

    # Ensure consistent relationships so customer_id -> customer_name holds
    for i in range(test_size):
        # Ensure customer_name is consistently associated with customer_id
        customer_idx = df_data["customer_id"][i] % len(customer_names)
        df_data["customer_name"][i] = customer_names[customer_idx]

    df = pd.DataFrame(df_data)

    # Create and show visualization, then block on the Qt event loop
    app = QApplication(sys.argv)
    window = visualize_profile(df, max_combination_size=3, max_lhs_size=2)
    sys.exit(app.exec())
353
+
354
# Only run the test function when script is executed directly
if __name__ == "__main__":
    test_profile_keys()