sqlshell 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. sqlshell/__init__.py +84 -0
  2. sqlshell/__main__.py +4926 -0
  3. sqlshell/ai_autocomplete.py +392 -0
  4. sqlshell/ai_settings_dialog.py +337 -0
  5. sqlshell/context_suggester.py +768 -0
  6. sqlshell/create_test_data.py +152 -0
  7. sqlshell/data/create_test_data.py +137 -0
  8. sqlshell/db/__init__.py +6 -0
  9. sqlshell/db/database_manager.py +1318 -0
  10. sqlshell/db/export_manager.py +188 -0
  11. sqlshell/editor.py +1166 -0
  12. sqlshell/editor_integration.py +127 -0
  13. sqlshell/execution_handler.py +421 -0
  14. sqlshell/menus.py +262 -0
  15. sqlshell/notification_manager.py +370 -0
  16. sqlshell/query_tab.py +904 -0
  17. sqlshell/resources/__init__.py +1 -0
  18. sqlshell/resources/icon.png +0 -0
  19. sqlshell/resources/logo_large.png +0 -0
  20. sqlshell/resources/logo_medium.png +0 -0
  21. sqlshell/resources/logo_small.png +0 -0
  22. sqlshell/resources/splash_screen.gif +0 -0
  23. sqlshell/space_invaders.py +501 -0
  24. sqlshell/splash_screen.py +405 -0
  25. sqlshell/sqlshell/__init__.py +5 -0
  26. sqlshell/sqlshell/create_test_data.py +118 -0
  27. sqlshell/sqlshell/create_test_databases.py +96 -0
  28. sqlshell/sqlshell_demo.png +0 -0
  29. sqlshell/styles.py +257 -0
  30. sqlshell/suggester_integration.py +330 -0
  31. sqlshell/syntax_highlighter.py +124 -0
  32. sqlshell/table_list.py +996 -0
  33. sqlshell/ui/__init__.py +6 -0
  34. sqlshell/ui/bar_chart_delegate.py +49 -0
  35. sqlshell/ui/filter_header.py +469 -0
  36. sqlshell/utils/__init__.py +16 -0
  37. sqlshell/utils/profile_cn2.py +1661 -0
  38. sqlshell/utils/profile_column.py +2635 -0
  39. sqlshell/utils/profile_distributions.py +616 -0
  40. sqlshell/utils/profile_entropy.py +347 -0
  41. sqlshell/utils/profile_foreign_keys.py +779 -0
  42. sqlshell/utils/profile_keys.py +2834 -0
  43. sqlshell/utils/profile_ohe.py +934 -0
  44. sqlshell/utils/profile_ohe_advanced.py +754 -0
  45. sqlshell/utils/profile_ohe_comparison.py +237 -0
  46. sqlshell/utils/profile_prediction.py +926 -0
  47. sqlshell/utils/profile_similarity.py +876 -0
  48. sqlshell/utils/search_in_df.py +90 -0
  49. sqlshell/widgets.py +400 -0
  50. sqlshell-0.4.4.dist-info/METADATA +441 -0
  51. sqlshell-0.4.4.dist-info/RECORD +54 -0
  52. sqlshell-0.4.4.dist-info/WHEEL +5 -0
  53. sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
  54. sqlshell-0.4.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,779 @@
1
+ import sys
2
+ import itertools
3
+ import pandas as pd
4
+ from typing import List, Dict, Tuple, Set, Callable
5
+ from PyQt6.QtWidgets import (
6
+ QApplication, QWidget, QVBoxLayout, QLabel, QTableWidget, QTableWidgetItem, QHeaderView, QTabWidget, QMainWindow,
7
+ QPushButton, QHBoxLayout, QMessageBox
8
+ )
9
+ from PyQt6.QtCore import Qt
10
+ import re
11
+
12
+
13
def find_foreign_keys(dfs: List[pd.DataFrame], df_names: List[str], min_match_ratio: float = 0.95):
    """
    Discover potential foreign key relationships between DataFrames.

    A column is a primary-key candidate when its values are all unique, it
    contains no nulls and its name looks like an identifier. Every column with
    the same dtype in every *other* DataFrame is then scored as a possible
    reference to that key.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Names of the DataFrames (used for reporting), parallel to dfs
    - min_match_ratio: Threshold for the confidence score, i.e. the match ratio
      plus name-based boosts minus the cardinality penalty. Independently of
      this value the raw match ratio must be at least 0.9.

    Returns:
    - List of tuples (referenced_table, referenced_column, referencing_table,
      referencing_column, match_ratio), sorted by match_ratio descending
    """
    foreign_keys = []

    # Compiled once instead of on every helper call.
    id_patterns = [re.compile(pattern) for pattern in (
        r'.*_?id$',      # ends with 'id' or '_id'
        r'^id_?.*',      # starts with 'id' or 'id_'
        r'.*_?key$',     # ends with 'key' or '_key'
        r'^key_?.*',     # starts with 'key' or 'key_'
        r'.*_?code$',    # ends with 'code' or '_code'
        r'.*_?ref$',     # ends with 'ref' or '_ref'
        r'.*_?num$',     # ends with 'num' or '_num'
        r'.*_?number$',  # ends with 'number' or '_number'
    )]
    # Words that mark a column as a measure/attribute rather than a reference.
    non_fk_words = (
        'quantity', 'amount', 'price', 'cost',
        'total', 'sum', 'count', 'rate',
        'percent', 'ratio', 'score', 'weight',
        'length', 'width', 'height', 'size',
        'age', 'year', 'month', 'day',
        'time', 'date', 'timestamp',
        'name', 'title', 'description', 'text',
        'comment', 'note', 'email', 'phone',
        'address', 'city', 'state', 'country',
    )
    # A whole-name or underscore-separated key suffix ('country_id', 'date_key',
    # 'state_code') is an explicit reference marker and wins over the word list.
    explicit_key_suffix = re.compile(r'(?:^|_)(?:id|key|ref|code)$')
    key_suffix = re.compile(r'_?(id|key|ref|code|num|number)$')

    # Helper function to check if a column name suggests it's an ID/key column
    def is_likely_id_column(col_name):
        # str() keeps non-string labels (e.g. integer headers) from crashing.
        col_lower = str(col_name).lower()
        return any(pattern.match(col_lower) for pattern in id_patterns)

    # Helper function to check if a column name suggests it's NOT a foreign key
    def is_unlikely_fk_column(col_name):
        col_lower = str(col_name).lower()
        # Without this guard the substring test below rejects genuine key
        # columns such as 'country_id' or 'account_id' (contains 'count').
        if explicit_key_suffix.search(col_lower):
            return False
        return any(word in col_lower for word in non_fk_words)

    # Helper function to calculate column name similarity
    def column_name_similarity(col1, col2):
        col1_lower = str(col1).lower()
        col2_lower = str(col2).lower()

        # Exact match
        if col1_lower == col2_lower:
            return 1.0

        # Check if one is a substring of the other
        if col1_lower in col2_lower or col2_lower in col1_lower:
            return 0.8

        # Check for common FK patterns (e.g., "customer_id" matches "customer")
        col1_clean = key_suffix.sub('', col1_lower)
        col2_clean = key_suffix.sub('', col2_lower)

        # A name that is nothing but a key suffix ('id') cleans to '' and the
        # empty string is contained in every string, so both cleaned names must
        # be non-empty before they are compared.
        if col1_clean and col2_clean:
            if col1_clean == col2_clean:
                return 0.9

            # Check if cleaned versions have overlap
            if col1_clean in col2_clean or col2_clean in col1_clean:
                return 0.6

        return 0.0

    # First, identify potential primary keys in each DataFrame
    pk_candidates = {}
    for i, df in enumerate(dfs):
        name = df_names[i]
        for col in df.columns:
            # Cheap name test first; uniqueness is only computed for ID-like columns.
            if not is_likely_id_column(col):
                continue
            if df[col].nunique() == len(df) and not df[col].isna().any():
                pk_candidates.setdefault(name, []).append(col)

    # For each DataFrame pair, check for foreign key relationships
    for i, df1 in enumerate(dfs):
        name1 = df_names[i]

        # Skip if this DataFrame has no primary key candidates
        if name1 not in pk_candidates:
            continue

        # For each potential primary key column
        for pk_col in pk_candidates[name1]:
            pk_values = set(df1[pk_col])
            pk_dtype = df1[pk_col].dtype

            # Check every other DataFrame for matching columns
            for j, df2 in enumerate(dfs):
                name2 = df_names[j]

                # Skip self-references
                if i == j:
                    continue

                # Check each column in df2 for potential foreign key relationship
                for fk_col in df2.columns:
                    # Skip if data types are incompatible
                    if pk_dtype != df2[fk_col].dtype:
                        continue

                    # Skip columns that are unlikely to be foreign keys
                    if is_unlikely_fk_column(fk_col):
                        continue

                    # Get unique values in potential foreign key column
                    fk_values = set(df2[fk_col].dropna())

                    # Skip empty columns
                    if not fk_values:
                        continue

                    # Check cardinality - FK column should have fewer or equal unique values than PK
                    if len(fk_values) > len(pk_values):
                        continue

                    # Share of distinct FK values that exist in the PK column
                    match_ratio = len(fk_values & pk_values) / len(fk_values)

                    # Calculate a confidence score based on multiple factors
                    confidence_score = match_ratio

                    # Boost confidence for column name similarity
                    name_similarity = column_name_similarity(pk_col, fk_col)
                    if name_similarity > 0.5:
                        confidence_score += name_similarity * 0.3  # Up to 30% boost

                    # Boost confidence if FK column name suggests it's an ID
                    if is_likely_id_column(fk_col):
                        confidence_score += 0.1  # 10% boost

                    # Penalize if the FK column has too many unique values relative to total rows
                    # (df2 cannot be empty here because fk_values is non-empty)
                    fk_cardinality_ratio = len(fk_values) / len(df2)
                    if fk_cardinality_ratio > 0.5:  # More than 50% unique values
                        confidence_score -= 0.2  # 20% penalty

                    # Consider it a foreign key if confidence score exceeds threshold
                    # But also require minimum match ratio
                    if confidence_score >= min_match_ratio and match_ratio >= 0.9:
                        foreign_keys.append((name1, pk_col, name2, fk_col, match_ratio))

    # Sort by match ratio (descending); the sort is stable, so ties keep discovery order
    foreign_keys.sort(key=lambda x: x[4], reverse=True)
    return foreign_keys
167
+
168
+
169
def find_inclusion_dependencies(dfs: List[pd.DataFrame], df_names: List[str], min_match_ratio: float = 0.8):
    """
    Find inclusion dependencies (more general than foreign keys) between DataFrames.
    An inclusion dependency exists when values in one column are a subset of values in another column.

    Only columns whose name looks like an identifier are considered on the
    referenced side; the referencing column must have the same dtype and no
    more distinct values than the referenced one.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Names of the DataFrames, parallel to dfs
    - min_match_ratio: Threshold for the confidence score (match ratio plus
      name-based boosts). Independently of this value the raw match ratio must
      be at least 0.85.

    Returns:
    - List of tuples (referenced_table, referenced_column, referencing_table,
      referencing_column, match_ratio), sorted by match_ratio descending
    """
    dependencies = []

    # Compiled once instead of on every helper call.
    id_patterns = [re.compile(pattern) for pattern in (
        r'.*_?id$',      # ends with 'id' or '_id'
        r'^id_?.*',      # starts with 'id' or 'id_'
        r'.*_?key$',     # ends with 'key' or '_key'
        r'^key_?.*',     # starts with 'key' or 'key_'
        r'.*_?code$',    # ends with 'code' or '_code'
        r'.*_?ref$',     # ends with 'ref' or '_ref'
        r'.*_?num$',     # ends with 'num' or '_num'
        r'.*_?number$',  # ends with 'number' or '_number'
    )]
    # Words that mark a column as a measure/attribute rather than a reference.
    non_fk_words = (
        'quantity', 'amount', 'price', 'cost',
        'total', 'sum', 'count', 'rate',
        'percent', 'ratio', 'score', 'weight',
        'length', 'width', 'height', 'size',
        'age', 'year', 'month', 'day',
        'time', 'date', 'timestamp',
        'name', 'title', 'description', 'text',
        'comment', 'note', 'email', 'phone',
        'address', 'city', 'state', 'country',
    )
    # A whole-name or underscore-separated key suffix ('country_id', 'date_key',
    # 'state_code') is an explicit reference marker and wins over the word list.
    explicit_key_suffix = re.compile(r'(?:^|_)(?:id|key|ref|code)$')
    key_suffix = re.compile(r'_?(id|key|ref|code|num|number)$')

    # Helper function to check if a column name suggests it's an ID/key column
    def is_likely_id_column(col_name):
        # str() keeps non-string labels (e.g. integer headers) from crashing.
        col_lower = str(col_name).lower()
        return any(pattern.match(col_lower) for pattern in id_patterns)

    # Helper function to check if a column name suggests it's NOT a foreign key
    def is_unlikely_fk_column(col_name):
        col_lower = str(col_name).lower()
        # Without this guard the substring test below rejects genuine key
        # columns such as 'country_id' or 'account_id' (contains 'count').
        if explicit_key_suffix.search(col_lower):
            return False
        return any(word in col_lower for word in non_fk_words)

    # Helper function to calculate column name similarity
    def column_name_similarity(col1, col2):
        col1_lower = str(col1).lower()
        col2_lower = str(col2).lower()

        # Exact match
        if col1_lower == col2_lower:
            return 1.0

        # Check if one is a substring of the other
        if col1_lower in col2_lower or col2_lower in col1_lower:
            return 0.8

        # Check for common FK patterns (e.g., "customer_id" matches "customer")
        col1_clean = key_suffix.sub('', col1_lower)
        col2_clean = key_suffix.sub('', col2_lower)

        # A name that is nothing but a key suffix ('id') cleans to '' and the
        # empty string is contained in every string, so both cleaned names must
        # be non-empty before they are compared.
        if col1_clean and col2_clean:
            if col1_clean == col2_clean:
                return 0.9

            # Check if cleaned versions have overlap
            if col1_clean in col2_clean or col2_clean in col1_clean:
                return 0.6

        return 0.0

    # For each pair of DataFrames
    for i, df1 in enumerate(dfs):
        name1 = df_names[i]

        # Distinct values of df1's ID-like columns, built lazily and reused for
        # every other table instead of being recomputed per pair.
        referenced_values = {}

        for j, df2 in enumerate(dfs):
            name2 = df_names[j]

            # Skip self-comparison for the same index
            if i == j:
                continue

            # For each potential pair of columns
            for col1 in df1.columns:
                # Only ID-like columns qualify as the referenced side; test the
                # name before paying for the value set.
                if not is_likely_id_column(col1):
                    continue

                if col1 not in referenced_values:
                    referenced_values[col1] = set(df1[col1].dropna())
                values1 = referenced_values[col1]

                # Skip empty columns
                if not values1:
                    continue

                for col2 in df2.columns:
                    # Skip if data types are incompatible
                    if df1[col1].dtype != df2[col2].dtype:
                        continue

                    # Skip columns that are unlikely to be foreign keys
                    if is_unlikely_fk_column(col2):
                        continue

                    # Get unique values in the potential referencing column
                    values2 = set(df2[col2].dropna())

                    # Skip empty columns
                    if not values2:
                        continue

                    # Check cardinality - referencing column should have fewer or equal unique values
                    if len(values2) > len(values1):
                        continue

                    # Share of distinct referencing values present in the referenced column
                    match_ratio = len(values2 & values1) / len(values2)

                    # Calculate a confidence score based on multiple factors
                    confidence_score = match_ratio

                    # Boost confidence for column name similarity
                    name_similarity = column_name_similarity(col1, col2)
                    if name_similarity > 0.5:
                        confidence_score += name_similarity * 0.3  # Up to 30% boost

                    # Boost confidence if referencing column name suggests it's an ID
                    if is_likely_id_column(col2):
                        confidence_score += 0.1  # 10% boost

                    # Consider it an inclusion dependency if confidence score exceeds threshold
                    # But also require minimum match ratio
                    if confidence_score >= min_match_ratio and match_ratio >= 0.85:
                        dependencies.append((name1, col1, name2, col2, match_ratio))

    # Sort by match ratio (descending); the sort is stable, so ties keep discovery order
    dependencies.sort(key=lambda x: x[4], reverse=True)
    return dependencies
309
+
310
+
311
def profile_referential_integrity(dfs: List[pd.DataFrame], df_names: List[str], foreign_keys):
    """
    Profile the referential integrity of discovered foreign keys.

    Parameters:
    - dfs: List of pandas DataFrames
    - df_names: Names of the DataFrames, parallel to dfs
    - foreign_keys: List of foreign key relationships as 5-tuples
      (pk_table, pk_col, fk_table, fk_col, match_ratio); the last element is ignored

    Returns:
    - Dictionary keyed by (pk_table, pk_col, fk_table, fk_col). Each value holds:
      - 'violation_count': number of distinct FK values missing from the PK column
      - 'violating_rows': number of non-null FK rows carrying such a value
      - 'violation_ratio': violating_rows / total_fk_values (0 when there are no
        non-null FK rows)
      - 'total_fk_values': number of non-null FK rows
      - 'violations': up to 10 distinct violating values, in order of first appearance
    """
    integrity_results = {}

    # Create lookup for DataFrames by name
    df_dict = dict(zip(df_names, dfs))

    for pk_table, pk_col, fk_table, fk_col, _ in foreign_keys:
        pk_series = df_dict[pk_table][pk_col]

        # Nulls reference nothing, so they are neither counted nor violations
        fk_non_null = df_dict[fk_table][fk_col].dropna()

        # Rows whose value has no counterpart in the referenced column
        orphan_rows = fk_non_null[~fk_non_null.isin(pk_series)]
        distinct_orphans = orphan_rows.drop_duplicates()

        total_fk_values = len(fk_non_null)
        violating_rows = len(orphan_rows)

        # Rows over rows: the previous distinct-values-over-rows ratio mixed
        # units and under-reported columns that repeat a bad value.
        violation_ratio = violating_rows / total_fk_values if total_fk_values > 0 else 0

        # Record results
        key = (pk_table, pk_col, fk_table, fk_col)
        integrity_results[key] = {
            'violation_count': len(distinct_orphans),
            'violating_rows': violating_rows,
            'violation_ratio': violation_ratio,
            'total_fk_values': total_fk_values,
            # Only store first 10 violations for display; column order keeps the
            # examples deterministic (set iteration order is not)
            'violations': distinct_orphans.head(10).tolist()
        }

    return integrity_results
356
+
357
+
358
def profile_foreign_keys(dfs: List[pd.DataFrame], df_names: List[str] = None, min_match_ratio: float = 0.95):
    """
    Analyze a list of pandas DataFrames to discover foreign key relationships.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Optional list of names for the DataFrames. If None, names
      "Table_1", "Table_2", ... are generated.
    - min_match_ratio: Threshold passed to the foreign key search; the inclusion
      dependency search uses 80% of this value.

    Returns:
    - Tuple of (foreign_keys, inclusion_dependencies, integrity_results)

    Raises:
    - ValueError: if the number of names differs from the number of DataFrames
    """
    # Generate default names if not provided
    if df_names is None:
        df_names = [f"Table_{i+1}" for i in range(len(dfs))]

    # Explicit check rather than `assert`, which is removed under `python -O`
    if len(dfs) != len(df_names):
        raise ValueError("Number of DataFrames must match number of names")

    # Find foreign keys
    foreign_keys = find_foreign_keys(dfs, df_names, min_match_ratio)

    # Find more general inclusion dependencies (looser threshold)
    inclusion_dependencies = find_inclusion_dependencies(dfs, df_names, min_match_ratio * 0.8)

    # Profile referential integrity of the strict foreign keys only
    integrity_results = profile_referential_integrity(dfs, df_names, foreign_keys)

    return foreign_keys, inclusion_dependencies, integrity_results
387
+
388
+
389
def visualize_foreign_keys(dfs: List[pd.DataFrame], df_names: List[str] = None, min_match_ratio: float = 0.95,
                           on_generate_join: Callable = None, parent=None):
    """
    Build and show a window presenting the foreign key analysis of the given DataFrames.

    The window has three tabs: detected foreign keys, inclusion dependencies
    and referential integrity statistics. When a callback is supplied, every
    row gets a "Generate JOIN" link.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Optional list of names for the DataFrames. If None, names will be generated.
    - min_match_ratio: Minimum ratio of matching values to consider a foreign key
    - on_generate_join: Callback invoked with a JOIN query string when a
      "Generate JOIN" link is clicked. Without it no links are added.
    - parent: Parent widget for the QMainWindow. Typically the main application window.

    Returns:
    - QMainWindow: The visualization window (already shown)
    """
    # Fall back to Table_1, Table_2, ... when no names were given
    if df_names is None:
        df_names = [f"Table_{idx}" for idx in range(1, len(dfs) + 1)]

    # Run the analysis up front; the rest of the function only renders it
    foreign_keys, inclusion_dependencies, integrity_results = profile_foreign_keys(
        dfs, df_names, min_match_ratio
    )

    # Window shell
    window = QMainWindow(parent)
    window.setWindowTitle("Foreign Key Analysis")
    window.resize(900, 700)

    central_widget = QWidget()
    window.setCentralWidget(central_widget)
    layout = QVBoxLayout(central_widget)

    # Headline
    header = QLabel(f"Analyzed {len(dfs)} tables with potential foreign key relationships")
    header.setAlignment(Qt.AlignmentFlag.AlignCenter)
    header.setStyleSheet("font-size: 14pt; font-weight: bold; margin: 10px;")
    layout.addWidget(header)

    # Explanatory text
    description = QLabel(
        "This analysis helps identify potential foreign key relationships between tables. "
        "Foreign keys are columns in one table that reference the primary key of another table. "
        "The match ratio indicates how many values in the foreign key column exist in the referenced column."
    )
    description.setAlignment(Qt.AlignmentFlag.AlignCenter)
    description.setWordWrap(True)
    description.setStyleSheet("margin-bottom: 10px;")
    layout.addWidget(description)

    tabs = QTabWidget()

    action_column = 5
    relationship_columns = [
        "Referenced Table", "Referenced Column", "Referencing Table", "Referencing Column", "Match Ratio", "Action"
    ]

    def forward_join_query(query):
        """Hand the query to the caller's callback and confirm it to the user."""
        if on_generate_join:
            on_generate_join(query)
            QMessageBox.information(window, "JOIN Query Generated",
                                    f"The following query has been added to the editor:\n\n{query}")

    def bold_label(text):
        """Return a bold section heading."""
        label = QLabel(text)
        label.setStyleSheet("font-weight: bold;")
        return label

    def new_table(row_count, column_titles):
        """Create a six-column table with stretched columns and a 140px action column."""
        table = QTableWidget(row_count, 6)
        table.setHorizontalHeaderLabels(column_titles)
        table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
        table.horizontalHeader().setSectionResizeMode(action_column, QHeaderView.ResizeMode.Interactive)
        table.setColumnWidth(action_column, 140)
        return table

    def colored_item(text, color):
        """Return a table item whose text is drawn in the given color."""
        item = QTableWidgetItem(text)
        item.setForeground(color)
        return item

    def add_join_link(table, row, query):
        """Put a 'Generate JOIN' hyperlink for `query` into the action cell of `row`."""
        if on_generate_join is None:
            return

        cell = QWidget()
        cell_layout = QHBoxLayout(cell)
        cell_layout.setContentsMargins(0, 0, 0, 0)
        cell_layout.setSpacing(0)

        link_label = QLabel("<a href='#' style='color: #3498DB; font-weight: bold;'>Generate JOIN</a>")
        link_label.setTextFormat(Qt.TextFormat.RichText)
        link_label.setTextInteractionFlags(Qt.TextInteractionFlag.TextBrowserInteraction)
        link_label.setCursor(Qt.CursorShape.PointingHandCursor)
        link_label.setAlignment(Qt.AlignmentFlag.AlignCenter)

        # The query is bound as a default argument so each link keeps its own text
        link_label.linkActivated.connect(lambda _href, q=query: forward_join_query(q))

        cell_layout.addWidget(link_label)
        table.setCellWidget(row, action_column, cell)

    def add_relationship_tab(tab_title, heading, relationships, green_from, blue_from):
        """Add a tab listing (referenced, referencing, ratio) rows with color-coded ratios."""
        page = QWidget()
        page_layout = QVBoxLayout()
        page_layout.addWidget(bold_label(heading))

        table = new_table(len(relationships), relationship_columns)
        for row, (ref_table, ref_col, src_table, src_col, ratio) in enumerate(relationships):
            for column, text in enumerate((ref_table, ref_col, src_table, src_col)):
                table.setItem(row, column, QTableWidgetItem(text))

            # Green / blue / yellow depending on how complete the match is
            if ratio >= green_from:
                color = Qt.GlobalColor.darkGreen
            elif ratio >= blue_from:
                color = Qt.GlobalColor.darkBlue
            else:
                color = Qt.GlobalColor.darkYellow
            table.setItem(row, 4, colored_item(f"{ratio:.2%}", color))

            add_join_link(
                table, row,
                f"SELECT * FROM {src_table} JOIN {ref_table} ON {src_table}.{src_col} = {ref_table}.{ref_col}"
            )

        page_layout.addWidget(table)
        page.setLayout(page_layout)
        tabs.addTab(page, tab_title)

    # Tab 1: strict foreign keys
    add_relationship_tab(
        "Foreign Keys", "Potential Foreign Key Relationships", foreign_keys, 0.99, 0.9
    )

    # Tab 2: looser inclusion dependencies
    add_relationship_tab(
        "Inclusion Dependencies",
        "Inclusion Dependencies (Values in one column are a subset of another)",
        inclusion_dependencies, 0.95, 0.8
    )

    # Tab 3: referential integrity statistics
    ri_tab = QWidget()
    ri_layout = QVBoxLayout()
    ri_layout.addWidget(bold_label("Referential Integrity Analysis"))

    ri_description = QLabel(
        "This table shows referential integrity violations for discovered foreign keys. "
        "A violation occurs when a value in the foreign key column doesn't exist in the referenced column."
    )
    ri_description.setWordWrap(True)
    ri_layout.addWidget(ri_description)

    ri_table = new_table(len(integrity_results), [
        "Relationship", "Violations", "Total FK Values", "Violation %", "Example Violations", "Action"
    ])

    for row, (key, stats) in enumerate(integrity_results.items()):
        ref_table, ref_col, src_table, src_col = key

        ri_table.setItem(row, 0, QTableWidgetItem(f"{src_table}.{src_col} → {ref_table}.{ref_col}"))
        ri_table.setItem(row, 1, QTableWidgetItem(str(stats['violation_count'])))
        ri_table.setItem(row, 2, QTableWidgetItem(str(stats['total_fk_values'])))

        # Green when clean, blue below 1%, red otherwise
        share = stats['violation_ratio']
        if share == 0:
            color = Qt.GlobalColor.darkGreen
        elif share < 0.01:
            color = Qt.GlobalColor.darkBlue
        else:
            color = Qt.GlobalColor.darkRed
        ri_table.setItem(row, 3, colored_item(f"{share:.2%}", color))

        # Stored examples, plus a note when more violations exist than were stored
        examples = ', '.join(map(str, stats['violations']))
        if stats['violation_count'] > len(stats['violations']):
            examples += f" (and {stats['violation_count'] - len(stats['violations'])} more)"
        ri_table.setItem(row, 4, QTableWidgetItem(examples))

        # LEFT JOIN so that rows without a matching parent stay visible
        add_join_link(
            ri_table, row,
            f"SELECT * FROM {src_table} LEFT JOIN {ref_table} ON {src_table}.{src_col} = {ref_table}.{ref_col}"
        )

    ri_layout.addWidget(ri_table)
    ri_tab.setLayout(ri_layout)
    tabs.addTab(ri_tab, "Referential Integrity")

    layout.addWidget(tabs)

    window.show()
    return window
652
+
653
+
654
def test_profile_foreign_keys():
    """
    GUI demo: build four related sample tables, inject a few broken
    references and open the foreign key visualization window.

    Starts a QApplication and exits the process when its event loop ends.
    """
    import random

    # Customers
    customer_ids = list(range(1, 21))
    customers_df = pd.DataFrame({
        "customer_id": customer_ids,
        "customer_name": [f"Customer {i}" for i in customer_ids],
        "city": [f"City {i % 5}" for i in customer_ids],
    })

    # Products
    product_ids = list(range(101, 111))
    products_df = pd.DataFrame({
        "product_id": product_ids,
        "product_name": [f"Product {i}" for i in product_ids],
        "category": [f"Category {i % 3}" for i in product_ids],
    })

    # Fixed seed so the demo data is reproducible; the random draws below are
    # made in the same order as the columns are listed.
    random.seed(42)

    # Orders reference customers
    order_ids = list(range(1001, 1101))
    order_customers = [random.randint(1, 20) for _ in range(100)]
    order_dates = [pd.Timestamp("2021-01-01") + pd.Timedelta(days=offset) for offset in range(100)]
    orders_df = pd.DataFrame({
        "order_id": order_ids,
        "customer_id": order_customers,
        "order_date": order_dates,
    })

    # Order details reference orders and products
    detail_orders = [random.choice(order_ids) for _ in range(200)]
    detail_products = [random.choice(product_ids) for _ in range(200)]
    detail_quantities = [random.randint(1, 10) for _ in range(200)]
    order_details_df = pd.DataFrame({
        "order_detail_id": list(range(10001, 10201)),
        "order_id": detail_orders,
        "product_id": detail_products,
        "quantity": detail_quantities,
    })

    # Break referential integrity: the last five orders point at customers that do not exist
    orders_df.loc[95:99, "customer_id"] = [25, 26, 27, 28, 29]

    def handle_join_query(query):
        # A real application would insert the query into its editor instead
        print(f"Generated JOIN query: {query}")

    app = QApplication(sys.argv)
    # Keep a reference to the window while the event loop runs
    window = visualize_foreign_keys(
        [customers_df, products_df, orders_df, order_details_df],
        ["Customers", "Products", "Orders", "OrderDetails"],
        min_match_ratio=0.9,
        on_generate_join=handle_join_query,
    )
    sys.exit(app.exec())
712
+
713
+
714
def test_profile_foreign_keys_console():
    """
    Console demo: run the foreign key analysis on four related sample tables
    and print the detected foreign keys and inclusion dependencies.
    """
    import random

    # Customers
    customer_ids = list(range(1, 21))
    customers_df = pd.DataFrame({
        "customer_id": customer_ids,
        "customer_name": [f"Customer {i}" for i in customer_ids],
        "city": [f"City {i % 5}" for i in customer_ids],
    })

    # Products
    product_ids = list(range(101, 111))
    products_df = pd.DataFrame({
        "product_id": product_ids,
        "product_name": [f"Product {i}" for i in product_ids],
        "category": [f"Category {i % 3}" for i in product_ids],
    })

    # Fixed seed so the sample data is reproducible; the random draws below are
    # made in the same order as the columns are listed.
    random.seed(42)

    # Orders reference customers
    order_ids = list(range(1001, 1101))
    order_customers = [random.randint(1, 20) for _ in range(100)]
    order_dates = [pd.Timestamp("2021-01-01") + pd.Timedelta(days=offset) for offset in range(100)]
    orders_df = pd.DataFrame({
        "order_id": order_ids,
        "customer_id": order_customers,
        "order_date": order_dates,
    })

    # Order details reference orders and products
    detail_orders = [random.choice(order_ids) for _ in range(200)]
    detail_products = [random.choice(product_ids) for _ in range(200)]
    detail_quantities = [random.randint(1, 10) for _ in range(200)]
    order_details_df = pd.DataFrame({
        "order_detail_id": list(range(10001, 10201)),
        "order_id": detail_orders,
        "product_id": detail_products,
        "quantity": detail_quantities,
    })

    # Run the analysis
    foreign_keys, inclusion_dependencies, integrity_results = profile_foreign_keys(
        [customers_df, products_df, orders_df, order_details_df],
        ["Customers", "Products", "Orders", "OrderDetails"],
        min_match_ratio=0.9,
    )

    print("=== IMPROVED FOREIGN KEY DETECTION RESULTS ===")
    print(f"\nFound {len(foreign_keys)} potential foreign key relationships:")

    position = 1
    for pk_table, pk_col, fk_table, fk_col, match_ratio in foreign_keys:
        print(f"{position}. {fk_table}.{fk_col} → {pk_table}.{pk_col} (Match: {match_ratio:.2%})")
        position += 1

    print(f"\nFound {len(inclusion_dependencies)} inclusion dependencies:")

    # Only the first ten dependencies are listed; the remainder is summarized
    shown = inclusion_dependencies[:10]
    position = 1
    for table1, col1, table2, col2, match_ratio in shown:
        print(f"{position}. {table2}.{col2} ⊆ {table1}.{col1} (Match: {match_ratio:.2%})")
        position += 1

    hidden = len(inclusion_dependencies) - 10
    if hidden > 0:
        print(f"... and {len(inclusion_dependencies) - 10} more")
776
+
777
# Script entry point: launch the GUI demo only when this module is executed
# directly, never when it is imported. test_profile_foreign_keys() ends by
# calling sys.exit(), so nothing after this call would run.
if __name__ == "__main__":
    test_profile_foreign_keys()