sqlshell 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,779 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import itertools
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from typing import List, Dict, Tuple, Set, Callable
|
|
5
|
+
from PyQt6.QtWidgets import (
|
|
6
|
+
QApplication, QWidget, QVBoxLayout, QLabel, QTableWidget, QTableWidgetItem, QHeaderView, QTabWidget, QMainWindow,
|
|
7
|
+
QPushButton, QHBoxLayout, QMessageBox
|
|
8
|
+
)
|
|
9
|
+
from PyQt6.QtCore import Qt
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Column-name heuristics shared by find_foreign_keys and find_inclusion_dependencies.
# Each pattern is applied with re.match against the lower-cased column name.
_ID_COLUMN_RE = re.compile('|'.join([
    r'.*_?id$',      # ends with 'id' or '_id'
    r'^id_?.*',      # starts with 'id' or 'id_'
    r'.*_?key$',     # ends with 'key' or '_key'
    r'^key_?.*',     # starts with 'key' or 'key_'
    r'.*_?code$',    # ends with 'code' or '_code'
    r'.*_?ref$',     # ends with 'ref' or '_ref'
    r'.*_?num$',     # ends with 'num' or '_num'
    r'.*_?number$',  # ends with 'number' or '_number'
]))

# Names containing any of these words are treated as measures/attributes, not keys.
_NON_FK_COLUMN_RE = re.compile('|'.join([
    r'.*quantity.*', r'.*amount.*', r'.*price.*', r'.*cost.*',
    r'.*total.*', r'.*sum.*', r'.*count.*', r'.*rate.*',
    r'.*percent.*', r'.*ratio.*', r'.*score.*', r'.*weight.*',
    r'.*length.*', r'.*width.*', r'.*height.*', r'.*size.*',
    r'.*age.*', r'.*year.*', r'.*month.*', r'.*day.*',
    r'.*time.*', r'.*date.*', r'.*timestamp.*',
    r'.*name.*', r'.*title.*', r'.*description.*', r'.*text.*',
    r'.*comment.*', r'.*note.*', r'.*email.*', r'.*phone.*',
    r'.*address.*', r'.*city.*', r'.*state.*', r'.*country.*',
]))

# Trailing key-like suffix stripped before comparing name stems.
_KEY_SUFFIX_RE = re.compile(r'_?(id|key|ref|code|num|number)$')


def _is_likely_id_column(col_name) -> bool:
    """Return True if the column name looks like an ID/key column."""
    # str() so that non-string column labels (e.g. integers) do not crash.
    return _ID_COLUMN_RE.match(str(col_name).lower()) is not None


def _is_unlikely_fk_column(col_name) -> bool:
    """Return True if the column name suggests a measure/attribute rather than a key."""
    return _NON_FK_COLUMN_RE.match(str(col_name).lower()) is not None


def _column_name_similarity(col1, col2) -> float:
    """Score how related two column names are: 1.0, 0.9, 0.8, 0.6 or 0.0."""
    first = str(col1).lower()
    second = str(col2).lower()

    # Exact match
    if first == second:
        return 1.0

    # One name is a substring of the other
    if first in second or second in first:
        return 0.8

    # Compare the stems left after removing a key-like suffix
    # (e.g. "customer_id" and "customer_key" both reduce to "customer").
    first_stem = _KEY_SUFFIX_RE.sub('', first)
    second_stem = _KEY_SUFFIX_RE.sub('', second)

    # An empty stem (the name was just "id", "key", ...) carries no information.
    # Without this guard '' would count as a substring of every other stem.
    if not first_stem or not second_stem:
        return 0.0

    if first_stem == second_stem:
        return 0.9

    if first_stem in second_stem or second_stem in first_stem:
        return 0.6

    return 0.0


def _apply_name_boosts(score: float, referenced_col, referencing_col) -> float:
    """Add the column-name based confidence boosts to score and return it."""
    name_similarity = _column_name_similarity(referenced_col, referencing_col)
    if name_similarity > 0.5:
        score += name_similarity * 0.3  # Up to 30% boost
    if _is_likely_id_column(referencing_col):
        score += 0.1  # 10% boost
    return score


def _distinct_non_null(series):
    """Return the set of non-null values of series, or None if they are unhashable."""
    try:
        return set(series.dropna())
    except TypeError:
        # e.g. object columns holding lists/dicts cannot be put in a set
        return None


def find_foreign_keys(dfs: List[pd.DataFrame], df_names: List[str], min_match_ratio: float = 0.95):
    """
    Discover potential foreign key relationships between DataFrames.

    A column is a primary-key candidate when its name looks like an ID column,
    it has no nulls and all of its values are distinct. Every same-dtype column
    of every other DataFrame is then scored against each candidate.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Names of the DataFrames (used for reporting)
    - min_match_ratio: Minimum confidence score (match ratio adjusted by name
      boosts and a cardinality penalty) to consider a foreign key. A raw match
      ratio of at least 0.9 is always required as well.

    Returns:
    - List of tuples (referenced_table, referenced_column, referencing_table, referencing_column, match_ratio),
      sorted by match ratio and then confidence, both descending
    """
    # First, identify potential primary keys in each DataFrame (keyed by position
    # so that duplicate table names cannot mix up their columns).
    pk_candidates = {}
    for i, df in enumerate(dfs):
        row_count = len(df)
        for col in df.columns:
            if not _is_likely_id_column(col):
                continue
            column = df[col]
            try:
                is_key = column.nunique() == row_count and not column.isna().any()
            except TypeError:
                # Unhashable values cannot be a key
                continue
            if is_key:
                pk_candidates.setdefault(i, []).append(col)

    scored = []  # (confidence, result tuple)

    for i, df1 in enumerate(dfs):
        # Skip if this DataFrame has no primary key candidates
        if i not in pk_candidates:
            continue
        name1 = df_names[i]

        for pk_col in pk_candidates[i]:
            pk_series = df1[pk_col]
            pk_values = set(pk_series)

            # Check every other DataFrame for matching columns
            for j, df2 in enumerate(dfs):
                # Skip self-references
                if i == j:
                    continue
                name2 = df_names[j]

                for fk_col in df2.columns:
                    fk_series = df2[fk_col]

                    # Skip if data types are incompatible
                    if pk_series.dtype != fk_series.dtype:
                        continue

                    # Skip columns that are unlikely to be foreign keys
                    if _is_unlikely_fk_column(fk_col):
                        continue

                    # Skip empty (or unhashable) columns
                    fk_values = _distinct_non_null(fk_series)
                    if not fk_values:
                        continue

                    # FK column should have fewer or equal unique values than PK
                    if len(fk_values) > len(pk_values):
                        continue

                    # Share of distinct FK values that exist in the PK column
                    match_ratio = len(fk_values & pk_values) / len(fk_values)

                    confidence_score = _apply_name_boosts(match_ratio, pk_col, fk_col)

                    # Penalize if the FK column is mostly unique relative to its row count
                    if len(fk_values) / len(df2) > 0.5:
                        confidence_score -= 0.2  # 20% penalty

                    if confidence_score >= min_match_ratio and match_ratio >= 0.9:
                        scored.append(
                            (confidence_score, (name1, pk_col, name2, fk_col, match_ratio))
                        )

    # Sort by match ratio (descending), then by confidence
    scored.sort(key=lambda entry: (entry[1][4], entry[0]), reverse=True)
    return [relationship for _, relationship in scored]


def find_inclusion_dependencies(dfs: List[pd.DataFrame], df_names: List[str], min_match_ratio: float = 0.8):
    """
    Find inclusion dependencies (more general than foreign keys) between DataFrames.
    An inclusion dependency exists when values in one column are a subset of values in another column.

    Only columns whose name looks like an ID column are considered on the
    referenced side; uniqueness is not required.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Names of the DataFrames
    - min_match_ratio: Minimum confidence score (match ratio plus name boosts).
      A raw match ratio of at least 0.85 is always required as well.

    Returns:
    - List of tuples (referenced_table, referenced_column, referencing_table, referencing_column, match_ratio),
      sorted by match ratio descending
    """
    dependencies = []

    for i, df1 in enumerate(dfs):
        name1 = df_names[i]

        # Value sets of the referenced side do not depend on the other table,
        # so build them once per DataFrame instead of once per pair.
        referenced = []
        for col1 in df1.columns:
            if not _is_likely_id_column(col1):
                continue
            values1 = _distinct_non_null(df1[col1])
            if values1:
                referenced.append((col1, df1[col1].dtype, values1))

        if not referenced:
            continue

        for j, df2 in enumerate(dfs):
            # Skip self-comparison for the same index
            if i == j:
                continue
            name2 = df_names[j]

            for col1, dtype1, values1 in referenced:
                for col2 in df2.columns:
                    # Skip if data types are incompatible
                    if dtype1 != df2[col2].dtype:
                        continue

                    # Skip columns that are unlikely to be foreign keys
                    if _is_unlikely_fk_column(col2):
                        continue

                    # Skip empty (or unhashable) columns
                    values2 = _distinct_non_null(df2[col2])
                    if not values2:
                        continue

                    # Referencing column should have fewer or equal unique values
                    if len(values2) > len(values1):
                        continue

                    # Share of distinct referencing values present on the referenced side
                    match_ratio = len(values2 & values1) / len(values2)

                    confidence_score = _apply_name_boosts(match_ratio, col1, col2)

                    if confidence_score >= min_match_ratio and match_ratio >= 0.85:
                        dependencies.append((name1, col1, name2, col2, match_ratio))

    # Sort by match ratio (descending)
    dependencies.sort(key=lambda dep: dep[4], reverse=True)
    return dependencies
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def profile_referential_integrity(dfs: List[pd.DataFrame], df_names: List[str], foreign_keys):
    """
    Profile the referential integrity of discovered foreign keys.

    Parameters:
    - dfs: List of pandas DataFrames
    - df_names: Names of the DataFrames
    - foreign_keys: List of foreign key relationships as tuples
      (referenced_table, referenced_column, referencing_table, referencing_column, match_ratio)

    Returns:
    - Dictionary keyed by (referenced_table, referenced_column, referencing_table, referencing_column).
      Each value is a dictionary with:
      - 'violation_count': number of distinct FK values missing from the referenced column
      - 'violating_row_count': number of non-null FK rows holding such a value
      - 'total_fk_values': number of non-null FK rows
      - 'violation_ratio': violating_row_count / total_fk_values (0 when there are no FK rows)
      - 'violations': up to 10 example violating values
    """
    integrity_results = {}

    # Create lookup for DataFrames by name
    df_dict = dict(zip(df_names, dfs))

    for pk_table, pk_col, fk_table, fk_col, _ in foreign_keys:
        pk_values = set(df_dict[pk_table][pk_col])

        # Nulls are not references, so they can never be violations
        fk_series = df_dict[fk_table][fk_col].dropna()
        total_fk_values = len(fk_series)

        # Distinct values that violate referential integrity
        violations = set(fk_series) - pk_values

        # The ratio is measured in rows on both sides; dividing the number of
        # distinct violating values by the row count would understate repeated violations.
        if violations:
            violating_row_count = sum(1 for value in fk_series if value in violations)
        else:
            violating_row_count = 0
        violation_ratio = violating_row_count / total_fk_values if total_fk_values > 0 else 0

        # Sort so the displayed examples are stable between runs
        try:
            examples = sorted(violations)
        except TypeError:
            # Mixed, mutually unorderable value types
            examples = sorted(violations, key=str)

        integrity_results[(pk_table, pk_col, fk_table, fk_col)] = {
            'violation_count': len(violations),
            'violating_row_count': violating_row_count,
            'violation_ratio': violation_ratio,
            'total_fk_values': total_fk_values,
            'violations': examples[:10]  # Only store first 10 violations for display
        }

    return integrity_results
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def profile_foreign_keys(dfs: List[pd.DataFrame], df_names: List[str] = None, min_match_ratio: float = 0.95):
    """
    Analyze a list of pandas DataFrames to discover foreign key relationships.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Optional list of names for the DataFrames. If None, names
      "Table_1", "Table_2", ... are generated.
    - min_match_ratio: Threshold passed to find_foreign_keys; the inclusion
      dependency search uses 80% of this value.

    Returns:
    - Tuple of (foreign_keys, inclusion_dependencies, integrity_results)

    Raises:
    - ValueError: if the number of names differs from the number of DataFrames
    """
    # Generate default names if not provided
    if df_names is None:
        df_names = [f"Table_{i+1}" for i in range(len(dfs))]

    # Validate explicitly: an assert would be stripped when Python runs with -O
    if len(dfs) != len(df_names):
        raise ValueError("Number of DataFrames must match number of names")

    foreign_keys = find_foreign_keys(dfs, df_names, min_match_ratio)

    # Inclusion dependencies are a looser relationship, so relax the threshold
    inclusion_dependencies = find_inclusion_dependencies(dfs, df_names, min_match_ratio * 0.8)

    # Integrity is only profiled for the stricter foreign key results
    integrity_results = profile_referential_integrity(dfs, df_names, foreign_keys)

    return foreign_keys, inclusion_dependencies, integrity_results
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def visualize_foreign_keys(dfs: List[pd.DataFrame], df_names: List[str] = None, min_match_ratio: float = 0.95,
                           on_generate_join: Callable = None, parent=None):
    """
    Create a visual representation of foreign key relationships between DataFrames.

    Builds a window with three tabs (foreign keys, inclusion dependencies and
    referential integrity) from the results of profile_foreign_keys, shows it
    and returns it.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Optional list of names for the DataFrames. If None, names will be generated.
    - min_match_ratio: Minimum ratio of matching values to consider a foreign key
    - on_generate_join: Callback function that will be called when a Generate JOIN link is clicked.
      It receives a JOIN query string as its argument. When None, no links are added.
    - parent: Parent widget for the QMainWindow. Typically the main application window.

    Returns:
    - QMainWindow: The visualization window
    """
    # Generate default names if not provided
    if df_names is None:
        df_names = [f"Table_{i+1}" for i in range(len(dfs))]

    # Get profile results
    foreign_keys, inclusion_dependencies, integrity_results = profile_foreign_keys(
        dfs, df_names, min_match_ratio
    )

    # Create main window
    window = QMainWindow(parent)
    window.setWindowTitle("Foreign Key Analysis")
    window.resize(900, 700)

    # Create central widget and layout
    central_widget = QWidget()
    window.setCentralWidget(central_widget)
    layout = QVBoxLayout(central_widget)

    # Add header
    header = QLabel(f"Analyzed {len(dfs)} tables with potential foreign key relationships")
    header.setAlignment(Qt.AlignmentFlag.AlignCenter)
    header.setStyleSheet("font-size: 14pt; font-weight: bold; margin: 10px;")
    layout.addWidget(header)

    # Add description
    description = QLabel(
        "This analysis helps identify potential foreign key relationships between tables. "
        "Foreign keys are columns in one table that reference the primary key of another table. "
        "The match ratio indicates how many values in the foreign key column exist in the referenced column."
    )
    description.setAlignment(Qt.AlignmentFlag.AlignCenter)
    description.setWordWrap(True)
    description.setStyleSheet("margin-bottom: 10px;")
    layout.addWidget(description)

    tabs = QTabWidget()
    action_column = 5  # Last column of every table holds the Generate JOIN link

    def handle_join_query(query):
        """Hand the query to the callback and confirm it to the user."""
        if on_generate_join:
            on_generate_join(query)
            QMessageBox.information(window, "JOIN Query Generated",
                                    f"The following query has been added to the editor:\n\n{query}")

    def make_table(row_count, headers):
        """Create a stretched table whose action column has a 140px initial width."""
        table = QTableWidget(row_count, len(headers))
        table.setHorizontalHeaderLabels(headers)
        header_view = table.horizontalHeader()
        header_view.setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
        header_view.setSectionResizeMode(action_column, QHeaderView.ResizeMode.Interactive)
        table.setColumnWidth(action_column, 140)
        return table

    def add_join_link(table, row, join_query):
        """Put a clickable "Generate JOIN" link for join_query into the action column."""
        if on_generate_join is None:
            return
        cell = QWidget()
        cell_layout = QHBoxLayout(cell)
        cell_layout.setContentsMargins(0, 0, 0, 0)  # Minimal margins
        cell_layout.setSpacing(0)  # No spacing

        join_link = QLabel("<a href='#' style='color: #3498DB; font-weight: bold;'>Generate JOIN</a>")
        join_link.setTextFormat(Qt.TextFormat.RichText)
        join_link.setTextInteractionFlags(Qt.TextInteractionFlag.TextBrowserInteraction)
        join_link.setCursor(Qt.CursorShape.PointingHandCursor)
        join_link.setAlignment(Qt.AlignmentFlag.AlignCenter)  # Center the text
        # Bind the query as a default argument so each link keeps its own query
        join_link.linkActivated.connect(lambda link, q=join_query: handle_join_query(q))

        cell_layout.addWidget(join_link)
        table.setCellWidget(row, action_column, cell)

    def fill_relationship_table(table, relationships, green_at, blue_at):
        """Fill one row per (referenced, referencing) column pair with a colored match ratio."""
        for row, (ref_table, ref_col, src_table, src_col, match_ratio) in enumerate(relationships):
            # str(): passing an int to QTableWidgetItem would select its item-type
            # overload and leave the cell empty for non-string column labels
            for column, text in enumerate((ref_table, ref_col, src_table, src_col)):
                table.setItem(row, column, QTableWidgetItem(str(text)))

            # Format match ratio with color coding
            ratio_item = QTableWidgetItem(f"{match_ratio:.2%}")
            if match_ratio >= green_at:
                ratio_item.setForeground(Qt.GlobalColor.darkGreen)
            elif match_ratio >= blue_at:
                ratio_item.setForeground(Qt.GlobalColor.darkBlue)
            else:
                ratio_item.setForeground(Qt.GlobalColor.darkYellow)
            table.setItem(row, 4, ratio_item)

            add_join_link(
                table, row,
                f"SELECT * FROM {src_table} JOIN {ref_table} ON {src_table}.{src_col} = {ref_table}.{ref_col}"
            )

    def add_tab(label, heading_text, table, note=None):
        """Wrap table in a tab with a bold heading and an optional explanatory note."""
        tab = QWidget()
        tab_layout = QVBoxLayout()
        heading = QLabel(heading_text)
        heading.setStyleSheet("font-weight: bold;")
        tab_layout.addWidget(heading)
        if note is not None:
            note_label = QLabel(note)
            note_label.setWordWrap(True)
            tab_layout.addWidget(note_label)
        tab_layout.addWidget(table)
        tab.setLayout(tab_layout)
        tabs.addTab(tab, label)

    relationship_headers = [
        "Referenced Table", "Referenced Column", "Referencing Table", "Referencing Column", "Match Ratio", "Action"
    ]

    # Tab for Foreign Keys
    fk_view = make_table(len(foreign_keys), relationship_headers)
    fill_relationship_table(fk_view, foreign_keys, 0.99, 0.9)
    add_tab("Foreign Keys", "Potential Foreign Key Relationships", fk_view)

    # Tab for Inclusion Dependencies
    id_view = make_table(len(inclusion_dependencies), relationship_headers)
    fill_relationship_table(id_view, inclusion_dependencies, 0.95, 0.8)
    add_tab("Inclusion Dependencies",
            "Inclusion Dependencies (Values in one column are a subset of another)", id_view)

    # Tab for Referential Integrity
    ri_view = make_table(len(integrity_results), [
        "Relationship", "Violations", "Total FK Values", "Violation %", "Example Violations", "Action"
    ])

    for row, (key, stats) in enumerate(integrity_results.items()):
        ref_table, ref_col, src_table, src_col = key
        relationship = f"{src_table}.{src_col} → {ref_table}.{ref_col}"

        ri_view.setItem(row, 0, QTableWidgetItem(relationship))
        ri_view.setItem(row, 1, QTableWidgetItem(str(stats['violation_count'])))
        ri_view.setItem(row, 2, QTableWidgetItem(str(stats['total_fk_values'])))

        # Format violation ratio with color coding
        ratio_item = QTableWidgetItem(f"{stats['violation_ratio']:.2%}")
        if stats['violation_ratio'] == 0:
            ratio_item.setForeground(Qt.GlobalColor.darkGreen)
        elif stats['violation_ratio'] < 0.01:
            ratio_item.setForeground(Qt.GlobalColor.darkBlue)
        else:
            ratio_item.setForeground(Qt.GlobalColor.darkRed)
        ri_view.setItem(row, 3, ratio_item)

        # Show example violations, noting how many were left out
        examples = ', '.join([str(v) for v in stats['violations']])
        omitted = stats['violation_count'] - len(stats['violations'])
        if omitted > 0:
            examples += f" (and {omitted} more)"
        ri_view.setItem(row, 4, QTableWidgetItem(examples))

        # LEFT JOIN keeps the violating rows visible (they have no match on the referenced side)
        add_join_link(
            ri_view, row,
            f"SELECT * FROM {src_table} LEFT JOIN {ref_table} ON {src_table}.{src_col} = {ref_table}.{ref_col}"
        )

    add_tab(
        "Referential Integrity", "Referential Integrity Analysis", ri_view,
        note=(
            "This table shows referential integrity violations for discovered foreign keys. "
            "A violation occurs when a value in the foreign key column doesn't exist in the referenced column."
        ),
    )

    layout.addWidget(tabs)

    # Show the window
    window.show()
    return window
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def test_profile_foreign_keys():
    """
    Launch the interactive foreign key demo on a small synthetic dataset.

    Four tables are built (Customers, Products, Orders, OrderDetails), five
    Orders rows are overwritten with customer ids that do not exist in
    Customers, and the tables are handed to ``visualize_foreign_keys``.

    The call enters the Qt event loop and finishes through ``sys.exit``,
    so it does not return normally.
    """
    import random

    # Parent tables: customers 1..20 and products 101..110.
    customers_df = pd.DataFrame({
        "customer_id": list(range(1, 21)),
        "customer_name": [f"Customer {n}" for n in range(1, 21)],
        "city": [f"City {n % 5}" for n in range(1, 21)],
    })

    product_ids = list(range(101, 111))
    products_df = pd.DataFrame({
        "product_id": product_ids,
        "product_name": [f"Product {n}" for n in product_ids],
        "category": [f"Category {n % 3}" for n in product_ids],
    })

    # Fixed seed: the random draws below must happen in exactly this order
    # for the generated rows to be reproducible.
    random.seed(42)

    # Orders reference Customers through customer_id.
    order_ids = list(range(1001, 1101))
    order_customer_ids = [random.randint(1, 20) for _ in range(100)]
    first_day = pd.Timestamp("2021-01-01")
    orders_df = pd.DataFrame({
        "order_id": order_ids,
        "customer_id": order_customer_ids,
        "order_date": [first_day + pd.Timedelta(days=offset) for offset in range(100)],
    })

    # OrderDetails reference Orders (order_id) and Products (product_id).
    detail_order_ids = [random.choice(order_ids) for _ in range(200)]
    detail_product_ids = [random.choice(product_ids) for _ in range(200)]
    detail_quantities = [random.randint(1, 10) for _ in range(200)]
    order_details_df = pd.DataFrame({
        "order_detail_id": list(range(10001, 10201)),
        "order_id": detail_order_ids,
        "product_id": detail_product_ids,
        "quantity": detail_quantities,
    })

    # Inject referential integrity violations: the last five orders
    # (labels 95..99, .loc slices include both ends) point at customer ids
    # 25..29, which are outside the 1..20 range used by Customers.
    orders_df.loc[95:99, "customer_id"] = list(range(25, 30))

    def handle_join_query(query):
        # Demo callback; a real application would insert the query into
        # the query editor instead of printing it.
        print(f"Generated JOIN query: {query}")

    tables = {
        "Customers": customers_df,
        "Products": products_df,
        "Orders": orders_df,
        "OrderDetails": order_details_df,
    }

    app = QApplication(sys.argv)
    # Hold the returned window in a local while the event loop runs.
    window = visualize_foreign_keys(
        list(tables.values()),
        list(tables),
        min_match_ratio=0.9,
        on_generate_join=handle_join_query,
    )
    sys.exit(app.exec())
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
def test_profile_foreign_keys_console():
    """
    Run foreign key detection on a synthetic dataset and print the findings.

    Builds Customers, Products, Orders and OrderDetails tables, calls
    ``profile_foreign_keys`` with ``min_match_ratio=0.9`` and writes the
    detected foreign keys plus at most the first ten inclusion dependencies
    to stdout. Nothing is returned.
    """
    import random

    # Parent tables: customers 1..20 and products 101..110.
    customers_df = pd.DataFrame({
        "customer_id": list(range(1, 21)),
        "customer_name": [f"Customer {n}" for n in range(1, 21)],
        "city": [f"City {n % 5}" for n in range(1, 21)],
    })

    product_ids = list(range(101, 111))
    products_df = pd.DataFrame({
        "product_id": product_ids,
        "product_name": [f"Product {n}" for n in product_ids],
        "category": [f"Category {n % 3}" for n in product_ids],
    })

    # Fixed seed: the random draws below must happen in exactly this order
    # for the generated rows to be reproducible.
    random.seed(42)

    # Orders reference Customers through customer_id.
    order_ids = list(range(1001, 1101))
    order_customer_ids = [random.randint(1, 20) for _ in range(100)]
    first_day = pd.Timestamp("2021-01-01")
    orders_df = pd.DataFrame({
        "order_id": order_ids,
        "customer_id": order_customer_ids,
        "order_date": [first_day + pd.Timedelta(days=offset) for offset in range(100)],
    })

    # OrderDetails reference Orders (order_id) and Products (product_id).
    detail_order_ids = [random.choice(order_ids) for _ in range(200)]
    detail_product_ids = [random.choice(product_ids) for _ in range(200)]
    detail_quantities = [random.randint(1, 10) for _ in range(200)]
    order_details_df = pd.DataFrame({
        "order_detail_id": list(range(10001, 10201)),
        "order_id": detail_order_ids,
        "product_id": detail_product_ids,
        "quantity": detail_quantities,
    })

    tables = {
        "Customers": customers_df,
        "Products": products_df,
        "Orders": orders_df,
        "OrderDetails": order_details_df,
    }

    # The third result (integrity details) is not reported by this demo.
    foreign_keys, inclusion_dependencies, _integrity_results = profile_foreign_keys(
        list(tables.values()), list(tables), min_match_ratio=0.9
    )

    print("=== IMPROVED FOREIGN KEY DETECTION RESULTS ===")

    print(f"\nFound {len(foreign_keys)} potential foreign key relationships:")
    for i, (pk_table, pk_col, fk_table, fk_col, match_ratio) in enumerate(foreign_keys, start=1):
        print(f"{i}. {fk_table}.{fk_col} → {pk_table}.{pk_col} (Match: {match_ratio:.2%})")

    print(f"\nFound {len(inclusion_dependencies)} inclusion dependencies:")
    # Cap the listing at ten entries and summarise the remainder.
    shown = inclusion_dependencies[:10]
    for i, (table1, col1, table2, col2, match_ratio) in enumerate(shown, start=1):
        print(f"{i}. {table2}.{col2} ⊆ {table1}.{col1} (Match: {match_ratio:.2%})")

    if len(inclusion_dependencies) > 10:
        print(f"... and {len(inclusion_dependencies) - 10} more")
|
|
776
|
+
|
|
777
|
+
# Executing this module directly (rather than importing it) starts the
# interactive GUI demo.
if __name__ == "__main__":
    test_profile_foreign_keys()
|