sqlshell 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sqlshell might be problematic. Click here for more details.

@@ -0,0 +1,455 @@
1
+ import sys
2
+ import itertools
3
+ import pandas as pd
4
+ from typing import List, Dict, Tuple, Set
5
+ from PyQt6.QtWidgets import (
6
+ QApplication, QWidget, QVBoxLayout, QLabel, QTableWidget, QTableWidgetItem, QHeaderView, QTabWidget, QMainWindow
7
+ )
8
+ from PyQt6.QtCore import Qt
9
+
10
+
11
def find_foreign_keys(dfs: List[pd.DataFrame], df_names: List[str], min_match_ratio: float = 0.95):
    """
    Discover potential foreign key relationships between DataFrames.

    A column is treated as a primary-key candidate when it has no missing
    values and as many distinct values as the DataFrame has rows. Each
    candidate is compared with every column of every *other* DataFrame that
    has exactly the same dtype; the match ratio is the share of that column's
    distinct non-null values that also occur in the candidate key.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Names of the DataFrames (used for reporting only; they do not
      have to be unique)
    - min_match_ratio: Minimum ratio of matching values to consider a foreign key

    Returns:
    - List of tuples (referenced_table, referenced_column, referencing_table,
      referencing_column, match_ratio), sorted by match ratio (descending)
    """
    # Distinct non-null values per (frame position, column), built lazily and
    # at most once. Without the cache the same set would be rebuilt for every
    # primary-key candidate it is compared against.
    distinct_cache = {}

    def distinct_values(frame_idx, column):
        cache_key = (frame_idx, column)
        if cache_key not in distinct_cache:
            distinct_cache[cache_key] = set(dfs[frame_idx][column].dropna())
        return distinct_cache[cache_key]

    # Candidates are stored by position rather than by name: two frames that
    # share a name must not have their candidate lists merged, otherwise a
    # column of one frame would be looked up in the other.
    pk_candidates = [
        [
            col for col in df.columns
            if df[col].nunique() == len(df) and not df[col].isna().any()
        ]
        for df in dfs
    ]

    foreign_keys = []
    for i, df1 in enumerate(dfs):
        name1 = df_names[i]

        for pk_col in pk_candidates[i]:
            pk_dtype = df1[pk_col].dtype
            # A candidate key has no missing values, so dropping them in the
            # cache helper does not change the set.
            pk_values = distinct_values(i, pk_col)

            for j, df2 in enumerate(dfs):
                # Skip self-references
                if i == j:
                    continue
                name2 = df_names[j]

                for fk_col in df2.columns:
                    # Only columns with an identical dtype are compared
                    if pk_dtype != df2[fk_col].dtype:
                        continue

                    fk_values = distinct_values(j, fk_col)

                    # Skip columns without any non-null value
                    if not fk_values:
                        continue

                    match_ratio = len(fk_values & pk_values) / len(fk_values)
                    if match_ratio >= min_match_ratio:
                        foreign_keys.append((name1, pk_col, name2, fk_col, match_ratio))

    # Stable sort: ties keep discovery order
    foreign_keys.sort(key=lambda fk: fk[4], reverse=True)
    return foreign_keys
80
+
81
+
82
def find_inclusion_dependencies(dfs: List[pd.DataFrame], df_names: List[str], min_match_ratio: float = 0.8):
    """
    Find inclusion dependencies (more general than foreign keys) between DataFrames.

    An inclusion dependency is reported when at least ``min_match_ratio`` of
    the distinct non-null values of one column also occur in a column of a
    different DataFrame. Only columns with exactly the same dtype are
    compared, and columns without any non-null value are ignored.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Names of the DataFrames
    - min_match_ratio: Minimum ratio of matching values

    Returns:
    - List of tuples (referenced_table, referenced_column, referencing_table,
      referencing_column, match_ratio), sorted by match ratio (descending)
    """
    # Distinct non-null values per (frame position, column), built lazily and
    # at most once. The nested loops below visit each column many times, and
    # rebuilding the set on every visit is the dominant cost on wide tables.
    distinct_cache = {}

    def distinct_values(frame_idx, column):
        cache_key = (frame_idx, column)
        if cache_key not in distinct_cache:
            distinct_cache[cache_key] = set(dfs[frame_idx][column].dropna())
        return distinct_cache[cache_key]

    dependencies = []

    for i, df1 in enumerate(dfs):
        name1 = df_names[i]

        for j, df2 in enumerate(dfs):
            name2 = df_names[j]

            # A frame is never compared with itself
            if i == j:
                continue

            for col1 in df1.columns:
                referenced_values = distinct_values(i, col1)

                # Skip empty columns
                if not referenced_values:
                    continue

                referenced_dtype = df1[col1].dtype

                for col2 in df2.columns:
                    # Only columns with an identical dtype are compared
                    if referenced_dtype != df2[col2].dtype:
                        continue

                    referencing_values = distinct_values(j, col2)

                    # Skip empty columns
                    if not referencing_values:
                        continue

                    # Share of the referencing column's values found in the
                    # referenced column
                    match_ratio = (
                        len(referencing_values & referenced_values) / len(referencing_values)
                    )
                    if match_ratio >= min_match_ratio:
                        dependencies.append((name1, col1, name2, col2, match_ratio))

    # Stable sort: ties keep discovery order
    dependencies.sort(key=lambda dep: dep[4], reverse=True)
    return dependencies
140
+
141
+
142
def profile_referential_integrity(dfs: List[pd.DataFrame], df_names: List[str], foreign_keys):
    """
    Profile the referential integrity of discovered foreign keys.

    Parameters:
    - dfs: List of pandas DataFrames
    - df_names: Names of the DataFrames (if a name occurs more than once, the
      last DataFrame with that name is used)
    - foreign_keys: List of foreign key relationships as 5-tuples
      (referenced_table, referenced_column, referencing_table,
      referencing_column, match_ratio); the match ratio is ignored here

    Returns:
    - Dictionary keyed by (referenced_table, referenced_column,
      referencing_table, referencing_column). Each value is a dict with:
        'violation_count': number of distinct foreign key values missing from
            the referenced column
        'violating_rows':  number of non-null foreign key rows holding such a
            value
        'total_fk_values': number of non-null foreign key rows
        'violation_ratio': violating_rows / total_fk_values (0 when the
            foreign key column has no non-null rows)
        'violations':      up to 10 distinct violating values, in order of
            first appearance
    """
    integrity_results = {}

    # Create lookup for DataFrames by name
    df_dict = dict(zip(df_names, dfs))

    for pk_table, pk_col, fk_table, fk_col, _ in foreign_keys:
        pk_values = set(df_dict[pk_table][pk_col])
        fk_non_null = df_dict[fk_table][fk_col].dropna()

        # One entry per offending row, so the ratio below compares rows with
        # rows. Dividing the number of distinct bad values by the row count
        # would under-report whenever a bad value is repeated.
        offending = [value for value in fk_non_null if value not in pk_values]

        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # which makes the stored examples deterministic.
        distinct_offending = list(dict.fromkeys(offending))

        total_fk_values = len(fk_non_null)
        violation_ratio = len(offending) / total_fk_values if total_fk_values > 0 else 0

        integrity_results[(pk_table, pk_col, fk_table, fk_col)] = {
            'violation_count': len(distinct_offending),
            'violating_rows': len(offending),
            'violation_ratio': violation_ratio,
            'total_fk_values': total_fk_values,
            'violations': distinct_offending[:10]  # Only store first 10 violations for display
        }

    return integrity_results
187
+
188
+
189
def profile_foreign_keys(dfs: List[pd.DataFrame], df_names: List[str] = None, min_match_ratio: float = 0.95):
    """
    Analyze a list of pandas DataFrames to discover foreign key relationships.

    Runs the three analysis steps in order: foreign key discovery, the more
    permissive inclusion-dependency search, and referential-integrity
    profiling of the discovered foreign keys.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Optional list of names for the DataFrames. If None, names
      "Table_1", "Table_2", ... are generated.
    - min_match_ratio: Minimum ratio of matching values to consider a foreign
      key. Inclusion dependencies use 80% of this threshold.

    Returns:
    - Tuple of (foreign_keys, inclusion_dependencies, integrity_results)

    Raises:
    - ValueError: if df_names is given and its length differs from dfs
    """
    # Generate default names if not provided
    if df_names is None:
        df_names = [f"Table_{position}" for position in range(1, len(dfs) + 1)]

    # Validate with an explicit raise: an `assert` disappears when Python runs
    # with -O, and the mismatch would then surface later as an IndexError or
    # as silently ignored names.
    if len(dfs) != len(df_names):
        raise ValueError("Number of DataFrames must match number of names")

    # Find foreign keys
    foreign_keys = find_foreign_keys(dfs, df_names, min_match_ratio)

    # Find more general inclusion dependencies (looser threshold)
    inclusion_dependencies = find_inclusion_dependencies(dfs, df_names, min_match_ratio * 0.8)

    # Profile referential integrity
    integrity_results = profile_referential_integrity(dfs, df_names, foreign_keys)

    return foreign_keys, inclusion_dependencies, integrity_results
218
+
219
+
220
def _ratio_item(ratio, color):
    """Return a table cell showing *ratio* as a percentage in the given text color."""
    item = QTableWidgetItem(f"{ratio:.2%}")
    item.setForeground(color)
    return item


def _relationship_tab(title, relationships, high_threshold, medium_threshold):
    """Build a tab listing 5-tuple relationships with a color-coded match ratio."""
    tab = QWidget()
    tab_layout = QVBoxLayout()

    heading = QLabel(title)
    heading.setStyleSheet("font-weight: bold;")
    tab_layout.addWidget(heading)

    table = QTableWidget(len(relationships), 5)
    table.setHorizontalHeaderLabels([
        "Referenced Table", "Referenced Column", "Referencing Table", "Referencing Column", "Match Ratio"
    ])
    table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)

    for row, (ref_table, ref_col, src_table, src_col, match_ratio) in enumerate(relationships):
        for column, label in enumerate((ref_table, ref_col, src_table, src_col)):
            # str() matters: pandas column labels may be integers, and
            # QTableWidgetItem(int) selects the "item type" overload, which
            # produces an empty cell instead of showing the label.
            table.setItem(row, column, QTableWidgetItem(str(label)))

        if match_ratio >= high_threshold:
            color = Qt.GlobalColor.darkGreen
        elif match_ratio >= medium_threshold:
            color = Qt.GlobalColor.darkBlue
        else:
            color = Qt.GlobalColor.darkYellow
        table.setItem(row, 4, _ratio_item(match_ratio, color))

    tab_layout.addWidget(table)
    tab.setLayout(tab_layout)
    return tab


def _integrity_tab(integrity_results):
    """Build the tab listing referential-integrity violations per relationship."""
    tab = QWidget()
    tab_layout = QVBoxLayout()

    heading = QLabel("Referential Integrity Analysis")
    heading.setStyleSheet("font-weight: bold;")
    tab_layout.addWidget(heading)

    explanation = QLabel(
        "This table shows referential integrity violations for discovered foreign keys. "
        "A violation occurs when a value in the foreign key column doesn't exist in the referenced column."
    )
    explanation.setWordWrap(True)
    tab_layout.addWidget(explanation)

    table = QTableWidget(len(integrity_results), 5)
    table.setHorizontalHeaderLabels([
        "Relationship", "Violations", "Total FK Values", "Violation %", "Example Violations"
    ])
    table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)

    for row, (key, stats) in enumerate(integrity_results.items()):
        pk_table, pk_col, fk_table_name, fk_col = key
        relationship = f"{fk_table_name}.{fk_col} → {pk_table}.{pk_col}"

        table.setItem(row, 0, QTableWidgetItem(relationship))
        table.setItem(row, 1, QTableWidgetItem(str(stats['violation_count'])))
        table.setItem(row, 2, QTableWidgetItem(str(stats['total_fk_values'])))

        # Green: no violations, blue: under 1%, red: everything else
        ratio = stats['violation_ratio']
        if ratio == 0:
            color = Qt.GlobalColor.darkGreen
        elif ratio < 0.01:
            color = Qt.GlobalColor.darkBlue
        else:
            color = Qt.GlobalColor.darkRed
        table.setItem(row, 3, _ratio_item(ratio, color))

        # Only a sample of violations is stored; say how many are not shown
        shown = stats['violations']
        examples = ', '.join(str(value) for value in shown)
        if stats['violation_count'] > len(shown):
            examples += f" (and {stats['violation_count'] - len(shown)} more)"
        table.setItem(row, 4, QTableWidgetItem(examples))

    tab_layout.addWidget(table)
    tab.setLayout(tab_layout)
    return tab


def visualize_foreign_keys(dfs: List[pd.DataFrame], df_names: List[str] = None, min_match_ratio: float = 0.95):
    """
    Create a visual representation of foreign key relationships between DataFrames.

    The window has three tabs: discovered foreign keys, inclusion
    dependencies, and referential-integrity violations. It is shown before
    being returned; the caller must keep a reference to it and run the Qt
    event loop.

    Parameters:
    - dfs: List of pandas DataFrames to analyze
    - df_names: Optional list of names for the DataFrames. If None, names will be generated.
    - min_match_ratio: Minimum ratio of matching values to consider a foreign key

    Returns:
    - QMainWindow: The visualization window

    Raises:
    - RuntimeError: if no QApplication has been created yet
    """
    # Qt aborts the whole process when a widget is constructed without an
    # application object; fail with a catchable error instead.
    if QApplication.instance() is None:
        raise RuntimeError("A QApplication must be created before calling visualize_foreign_keys()")

    # Generate default names if not provided
    if df_names is None:
        df_names = [f"Table_{i+1}" for i in range(len(dfs))]

    # Get profile results
    foreign_keys, inclusion_dependencies, integrity_results = profile_foreign_keys(
        dfs, df_names, min_match_ratio
    )

    # Create main window
    window = QMainWindow()
    window.setWindowTitle("Foreign Key Analysis")
    window.resize(900, 700)

    # Create central widget and layout
    central_widget = QWidget()
    window.setCentralWidget(central_widget)
    layout = QVBoxLayout(central_widget)

    # Add header
    header = QLabel(f"Analyzed {len(dfs)} tables with potential foreign key relationships")
    header.setAlignment(Qt.AlignmentFlag.AlignCenter)
    header.setStyleSheet("font-size: 14pt; font-weight: bold; margin: 10px;")
    layout.addWidget(header)

    # Add description
    description = QLabel(
        "This analysis helps identify potential foreign key relationships between tables. "
        "Foreign keys are columns in one table that reference the primary key of another table. "
        "The match ratio indicates how many values in the foreign key column exist in the referenced column."
    )
    description.setAlignment(Qt.AlignmentFlag.AlignCenter)
    description.setWordWrap(True)
    description.setStyleSheet("margin-bottom: 10px;")
    layout.addWidget(description)

    # Create tabs (color thresholds differ between the two relationship tabs)
    tabs = QTabWidget()
    tabs.addTab(
        _relationship_tab("Potential Foreign Key Relationships", foreign_keys, 0.99, 0.9),
        "Foreign Keys",
    )
    tabs.addTab(
        _relationship_tab(
            "Inclusion Dependencies (Values in one column are a subset of another)",
            inclusion_dependencies, 0.95, 0.8,
        ),
        "Inclusion Dependencies",
    )
    tabs.addTab(_integrity_tab(integrity_results), "Referential Integrity")
    layout.addWidget(tabs)

    # Show the window
    window.show()
    return window
397
+
398
+
399
def test_profile_foreign_keys():
    """
    Test function to demonstrate foreign key detection with sample data.

    Builds four related sample tables (customers, products, orders, order
    details), injects five orders that point at non-existent customers, opens
    the analysis window and runs the Qt event loop. The function does not
    return: it ends the process through sys.exit() when the event loop stops.
    """
    import random

    # A private generator gives the same sequence as seeding the module-level
    # one with 42, without reseeding the RNG shared by the rest of the process.
    rng = random.Random(42)

    # Customers table
    customers_df = pd.DataFrame({
        "customer_id": list(range(1, 21)),
        "customer_name": ["Customer " + str(i) for i in range(1, 21)],
        "city": ["City " + str(i % 5) for i in range(1, 21)]
    })

    # Products table
    products_data = {
        "product_id": list(range(101, 111)),
        "product_name": ["Product " + str(i) for i in range(101, 111)],
        "category": ["Category " + str(i % 3) for i in range(101, 111)]
    }
    products_df = pd.DataFrame(products_data)

    # Orders table (customer_id refers to the customers table)
    orders_data = {
        "order_id": list(range(1001, 1101)),
        "customer_id": [rng.randint(1, 20) for _ in range(100)],
        "order_date": [pd.Timestamp("2021-01-01") + pd.Timedelta(days=i) for i in range(100)]
    }
    orders_df = pd.DataFrame(orders_data)

    # Order details table (order_id and product_id refer to orders and products)
    order_details_df = pd.DataFrame({
        "order_detail_id": list(range(10001, 10201)),
        "order_id": [rng.choice(orders_data["order_id"]) for _ in range(200)],
        "product_id": [rng.choice(products_data["product_id"]) for _ in range(200)],
        "quantity": [rng.randint(1, 10) for _ in range(200)]
    })

    # Add some referential integrity violations: the last five orders get
    # customer IDs outside the 1..20 range used by the customers table
    orders_df.loc[95:99, "customer_id"] = [25, 26, 27, 28, 29]

    # Create and show visualization
    dfs = [customers_df, products_df, orders_df, order_details_df]
    df_names = ["Customers", "Products", "Orders", "OrderDetails"]

    # Reuse an existing application object if there is one; constructing a
    # second QApplication in the same process is an error.
    app = QApplication.instance() or QApplication(sys.argv)
    # Keep a reference so the window is not garbage-collected while shown
    window = visualize_foreign_keys(dfs, df_names, min_match_ratio=0.9)
    sys.exit(app.exec())
452
+
453
# Only run the demo when this file is executed directly as a script;
# importing the module does not start the GUI.
if __name__ == "__main__":
    test_profile_foreign_keys()