sqlshell 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. sqlshell/__init__.py +84 -0
  2. sqlshell/__main__.py +4926 -0
  3. sqlshell/ai_autocomplete.py +392 -0
  4. sqlshell/ai_settings_dialog.py +337 -0
  5. sqlshell/context_suggester.py +768 -0
  6. sqlshell/create_test_data.py +152 -0
  7. sqlshell/data/create_test_data.py +137 -0
  8. sqlshell/db/__init__.py +6 -0
  9. sqlshell/db/database_manager.py +1318 -0
  10. sqlshell/db/export_manager.py +188 -0
  11. sqlshell/editor.py +1166 -0
  12. sqlshell/editor_integration.py +127 -0
  13. sqlshell/execution_handler.py +421 -0
  14. sqlshell/menus.py +262 -0
  15. sqlshell/notification_manager.py +370 -0
  16. sqlshell/query_tab.py +904 -0
  17. sqlshell/resources/__init__.py +1 -0
  18. sqlshell/resources/icon.png +0 -0
  19. sqlshell/resources/logo_large.png +0 -0
  20. sqlshell/resources/logo_medium.png +0 -0
  21. sqlshell/resources/logo_small.png +0 -0
  22. sqlshell/resources/splash_screen.gif +0 -0
  23. sqlshell/space_invaders.py +501 -0
  24. sqlshell/splash_screen.py +405 -0
  25. sqlshell/sqlshell/__init__.py +5 -0
  26. sqlshell/sqlshell/create_test_data.py +118 -0
  27. sqlshell/sqlshell/create_test_databases.py +96 -0
  28. sqlshell/sqlshell_demo.png +0 -0
  29. sqlshell/styles.py +257 -0
  30. sqlshell/suggester_integration.py +330 -0
  31. sqlshell/syntax_highlighter.py +124 -0
  32. sqlshell/table_list.py +996 -0
  33. sqlshell/ui/__init__.py +6 -0
  34. sqlshell/ui/bar_chart_delegate.py +49 -0
  35. sqlshell/ui/filter_header.py +469 -0
  36. sqlshell/utils/__init__.py +16 -0
  37. sqlshell/utils/profile_cn2.py +1661 -0
  38. sqlshell/utils/profile_column.py +2635 -0
  39. sqlshell/utils/profile_distributions.py +616 -0
  40. sqlshell/utils/profile_entropy.py +347 -0
  41. sqlshell/utils/profile_foreign_keys.py +779 -0
  42. sqlshell/utils/profile_keys.py +2834 -0
  43. sqlshell/utils/profile_ohe.py +934 -0
  44. sqlshell/utils/profile_ohe_advanced.py +754 -0
  45. sqlshell/utils/profile_ohe_comparison.py +237 -0
  46. sqlshell/utils/profile_prediction.py +926 -0
  47. sqlshell/utils/profile_similarity.py +876 -0
  48. sqlshell/utils/search_in_df.py +90 -0
  49. sqlshell/widgets.py +400 -0
  50. sqlshell-0.4.4.dist-info/METADATA +441 -0
  51. sqlshell-0.4.4.dist-info/RECORD +54 -0
  52. sqlshell-0.4.4.dist-info/WHEEL +5 -0
  53. sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
  54. sqlshell-0.4.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2834 @@
1
+ import sys
2
+ import itertools
3
+ import pandas as pd
4
+ import numpy as np
5
+ import random
6
+ import time
7
+ import math
8
+ from collections import defaultdict
9
+ from PyQt6.QtWidgets import (
10
+ QApplication, QWidget, QVBoxLayout, QLabel, QTableWidget, QTableWidgetItem, QHeaderView, QTabWidget, QMainWindow
11
+ )
12
+ from PyQt6.QtCore import Qt
13
+
14
+
15
+ def estimate_computation_cost(n_rows, n_cols, max_combination_size, max_lhs_size):
16
+ """
17
+ Estimate computational cost to decide on sampling strategy.
18
+ Returns (estimated_seconds, should_sample, sample_size)
19
+ """
20
+ # Special handling for high-column datasets - these are computationally expensive
21
+ if n_cols > 50:
22
+ # Very aggressive limits for high-column datasets
23
+ print(f" High-column dataset detected ({n_cols} columns) - using aggressive optimization")
24
+ return float('inf'), True, min(5000, max(1000, n_rows // 20))
25
+
26
+ # Base cost factors
27
+ fd_combinations = sum(math.comb(n_cols, i) for i in range(1, max_lhs_size + 1))
28
+ key_combinations = sum(math.comb(n_cols, i) for i in range(1, max_combination_size + 1))
29
+
30
+ # Rough estimate: each combination costs O(n_rows * log(n_rows)) for groupby
31
+ fd_cost = fd_combinations * n_rows * math.log(max(n_rows, 2)) * 1e-6
32
+ key_cost = key_combinations * n_rows * math.log(max(n_rows, 2)) * 1e-6
33
+
34
+ total_cost = fd_cost + key_cost
35
+
36
+ # Sampling thresholds
37
+ if total_cost > 30: # More than 30 seconds estimated
38
+ return total_cost, True, min(50000, max(10000, n_rows // 10))
39
+ elif total_cost > 10: # More than 10 seconds estimated
40
+ return total_cost, True, min(100000, max(20000, n_rows // 5))
41
+ else:
42
+ return total_cost, False, n_rows
43
+
44
+
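A quick sketch of how this heuristic plays out in practice (illustrative only; the import path sqlshell.utils.profile_keys is assumed from this package's layout):

    from sqlshell.utils.profile_keys import estimate_computation_cost

    # 1M rows x 20 cols: the cost scales with C(n_cols, k) * n_rows * log(n_rows),
    # which lands far above the 30-second threshold, so sampling kicks in and
    # the sample is capped at 50,000 rows.
    est_seconds, should_sample, sample_size = estimate_computation_cost(
        n_rows=1_000_000, n_cols=20, max_combination_size=3, max_lhs_size=2
    )
    print(should_sample, sample_size)  # True 50000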
45
+ def sample_dataframe_intelligently(df, sample_size, random_state=42):
46
+ """
47
+ Sample dataframe while preserving data characteristics for key analysis.
48
+ """
49
+ try:
50
+ if len(df) <= sample_size:
51
+ return df, False
52
+
53
+ # Ensure sample_size is valid
54
+ sample_size = min(sample_size, len(df))
55
+ if sample_size <= 0:
56
+ return (df.head(100) if len(df) > 100 else df), True
57
+
58
+ # Strategy: Take a mix of random sample and important patterns
59
+ np.random.seed(random_state)
60
+
61
+ # 1. Take a random sample (80% of sample)
62
+ random_sample_size = max(1, int(sample_size * 0.8))
63
+ random_sample_size = min(random_sample_size, len(df))
64
+
65
+ try:
66
+ random_indices = np.random.choice(len(df), size=random_sample_size, replace=False)
67
+ except ValueError:
68
+ # Fallback if numpy choice fails
69
+ random_indices = np.random.permutation(len(df))[:random_sample_size]
70
+
71
+ # 2. Add unique value representatives (20% of sample)
72
+ remaining_sample = sample_size - random_sample_size
73
+ unique_representatives = []
74
+
75
+ if remaining_sample > 0:
76
+ for col in df.columns:
77
+ if len(unique_representatives) >= remaining_sample:
78
+ break
79
+ try:
80
+ # Get indices of unique values not already in random sample
81
+ unique_values = df[col].drop_duplicates()
82
+ unique_indices = unique_values.index.tolist()
83
+ new_indices = [i for i in unique_indices if i not in random_indices and i < len(df)]
84
+ unique_representatives.extend(new_indices[:remaining_sample - len(unique_representatives)])
85
+ except Exception:
86
+ continue # Skip problematic columns
87
+
88
+ # Combine samples and ensure all indices are valid
89
+ all_indices = list(set(random_indices) | set(unique_representatives))
90
+ all_indices = [i for i in all_indices if 0 <= i < len(df)] # Bounds check
91
+ all_indices = all_indices[:sample_size] # Limit to sample size
92
+
93
+ if not all_indices:
94
+ # Fallback: just take first sample_size rows
95
+ return df.head(sample_size), True
96
+
97
+ try:
98
+ sampled_df = df.iloc[all_indices].reset_index(drop=True)
99
+ return sampled_df, True
100
+ except (IndexError, KeyError):
101
+ # Final fallback: simple head sampling
102
+ return df.head(sample_size), True
103
+
104
+ except Exception as e:
105
+ print(f"Warning: Error in intelligent sampling: {e}. Using simple head sampling.")
106
+ safe_sample_size = min(sample_size, len(df))
107
+ return df.head(safe_sample_size), True
108
+
109
+
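A minimal usage sketch for the sampler (assumes the module is importable as sqlshell.utils.profile_keys):

    import numpy as np
    import pandas as pd
    from sqlshell.utils.profile_keys import sample_dataframe_intelligently

    big = pd.DataFrame({"id": np.arange(60_000), "grp": np.arange(60_000) % 7})
    sampled, was_sampled = sample_dataframe_intelligently(big, sample_size=5_000)
    # Roughly 80% random rows plus representatives of distinct values,
    # never more than sample_size rows in total.
    print(was_sampled, len(sampled) <= 5_000)  # True True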
110
+ def find_functional_dependencies_ultra_optimized(df: pd.DataFrame, max_lhs_size: int = 2):
111
+ """
112
+ Ultra-optimized functional dependency discovery for large datasets.
113
+ Maintains correctness while improving performance through smart sampling and caching.
114
+ """
115
+ n_rows = len(df)
116
+ cols = list(df.columns)
117
+
118
+ if n_rows == 0 or len(cols) < 2:
119
+ return []
120
+
121
+ # Only sample for very large datasets to maintain accuracy for smaller ones
122
+ original_df = df
123
+ was_sampled = False
124
+ if n_rows > 50000: # Only sample for very large datasets
125
+ cost, should_sample, sample_size = estimate_computation_cost(n_rows, len(cols), 3, max_lhs_size)
126
+ if should_sample:
127
+ df, was_sampled = sample_dataframe_intelligently(df, sample_size)
128
+ n_rows = len(df)
129
+ print(f" Sampled {n_rows} rows from {len(original_df)} for FD analysis")
130
+
131
+ fds = []
132
+
133
+ # Pre-compute all cardinalities once
134
+ col_cardinalities = {col: df[col].nunique() for col in cols}
135
+
136
+ # Use the same filtering logic as the original but with pre-computed cardinalities
137
+ # Don't be too aggressive with filtering to maintain consistency
138
+ non_unique_cols = [col for col in cols if col_cardinalities[col] < n_rows]
139
+
140
+ # Group cache for efficient reuse
141
+ group_cache = {}
142
+
143
+ # Apply combination limits only for very large datasets
144
+ if n_rows > 100000:
145
+ max_combinations_per_size = {1: min(100, len(cols)), 2: min(200, len(cols) ** 2)}
146
+ else:
147
+ max_combinations_per_size = {1: len(cols), 2: len(cols) ** 2} # No limits for smaller datasets
148
+
149
+ for size in range(1, max_lhs_size + 1):
150
+ # Use same logic as optimized version for consistency
151
+ lhs_candidates = non_unique_cols if size == 1 else cols
152
+
153
+ lhs_combinations = list(itertools.combinations(lhs_candidates, size))
154
+
155
+ # Only limit combinations for very large datasets
156
+ if n_rows > 100000:
157
+ max_combos = max_combinations_per_size.get(size, len(lhs_combinations))
158
+ if len(lhs_combinations) > max_combos:
159
+ # Prioritize by cardinality (lower cardinality = more likely to be determinant)
160
+ lhs_combinations = sorted(lhs_combinations,
161
+ key=lambda x: sum(col_cardinalities[col] for col in x))[:max_combos]
162
+
163
+ for lhs in lhs_combinations:
164
+ lhs_tuple = tuple(lhs)
165
+
166
+ # Use cached groupby if available
167
+ if lhs_tuple not in group_cache:
168
+ try:
169
+ grouped = df.groupby(list(lhs), sort=False, dropna=False)
170
+ group_sizes = grouped.size()
171
+ group_cache[lhs_tuple] = (grouped, group_sizes)
172
+ except Exception:
173
+ continue # Skip problematic groupings
174
+ else:
175
+ grouped, group_sizes = group_cache[lhs_tuple]
176
+
177
+ # Use same logic as optimized version
178
+ n_groups = len(group_sizes)
179
+ if group_sizes.max() == 1:
180
+ continue # every group is a single row, so any FD from this LHS is trivial
181
+
182
+ # Test all RHS candidates like the original, but with early termination heuristics
183
+ for rhs in cols:
184
+ if rhs in lhs:
185
+ continue
186
+
187
+ # Only apply early termination for large datasets
188
+ if n_rows > 100000 and col_cardinalities[rhs] > n_groups:
189
+ continue
190
+
191
+ try:
192
+ # Check FD using same logic as optimized version
193
+ rhs_per_group = grouped[rhs].nunique()
194
+ if (rhs_per_group <= 1).all():
195
+ fds.append((lhs, rhs))
196
+ except Exception:
197
+ continue # Skip problematic columns
198
+
199
+ return fds
200
+
201
+
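A small, hand-checkable example of what the function above reports (import path assumed):

    import pandas as pd
    from sqlshell.utils.profile_keys import find_functional_dependencies_ultra_optimized

    df = pd.DataFrame({
        "customer_id":   [1, 1, 2, 2, 3],
        "customer_name": ["Ann", "Ann", "Bo", "Bo", "Cy"],
        "amount":        [10, 20, 30, 40, 50],
    })
    fds = find_functional_dependencies_ultra_optimized(df, max_lhs_size=1)
    # Expect (('customer_id',), 'customer_name') among the results; 'amount' is
    # unique per row, so it is filtered out of the size-1 LHS candidates.
    print(fds)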
202
+ def find_candidate_keys_ultra_optimized(df: pd.DataFrame, max_combination_size: int = 2):
203
+ """
204
+ Ultra-optimized candidate key discovery for large datasets.
205
+ Maintains correctness while improving performance.
206
+ """
207
+ n_rows = len(df)
208
+ cols = list(df.columns)
209
+
210
+ if n_rows == 0:
211
+ return [], [], []
212
+
213
+ # Only sample for very large datasets
214
+ original_df = df
215
+ was_sampled = False
216
+ if n_rows > 50000: # Only sample for very large datasets
217
+ cost, should_sample, sample_size = estimate_computation_cost(n_rows, len(cols), max_combination_size, 2)
218
+ if should_sample:
219
+ df, was_sampled = sample_dataframe_intelligently(df, sample_size)
220
+ n_rows = len(df)
221
+ print(f" Sampled {n_rows} rows from {len(original_df)} for key analysis")
222
+
223
+ all_keys = []
224
+
225
+ # Check single columns first (same as optimized version)
226
+ single_column_keys = []
227
+ col_cardinalities = {}
228
+
229
+ for col in cols:
230
+ cardinality = df[col].nunique()
231
+ col_cardinalities[col] = cardinality
232
+ if cardinality == n_rows:
233
+ single_column_keys.append((col,))
234
+ all_keys.append((col,))
235
+
236
+ # Early termination only for single-column case if we have keys
237
+ if single_column_keys and max_combination_size == 1:
238
+ return all_keys, single_column_keys, []
239
+
240
+ # Apply conservative limits only for very large datasets
241
+ if n_rows > 100000:
242
+ max_combination_size = min(max_combination_size, 3)
243
+ max_combinations_to_test = min(500, math.comb(len(cols), 2))
244
+ else:
245
+ max_combinations_to_test = float('inf') # No limits for smaller datasets
246
+
247
+ # Multi-column key discovery
248
+ for size in range(2, max_combination_size + 1):
249
+ if size > len(cols):
250
+ break
251
+
252
+ combinations = list(itertools.combinations(cols, size))
253
+
254
+ # Only limit and prioritize for very large datasets
255
+ if n_rows > 100000 and len(combinations) > max_combinations_to_test:
256
+ # Prioritize combinations by likelihood of being keys
257
+ combinations = sorted(combinations,
258
+ key=lambda x: sum(col_cardinalities.get(col, n_rows) for col in x))
259
+ combinations = combinations[:max_combinations_to_test]
260
+
261
+ size_keys = []
262
+ tested_count = 0
263
+
264
+ for combo in combinations:
265
+ # Skip if contains single-column key
266
+ if any((col,) in single_column_keys for col in combo):
267
+ continue
268
+
269
+ # Skip if subset is already a key (same logic as optimized)
270
+ is_superkey = False
271
+ for subset_size in range(1, size):
272
+ for subset in itertools.combinations(combo, subset_size):
273
+ if subset in all_keys:
274
+ is_superkey = True
275
+ break
276
+ if is_superkey:
277
+ break
278
+
279
+ if is_superkey:
280
+ continue
281
+
282
+ # Check uniqueness using same method as optimized
283
+ try:
284
+ unique_count = len(df[list(combo)].drop_duplicates())
285
+ if unique_count == n_rows:
286
+ size_keys.append(combo)
287
+ all_keys.append(combo)
288
+ except Exception:
289
+ continue # Skip problematic combinations
290
+
291
+ tested_count += 1
292
+ # Only apply testing limits for very large datasets
293
+ if n_rows > 100000 and tested_count >= max_combinations_to_test // (size * size):
294
+ break
295
+
296
+ # Early termination if no keys found and we have smaller keys
297
+ if not size_keys and all_keys:
298
+ break
299
+
300
+ # Classify keys (same logic as optimized)
301
+ candidate_keys = []
302
+ superkeys = []
303
+
304
+ for key in all_keys:
305
+ is_candidate = True
306
+ for other_key in all_keys:
307
+ if len(other_key) < len(key) and set(other_key).issubset(set(key)):
308
+ is_candidate = False
309
+ break
310
+
311
+ if is_candidate:
312
+ candidate_keys.append(key)
313
+ else:
314
+ superkeys.append(key)
315
+
316
+ return all_keys, candidate_keys, superkeys
317
+
318
+
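And a matching sketch for key discovery: single columns come back as 1-tuples, composite keys as n-tuples (import path assumed):

    import pandas as pd
    from sqlshell.utils.profile_keys import find_candidate_keys_ultra_optimized

    df = pd.DataFrame({
        "order_id": [1, 1, 2, 2],
        "line_no":  [1, 2, 1, 2],
        "product":  ["a", "a", "a", "b"],
    })
    all_keys, candidate_keys, superkeys = find_candidate_keys_ultra_optimized(
        df, max_combination_size=2
    )
    # No single column is unique, so the only key is the composite one:
    print(candidate_keys)  # [('order_id', 'line_no')]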
319
+ def profile_ultra_optimized(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
320
+ """
321
+ Ultra-optimized profile function for large datasets.
322
+ """
323
+ start_time = time.time()
324
+ n_rows = len(df)
325
+ cols = list(df.columns)
326
+
327
+ print(f"Starting analysis of {n_rows:,} rows × {len(cols)} columns...")
328
+
329
+ # Intelligent parameter adjustment based on data size
330
+ if n_rows > 100000:
331
+ max_combination_size = min(max_combination_size, 2)
332
+ max_lhs_size = min(max_lhs_size, 2)
333
+ print(f" Large dataset detected - limiting analysis to combinations of size {max_combination_size}")
334
+ elif n_rows > 50000:
335
+ max_combination_size = min(max_combination_size, 3)
336
+ max_lhs_size = min(max_lhs_size, 2)
337
+
338
+ # Discover functional dependencies
339
+ fd_start = time.time()
340
+ fds = find_functional_dependencies_ultra_optimized(df, max_lhs_size)
341
+ fd_time = time.time() - fd_start
342
+ print(f" FD discovery completed in {fd_time:.2f}s - found {len(fds)} dependencies")
343
+
344
+ fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]
345
+
346
+ # Discover keys
347
+ key_start = time.time()
348
+ all_keys, candidate_keys, superkeys = find_candidate_keys_ultra_optimized(df, max_combination_size)
349
+ key_time = time.time() - key_start
350
+ print(f" Key discovery completed in {key_time:.2f}s - found {len(candidate_keys)} candidate keys")
351
+
352
+ # Efficient result preparation
353
+ results = []
354
+ single_col_uniqueness = {col: df[col].nunique() for col in cols}
355
+
356
+ # Process results with smart computation limiting
357
+ combinations_tested = 0
358
+ max_combinations_total = min(1000, sum(math.comb(len(cols), i) for i in range(1, max_combination_size + 1)))
359
+
360
+ for size in range(1, max_combination_size + 1):
361
+ for combo in itertools.combinations(cols, size):
362
+ if combinations_tested >= max_combinations_total:
363
+ break
364
+
365
+ if len(combo) == 1:
366
+ unique_count = single_col_uniqueness[combo[0]]
367
+ elif combo in all_keys:
368
+ # For keys, we know they're unique
369
+ unique_count = n_rows
370
+ elif size <= 2: # Only compute for small combinations
371
+ try:
372
+ unique_count = len(df[list(combo)].drop_duplicates())
373
+ except Exception:
374
+ unique_count = min(n_rows, sum(single_col_uniqueness[col] for col in combo) // len(combo))
375
+ else:
376
+ # Estimate for larger combinations
377
+ unique_count = min(n_rows, sum(single_col_uniqueness[col] for col in combo) // len(combo))
378
+
379
+ unique_ratio = unique_count / n_rows if n_rows > 0 else 0
380
+ is_key = combo in all_keys
381
+ is_candidate = combo in candidate_keys
382
+ is_superkey = combo in superkeys
383
+
384
+ key_type = ""
385
+ if is_candidate:
386
+ key_type = "★ Candidate Key"
387
+ elif is_superkey:
388
+ key_type = "⊃ Superkey"
389
+
390
+ results.append((combo, unique_count, unique_ratio, is_key, key_type))
391
+ combinations_tested += 1
392
+
393
+ # Sort efficiently
394
+ results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
395
+ key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
396
+ for c, u, _, _, k in results]
397
+
398
+ # Generate normalized tables
399
+ normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)
400
+
401
+ total_time = time.time() - start_time
402
+ print(f" Total analysis completed in {total_time:.2f}s")
403
+
404
+ return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables
405
+
406
+
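The profiler returns a 7-tuple whose first two entries are display-ready tables (consumed by visualize_profile further below); a minimal sketch, import path assumed:

    import pandas as pd
    from sqlshell.utils.profile_keys import profile_ultra_optimized

    df = pd.DataFrame({"id": [1, 2, 3], "city": ["NY", "NY", "LA"]})
    (fd_results, key_results, n_rows, cols,
     max_combo, max_lhs, norm_tables) = profile_ultra_optimized(df)
    for cols_str, unique_count, ratio_str, key_type in key_results:
        print(cols_str, unique_count, ratio_str, key_type or "(not a key)")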
407
+ def create_stress_test_data(size, n_cols=None, complexity='medium'):
408
+ """
409
+ Create stress test data with different complexity levels.
410
+ """
411
+ random.seed(42)
412
+ np.random.seed(42)
413
+
414
+ if n_cols is None:
415
+ if complexity == 'simple':
416
+ n_cols = min(8, max(4, int(math.log10(size))))
417
+ elif complexity == 'medium':
418
+ n_cols = min(15, max(6, int(math.log10(size) * 1.5)))
419
+ else: # complex
420
+ n_cols = min(25, max(10, int(math.log10(size) * 2)))
421
+
422
+ print(f"Creating stress test data: {size:,} rows × {n_cols} columns ({complexity} complexity)")
423
+
424
+ data = {}
425
+
426
+ # Create ID column (always unique)
427
+ data['id'] = range(1, size + 1)
428
+
429
+ # Create categorical columns with different cardinalities
430
+ if complexity == 'simple':
431
+ cardinalities = [10, 20, 50, min(100, size // 10)]
432
+ elif complexity == 'medium':
433
+ cardinalities = [5, 10, 25, 50, 100, min(200, size // 10), min(500, size // 5)]
434
+ else: # complex
435
+ cardinalities = [3, 5, 10, 20, 50, 100, 200, min(500, size // 10), min(1000, size // 5)]
436
+
437
+ for i in range(1, min(n_cols, len(cardinalities) + 1)):
438
+ card = cardinalities[min(i-1, len(cardinalities)-1)]
439
+ data[f'cat_{i}'] = [f'cat_{i}_val_{j % card}' for j in range(size)]
440
+
441
+ # Add some functional dependencies
442
+ if n_cols > 4:
443
+ # category -> subcategory
444
+ data['category'] = [f'Category_{i % 5}' for i in range(size)]
445
+ data['subcategory'] = [f'Sub_{data["category"][i]}_{i % 3}' for i in range(size)]
446
+
447
+ if n_cols > 6:
448
+ # Create some numeric columns with dependencies
449
+ data['price'] = [random.randint(10, 1000) for _ in range(size)]
450
+ data['tax_rate'] = [0.1 if data['category'][i] == 'Category_0' else 0.15 for i in range(size)]
451
+ data['total_price'] = [int(data['price'][i] * (1 + data['tax_rate'][i])) for i in range(size)]
452
+
453
+ # Fill remaining columns with random data
454
+ remaining_cols = n_cols - len(data)
455
+ for i in range(remaining_cols):
456
+ col_name = f'random_{i}'
457
+ data[col_name] = [random.randint(1, min(1000, size // 2)) for _ in range(size)]
458
+
459
+ return pd.DataFrame(data)
460
+
461
+
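A quick look at what the generator produces (the column mix depends on the requested size and complexity; import path assumed):

    from sqlshell.utils.profile_keys import create_stress_test_data

    df = create_stress_test_data(5_000, complexity="medium")
    # Always-unique 'id', categorical columns of varying cardinality, plus the
    # category/subcategory pair added for wider layouts.
    print(df.shape, list(df.columns))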
462
+ def comprehensive_benchmark():
463
+ """
464
+ Comprehensive benchmark for large dataset performance.
465
+ """
466
+ print("=== COMPREHENSIVE LARGE DATA BENCHMARK ===\n")
467
+
468
+ # Test different dataset sizes and complexities
469
+ test_configs = [
470
+ (1000, 'simple'),
471
+ (5000, 'simple'),
472
+ (10000, 'medium'),
473
+ (50000, 'medium'),
474
+ (100000, 'medium'),
475
+ (500000, 'complex'),
476
+ (1000000, 'complex'),
477
+ ]
478
+
479
+ results = []
480
+
481
+ for size, complexity in test_configs:
482
+ print(f"\n{'='*60}")
483
+ print(f"TESTING: {size:,} rows with {complexity} complexity")
484
+ print('='*60)
485
+
486
+ try:
487
+ # Create test data
488
+ df = create_stress_test_data(size, complexity=complexity)
489
+
490
+ # Test ultra-optimized version
491
+ print("\n⚡ Running ULTRA-OPTIMIZED version...")
492
+ start_time = time.time()
493
+ ultra_results = profile_ultra_optimized(df, max_combination_size=3, max_lhs_size=2)
494
+ ultra_time = time.time() - start_time
495
+
496
+ # Test old optimized version for comparison (only for smaller datasets)
497
+ if size <= 10000:
498
+ print("\n🐌 Running OLD-OPTIMIZED version...")
499
+ start_time = time.time()
500
+ old_results = profile_optimized(df, max_combination_size=3, max_lhs_size=2)
501
+ old_time = time.time() - start_time
502
+ speedup = old_time / ultra_time if ultra_time > 0 else float('inf')
503
+ else:
504
+ print("\n⏭️ Skipping old version (too slow for large data)")
505
+ old_time = None
506
+ speedup = None
507
+
508
+ # Memory usage estimation
509
+ memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
510
+
511
+ results.append({
512
+ 'size': size,
513
+ 'complexity': complexity,
514
+ 'columns': len(df.columns),
515
+ 'memory_mb': memory_mb,
516
+ 'ultra_time': ultra_time,
517
+ 'old_time': old_time,
518
+ 'speedup': speedup,
519
+ 'fds_found': len(ultra_results[0]),
520
+ 'keys_found': len([k for k in ultra_results[1] if "Candidate Key" in k[3]]),
521
+ 'success': True
522
+ })
523
+
524
+ print(f"\n📊 RESULTS:")
525
+ print(f" Dataset: {size:,} rows × {len(df.columns)} cols ({memory_mb:.1f} MB)")
526
+ print(f" Ultra-optimized: {ultra_time:.3f} seconds")
527
+ if old_time:
528
+ print(f" Old optimized: {old_time:.3f} seconds")
529
+ print(f" Speedup: {speedup:.2f}x")
530
+ print(f" Found: {len(ultra_results[0])} FDs, {len([k for k in ultra_results[1] if 'Candidate Key' in k[3]])} keys")
531
+
532
+ # Performance targets
533
+ if ultra_time < 5:
534
+ print(" ✅ Excellent performance")
535
+ elif ultra_time < 15:
536
+ print(" ✅ Good performance")
537
+ elif ultra_time < 60:
538
+ print(" ⚠️ Acceptable performance")
539
+ else:
540
+ print(" ❌ Needs further optimization")
541
+
542
+ except Exception as e:
543
+ print(f" ❌ FAILED: {e}")
544
+ results.append({
545
+ 'size': size,
546
+ 'complexity': complexity,
547
+ 'columns': '?',
548
+ 'memory_mb': 0,
549
+ 'ultra_time': float('inf'),
550
+ 'old_time': None,
551
+ 'speedup': None,
552
+ 'fds_found': 0,
553
+ 'keys_found': 0,
554
+ 'success': False
555
+ })
556
+
557
+ # Print comprehensive summary
558
+ print(f"\n{'='*80}")
559
+ print("COMPREHENSIVE BENCHMARK SUMMARY")
560
+ print('='*80)
561
+ print(f"{'Size':<8} {'Complexity':<10} {'Cols':<5} {'Memory':<8} {'Time':<8} {'Speedup':<8} {'FDs':<4} {'Keys':<4} {'Status'}")
562
+ print("-" * 80)
563
+
564
+ for result in results:
565
+ size = f"{result['size']:,}"
566
+ complexity = result['complexity']
567
+ cols = str(result['columns'])
568
+ memory = f"{result['memory_mb']:.1f}MB"
569
+ time_str = f"{result['ultra_time']:.2f}s" if result['ultra_time'] != float('inf') else "FAIL"
570
+ speedup = f"{result['speedup']:.1f}x" if result['speedup'] else "N/A"
571
+ fds = str(result['fds_found'])
572
+ keys = str(result['keys_found'])
573
+ status = "✅" if result['success'] else "❌"
574
+
575
+ print(f"{size:<8} {complexity:<10} {cols:<5} {memory:<8} {time_str:<8} {speedup:<8} {fds:<4} {keys:<4} {status}")
576
+
577
+ # Performance analysis
578
+ successful_results = [r for r in results if r['success']]
579
+ if successful_results:
580
+ print(f"\n🎯 PERFORMANCE ANALYSIS:")
581
+ print(f" • Successfully processed up to {max(r['size'] for r in successful_results):,} rows")
582
+ print(f" • Average time for datasets under 100K: {np.mean([r['ultra_time'] for r in successful_results if r['size'] < 100000]):.2f}s")
583
+ print(f" • Largest dataset processed: {max(r['memory_mb'] for r in successful_results):.1f} MB")
584
+
585
+ # Speed improvements
586
+ speed_improvements = [r['speedup'] for r in successful_results if r['speedup'] and r['speedup'] != float('inf')]
587
+ if speed_improvements:
588
+ print(f" • Average speedup over old version: {np.mean(speed_improvements):.1f}x")
589
+
590
+ return results
591
+
592
+
593
+ def find_functional_dependencies_optimized(df: pd.DataFrame, max_lhs_size: int = 2):
594
+ """
595
+ Highly optimized functional dependency discovery.
596
+ Main optimizations:
597
+ 1. Early termination for trivial cases
598
+ 2. Efficient groupby operations
599
+ 3. Smart filtering to avoid checking impossible FDs
600
+ """
601
+ fds = []
602
+ cols = list(df.columns)
603
+ n_rows = len(df)
604
+
605
+ if n_rows == 0 or len(cols) < 2:
606
+ return fds
607
+
608
+ # Pre-compute column cardinalities
609
+ col_cardinalities = {col: df[col].nunique() for col in cols}
610
+
611
+ # Skip columns that are unique (they trivially determine everything)
612
+ non_unique_cols = [col for col in cols if col_cardinalities[col] < n_rows]
613
+
614
+ # Cache groupby results to avoid recomputation
615
+ groupby_cache = {}
616
+
617
+ for size in range(1, max_lhs_size + 1):
618
+ # Only consider non-unique columns for LHS
619
+ lhs_candidates = non_unique_cols if size == 1 else cols
620
+
621
+ for lhs in itertools.combinations(lhs_candidates, size):
622
+ lhs_tuple = tuple(lhs)
623
+
624
+ # Use cached groupby if available
625
+ if lhs_tuple in groupby_cache:
626
+ grouped = groupby_cache[lhs_tuple]
627
+ else:
628
+ # Use pandas groupby which is highly optimized
629
+ grouped = df.groupby(list(lhs), sort=False, dropna=False)
630
+ groupby_cache[lhs_tuple] = grouped
631
+
632
+ # Get group info efficiently
633
+ group_info = grouped.size()
634
+ n_groups = len(group_info)
635
+
636
+ # If all groups have size 1, skip (no interesting FDs)
637
+ if group_info.max() == 1:
638
+ continue
639
+
640
+ for rhs in cols:
641
+ if rhs in lhs:
642
+ continue
643
+
644
+ # Remove the overly aggressive early termination that was filtering out valid FDs
645
+ # The original algorithm doesn't have this filter, so we shouldn't either
646
+
647
+ # Check if RHS is functionally determined by LHS
648
+ # Count unique RHS values per group
649
+ try:
650
+ rhs_per_group = grouped[rhs].nunique()
651
+
652
+ # FD holds if every group has at most 1 unique RHS value
653
+ if (rhs_per_group <= 1).all():
654
+ fds.append((lhs, rhs))
655
+ except Exception:
656
+ continue # Skip problematic columns
657
+
658
+ return fds
659
+
660
+
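The core test in both FD implementations boils down to a single pandas idiom, shown standalone here:

    import pandas as pd

    df = pd.DataFrame({"dept": ["a", "a", "b"], "manager": ["X", "X", "Y"]})
    # dept -> manager holds when every dept group has at most one manager value.
    holds = (df.groupby(["dept"], sort=False, dropna=False)["manager"].nunique() <= 1).all()
    print(holds)  # True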
661
+ def find_candidate_keys_optimized(df: pd.DataFrame, max_combination_size: int = 2):
662
+ """
663
+ Highly optimized candidate key discovery.
664
+ Main optimizations:
665
+ 1. Early termination when smaller keys are found
666
+ 2. Efficient uniqueness checking with drop_duplicates
667
+ 3. Smart pruning of superkey candidates
668
+ """
669
+ n_rows = len(df)
670
+ cols = list(df.columns)
671
+
672
+ if n_rows == 0:
673
+ return [], [], []
674
+
675
+ all_keys = []
676
+
677
+ # Check single columns first (most common case)
678
+ single_column_keys = []
679
+ for col in cols:
680
+ if df[col].nunique() == n_rows:
681
+ single_column_keys.append((col,))
682
+ all_keys.append((col,))
683
+
684
+ # If we found single-column keys, we can stop here for many use cases
685
+ # any multi-column combination containing one of them would only be a superkey
686
+ if single_column_keys and max_combination_size == 1:
687
+ return all_keys, single_column_keys, []
688
+
689
+ # For multi-column combinations, use efficient approach
690
+ for size in range(2, max_combination_size + 1):
691
+ size_keys = []
692
+
693
+ for combo in itertools.combinations(cols, size):
694
+ # Skip if any single column in combo is already a key
695
+ if any((col,) in single_column_keys for col in combo):
696
+ continue
697
+
698
+ # Skip if any smaller subset is already a key
699
+ is_superkey = False
700
+ for subset_size in range(1, size):
701
+ for subset in itertools.combinations(combo, subset_size):
702
+ if subset in all_keys:
703
+ is_superkey = True
704
+ break
705
+ if is_superkey:
706
+ break
707
+
708
+ if is_superkey:
709
+ continue
710
+
711
+ # Check uniqueness using efficient drop_duplicates
712
+ if len(df[list(combo)].drop_duplicates()) == n_rows:
713
+ size_keys.append(combo)
714
+ all_keys.append(combo)
715
+
716
+ # If no keys found at this size and we have smaller keys, we can stop
717
+ if not size_keys and all_keys:
718
+ break
719
+
720
+ # Separate candidate keys from superkeys
721
+ candidate_keys = []
722
+ superkeys = []
723
+
724
+ for key in all_keys:
725
+ is_candidate = True
726
+ for other_key in all_keys:
727
+ if len(other_key) < len(key) and set(other_key).issubset(set(key)):
728
+ is_candidate = False
729
+ break
730
+
731
+ if is_candidate:
732
+ candidate_keys.append(key)
733
+ else:
734
+ superkeys.append(key)
735
+
736
+ return all_keys, candidate_keys, superkeys
737
+
738
+
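Likewise, the uniqueness check behind key discovery is a single projection-and-dedup test:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "y", "x"]})
    print(len(df[["a"]].drop_duplicates()) == len(df))       # False: 'a' repeats
    print(len(df[["a", "b"]].drop_duplicates()) == len(df))  # True: (a, b) is a key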
739
+ def profile_optimized(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
740
+ """
741
+ Highly optimized profile function.
742
+ Main optimizations:
743
+ 1. Reduced redundant computations
744
+ 2. Early termination strategies
745
+ 3. Efficient pandas operations
746
+ """
747
+ n_rows = len(df)
748
+ cols = list(df.columns)
749
+
750
+ # Use optimized algorithms
751
+ fds = find_functional_dependencies_optimized(df, max_lhs_size)
752
+ fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]
753
+
754
+ all_keys, candidate_keys, superkeys = find_candidate_keys_optimized(df, max_combination_size)
755
+
756
+ # Prepare results efficiently
757
+ results = []
758
+
759
+ # Pre-compute uniqueness for single columns
760
+ single_col_uniqueness = {col: df[col].nunique() for col in cols}
761
+
762
+ for size in range(1, max_combination_size + 1):
763
+ for combo in itertools.combinations(cols, size):
764
+ if len(combo) == 1:
765
+ unique_count = single_col_uniqueness[combo[0]]
766
+ else:
767
+ # Only compute for combinations we need
768
+ if combo in all_keys or size <= 2: # Always compute for size 1,2
769
+ unique_count = len(df[list(combo)].drop_duplicates())
770
+ else:
771
+ # For larger non-keys, we can estimate or skip
772
+ unique_count = min(n_rows,
773
+ sum(single_col_uniqueness[col] for col in combo) // len(combo))
774
+
775
+ unique_ratio = unique_count / n_rows if n_rows > 0 else 0
776
+ is_key = combo in all_keys
777
+ is_candidate = combo in candidate_keys
778
+ is_superkey = combo in superkeys
779
+
780
+ key_type = ""
781
+ if is_candidate:
782
+ key_type = "★ Candidate Key"
783
+ elif is_superkey:
784
+ key_type = "⊃ Superkey"
785
+
786
+ results.append((combo, unique_count, unique_ratio, is_key, key_type))
787
+
788
+ results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
789
+ key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
790
+ for c, u, _, _, k in results]
791
+
792
+ normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)
793
+
794
+ return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables
795
+
796
+
797
+ def propose_normalized_tables(cols, candidate_keys, fds):
798
+ """
799
+ Propose a set of normalized tables based on functional dependencies.
800
+ Uses a simplified approach to create 3NF tables.
801
+
802
+ Parameters:
803
+ - cols: list of all columns
804
+ - candidate_keys: list of candidate keys
805
+ - fds: list of functional dependencies as (lhs, rhs) tuples
806
+
807
+ Returns:
808
+ - List of proposed tables as (table_name, primary_key, attributes) tuples
809
+ """
810
+ # Start with a set of all attributes
811
+ all_attrs = set(cols)
812
+ proposed_tables = []
813
+
814
+ # Group FDs by their determinants (LHS)
815
+ determinant_groups = {}
816
+ for lhs, rhs in fds:
817
+ lhs_key = tuple(sorted(lhs))
818
+ if lhs_key not in determinant_groups:
819
+ determinant_groups[lhs_key] = []
820
+ determinant_groups[lhs_key].append(rhs)
821
+
822
+ # Create tables for each determinant group
823
+ table_counter = 1
824
+ for lhs, rhs_list in determinant_groups.items():
825
+ table_attrs = set(lhs) | set(rhs_list)
826
+ if table_attrs: # Skip empty tables
827
+ table_name = f"Table_{table_counter}"
828
+ primary_key = ", ".join(lhs)
829
+ attributes = list(table_attrs)
830
+ proposed_tables.append((table_name, primary_key, attributes))
831
+ table_counter += 1
832
+
833
+ # Create a table for any remaining attributes not in any FD
834
+ # or create a table with a candidate key if none exists yet
835
+ used_attrs = set()
836
+ for _, _, attrs in proposed_tables:
837
+ used_attrs.update(attrs)
838
+
839
+ remaining_attrs = all_attrs - used_attrs
840
+ if remaining_attrs:
841
+ # If we have a candidate key, use it for remaining attributes
842
+ for key in candidate_keys:
843
+ key_set = set(key)
844
+ if key_set & remaining_attrs: # If key has overlap with remaining attrs
845
+ table_name = f"Table_{table_counter}"
846
+ primary_key = ", ".join(key)
847
+ attributes = list(remaining_attrs | key_set)
848
+ proposed_tables.append((table_name, primary_key, attributes))
849
+ break
850
+ else: # No suitable candidate key
851
+ table_name = f"Table_{table_counter}"
852
+ primary_key = "id (suggested)"
853
+ attributes = list(remaining_attrs)
854
+ proposed_tables.append((table_name, primary_key, attributes))
855
+
856
+ return proposed_tables
857
+
858
+
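A hand-worked example of the decomposition above (illustrative; the column names are made up):

    from sqlshell.utils.profile_keys import propose_normalized_tables

    cols = ["order_id", "customer_id", "customer_name", "amount"]
    fds = [(("customer_id",), "customer_name")]
    candidate_keys = [("order_id",)]
    for name, pk, attrs in propose_normalized_tables(cols, candidate_keys, fds):
        print(name, "| PK:", pk, "|", sorted(attrs))
    # Roughly: one table keyed on customer_id holding customer_name, and a
    # second keyed on order_id holding the remaining order attributes.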
859
+ # Keep the original functions for comparison
860
+ def find_functional_dependencies(df: pd.DataFrame, max_lhs_size: int = 2):
861
+ """
862
+ Original functional dependency discovery function (for comparison).
863
+ """
864
+ fds = []
865
+ cols = list(df.columns)
866
+ n_rows = len(df)
867
+
868
+ for size in range(1, max_lhs_size + 1):
869
+ for lhs in itertools.combinations(cols, size):
870
+ # for each potential dependent attribute not in lhs
871
+ lhs_df = df[list(lhs)]
872
+ # group by lhs and count distinct values of each other column
873
+ grouped = df.groupby(list(lhs))
874
+ for rhs in cols:
875
+ if rhs in lhs:
876
+ continue
877
+ # Check if for each group, rhs has only one distinct value
878
+ distinct_counts = grouped[rhs].nunique(dropna=False)
879
+ if (distinct_counts <= 1).all():
880
+ fds.append((lhs, rhs))
881
+ return fds
882
+
883
+
884
+ def profile_original(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
885
+ """
886
+ Original profile function (for comparison).
887
+ """
888
+ n_rows = len(df)
889
+ cols = list(df.columns)
890
+
891
+ # Discover functional dependencies
892
+ fds = find_functional_dependencies(df, max_lhs_size)
893
+
894
+ # Prepare FD results
895
+ fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]
896
+
897
+ # Profile keys (by uniqueness)
898
+ all_keys = []
899
+ for size in range(1, max_combination_size + 1):
900
+ for combo in itertools.combinations(cols, size):
901
+ unique_count = df.drop_duplicates(subset=combo).shape[0]
902
+ unique_ratio = unique_count / n_rows
903
+ is_key = unique_count == n_rows
904
+ if is_key:
905
+ all_keys.append(combo)
906
+
907
+ # Distinguish between candidate keys and superkeys
908
+ candidate_keys = []
909
+ superkeys = []
910
+
911
+ for key in all_keys:
912
+ is_candidate = True
913
+ # Check if any proper subset of this key is also a key
914
+ for i in range(1, len(key)):
915
+ for subset in itertools.combinations(key, i):
916
+ if subset in all_keys:
917
+ is_candidate = False
918
+ break
919
+ if not is_candidate:
920
+ break
921
+
922
+ if is_candidate:
923
+ candidate_keys.append(key)
924
+ else:
925
+ superkeys.append(key)
926
+
927
+ # Prepare results for all keys (both candidate keys and superkeys)
928
+ results = []
929
+ for size in range(1, max_combination_size + 1):
930
+ for combo in itertools.combinations(cols, size):
931
+ unique_count = df.drop_duplicates(subset=combo).shape[0]
932
+ unique_ratio = unique_count / n_rows
933
+ is_key = combo in all_keys
934
+ is_candidate = combo in candidate_keys
935
+ is_superkey = combo in superkeys
936
+
937
+ # Use icons for different key types
938
+ key_type = ""
939
+ if is_candidate:
940
+ key_type = "★ Candidate Key" # Star for candidate keys
941
+ elif is_superkey:
942
+ key_type = "⊃ Superkey" # Superset symbol for superkeys
943
+
944
+ results.append((combo, unique_count, unique_ratio, is_key, key_type))
945
+
946
+ results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
947
+ key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
948
+ for c, u, _, _, k in results]
949
+
950
+ # Propose normalized tables
951
+ normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)
952
+
953
+ return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables
954
+
955
+
956
+ # Update the main profile function to use the optimized version
957
+ def profile(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
958
+ """
959
+ Analyze a pandas DataFrame to suggest candidate keys and discover functional dependencies.
960
+ Automatically selects the best optimization level based on dataset size and characteristics.
961
+
962
+ Parameters:
963
+ - df: pandas.DataFrame to analyze.
964
+ - max_combination_size: max size of column combos to test for keys.
965
+ - max_lhs_size: max size of LHS in discovered FDs.
966
+
967
+ Returns:
968
+ - Tuple of (fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables)
969
+ """
970
+ n_rows, n_cols = len(df), len(df.columns)
971
+
972
+ # Choose optimization level based on dataset characteristics
973
+ if n_cols > 50:
974
+ # High-column datasets get special treatment regardless of row count
975
+ print("🏗️ Using HIGH-COLUMN-OPTIMIZED mode for wide dataset")
976
+ return profile_high_column_optimized(df, max_combination_size, max_lhs_size)
977
+ elif n_rows > 500000 or (n_rows > 100000 and n_cols > 15):
978
+ print("🚀 Using HYPER-OPTIMIZED mode for very large dataset")
979
+ return profile_hyper_optimized(df, max_combination_size, max_lhs_size)
980
+ elif n_rows > 10000 or n_cols > 10:
981
+ print("⚡ Using ULTRA-OPTIMIZED mode for large dataset")
982
+ return profile_ultra_optimized(df, max_combination_size, max_lhs_size)
983
+ else:
984
+ print("🔍 Using STANDARD-OPTIMIZED mode for small dataset")
985
+ return profile_optimized(df, max_combination_size, max_lhs_size)
986
+
987
+
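A small sketch of the dispatch thresholds above (shapes are illustrative; import path assumed):

    import numpy as np
    import pandas as pd
    from sqlshell.utils.profile_keys import profile

    small = pd.DataFrame({"id": range(500), "grp": np.arange(500) % 9})
    profile(small)  # 500 rows x 2 cols -> STANDARD-OPTIMIZED branch
    # > 50 columns would hit the high-column branch; > 500K rows (or > 100K rows
    # with > 15 columns) the hyper-optimized branch; other large frames the
    # ultra-optimized branch.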
988
+ def visualize_profile(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
989
+ """
990
+ Create a visual representation of the key profile for a dataframe.
991
+
992
+ Parameters:
993
+ - df: pandas.DataFrame to analyze.
994
+ - max_combination_size: max size of column combos to test for keys.
995
+ - max_lhs_size: max size of LHS in discovered FDs.
996
+
997
+ Returns:
998
+ - QMainWindow: The visualization window
999
+ """
1000
+ # Get profile results
1001
+ fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables = profile(
1002
+ df, max_combination_size, max_lhs_size
1003
+ )
1004
+
1005
+ # Create main window
1006
+ window = QMainWindow()
1007
+ window.setWindowTitle("Table Profile: Keys & Dependencies")
1008
+ window.resize(900, 700)
1009
+
1010
+ # Create central widget and layout
1011
+ central_widget = QWidget()
1012
+ window.setCentralWidget(central_widget)
1013
+ layout = QVBoxLayout(central_widget)
1014
+
1015
+ # Add header
1016
+ header = QLabel(f"Analyzed {n_rows} rows × {len(cols)} columns; key combos up to size {max_combination_size}, FDs up to LHS size {max_lhs_size}")
1017
+ header.setAlignment(Qt.AlignmentFlag.AlignCenter)
1018
+ header.setStyleSheet("font-size: 14pt; font-weight: bold; margin: 10px;")
1019
+ layout.addWidget(header)
1020
+
1021
+ # Add description
1022
+ description = QLabel(
1023
+ "This profile helps identify candidate keys and functional dependencies in your data. "
1024
+ "★ Candidate keys are minimal combinations of columns that uniquely identify rows. "
1025
+ "⊃ Superkeys are non-minimal column sets that uniquely identify rows. "
1026
+ "Functional dependencies indicate when one column's values determine another's."
1027
+ )
1028
+ description.setAlignment(Qt.AlignmentFlag.AlignCenter)
1029
+ description.setWordWrap(True)
1030
+ description.setStyleSheet("margin-bottom: 10px;")
1031
+ layout.addWidget(description)
1032
+
1033
+ # Add key for icons
1034
+ icons_key = QLabel("Key: ★ = Minimal Candidate Key | ⊃ = Non-minimal Superkey")
1035
+ icons_key.setAlignment(Qt.AlignmentFlag.AlignCenter)
1036
+ icons_key.setStyleSheet("font-style: italic; margin-bottom: 15px;")
1037
+ layout.addWidget(icons_key)
1038
+
1039
+ # Create tabs
1040
+ tabs = QTabWidget()
1041
+
1042
+ # Tab for Candidate Keys
1043
+ key_tab = QWidget()
1044
+ key_layout = QVBoxLayout()
1045
+
1046
+ key_header = QLabel("Keys (Column Combinations that Uniquely Identify Rows)")
1047
+ key_header.setStyleSheet("font-weight: bold;")
1048
+ key_layout.addWidget(key_header)
1049
+
1050
+ key_table = QTableWidget(len(key_results), 4)
1051
+ key_table.setHorizontalHeaderLabels(["Columns", "Unique Count", "Uniqueness Ratio", "Key Type"])
1052
+ key_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
1053
+ for row, (cols_str, count, ratio, key_type) in enumerate(key_results):
1054
+ key_table.setItem(row, 0, QTableWidgetItem(cols_str))
1055
+ key_table.setItem(row, 1, QTableWidgetItem(str(count)))
1056
+ key_table.setItem(row, 2, QTableWidgetItem(ratio))
1057
+
1058
+ # Create item with appropriate styling
1059
+ type_item = QTableWidgetItem(key_type)
1060
+ if "Candidate Key" in key_type:
1061
+ type_item.setForeground(Qt.GlobalColor.darkGreen)
1062
+ elif "Superkey" in key_type:
1063
+ type_item.setForeground(Qt.GlobalColor.darkBlue)
1064
+ key_table.setItem(row, 3, type_item)
1065
+
1066
+ key_layout.addWidget(key_table)
1067
+ key_tab.setLayout(key_layout)
1068
+ tabs.addTab(key_tab, "Keys")
1069
+
1070
+ # Tab for FDs
1071
+ fd_tab = QWidget()
1072
+ fd_layout = QVBoxLayout()
1073
+
1074
+ fd_header = QLabel("Functional Dependencies (When Values in One Set of Columns Determine Another Column)")
1075
+ fd_header.setStyleSheet("font-weight: bold;")
1076
+ fd_layout.addWidget(fd_header)
1077
+
1078
+ fd_table = QTableWidget(len(fd_results), 2)
1079
+ fd_table.setHorizontalHeaderLabels(["Determinant (LHS)", "Dependent (RHS)"])
1080
+ fd_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
1081
+ for i, (lhs, rhs) in enumerate(fd_results):
1082
+ lhs_item = QTableWidgetItem(lhs)
1083
+ lhs_item.setFlags(lhs_item.flags() ^ Qt.ItemFlag.ItemIsEditable)
1084
+ fd_table.setItem(i, 0, lhs_item)
1085
+ fd_table.setItem(i, 1, QTableWidgetItem(rhs))
1086
+ fd_layout.addWidget(fd_table)
1087
+ fd_tab.setLayout(fd_layout)
1088
+ tabs.addTab(fd_tab, "Functional Dependencies")
1089
+
1090
+ # Tab for Normalized Tables
1091
+ norm_tab = QWidget()
1092
+ norm_layout = QVBoxLayout()
1093
+
1094
+ norm_header = QLabel("Proposed Normalized Tables (Based on Functional Dependencies)")
1095
+ norm_header.setStyleSheet("font-weight: bold;")
1096
+ norm_layout.addWidget(norm_header)
1097
+
1098
+ norm_description = QLabel(
1099
+ "These tables represent a proposed normalized schema based on the discovered functional dependencies. "
1100
+ "Each table includes attributes that are functionally dependent on its primary key. "
1101
+ "This is an approximate 3NF decomposition and may need further refinement."
1102
+ )
1103
+ norm_description.setWordWrap(True)
1104
+ norm_description.setStyleSheet("margin-bottom: 10px;")
1105
+ norm_layout.addWidget(norm_description)
1106
+
1107
+ norm_table = QTableWidget(len(normalized_tables), 3)
1108
+ norm_table.setHorizontalHeaderLabels(["Table Name", "Primary Key", "Attributes"])
1109
+ norm_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
1110
+ for i, (table_name, primary_key, attributes) in enumerate(normalized_tables):
1111
+ norm_table.setItem(i, 0, QTableWidgetItem(table_name))
1112
+
1113
+ pk_item = QTableWidgetItem(primary_key)
1114
+ pk_item.setForeground(Qt.GlobalColor.darkGreen)
1115
+ norm_table.setItem(i, 1, pk_item)
1116
+
1117
+ norm_table.setItem(i, 2, QTableWidgetItem(", ".join(attributes)))
1118
+
1119
+ norm_layout.addWidget(norm_table)
1120
+ norm_tab.setLayout(norm_layout)
1121
+ tabs.addTab(norm_tab, "Normalized Tables")
1122
+
1123
+ layout.addWidget(tabs)
1124
+
1125
+ # Show the window
1126
+ window.show()
1127
+ return window
1128
+
1129
+
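Opening the window requires a running Qt event loop; test_profile_keys below does essentially this, and a trimmed-down sketch looks like:

    import sys
    import pandas as pd
    from PyQt6.QtWidgets import QApplication
    from sqlshell.utils.profile_keys import visualize_profile

    app = QApplication(sys.argv)
    df = pd.DataFrame({"id": [1, 2, 3], "city": ["NY", "NY", "LA"]})
    window = visualize_profile(df, max_combination_size=2, max_lhs_size=2)
    sys.exit(app.exec())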
1130
+ def benchmark_performance():
1131
+ """
1132
+ Benchmark the performance improvements of the optimized version.
1133
+ """
1134
+ print("=== PROFILE KEYS PERFORMANCE BENCHMARK ===\n")
1135
+
1136
+ # Create realistic test datasets of varying sizes
1137
+ test_sizes = [100, 500, 1000, 2000]
1138
+ results = []
1139
+
1140
+ for size in test_sizes:
1141
+ print(f"Testing with {size} rows...")
1142
+
1143
+ # Create realistic test data
1144
+ df = create_realistic_test_data(size)
1145
+
1146
+ # Benchmark original version
1147
+ start_time = time.time()
1148
+ try:
1149
+ original_results = profile_original(df, max_combination_size=3, max_lhs_size=2)
1150
+ original_time = time.time() - start_time
1151
+ original_success = True
1152
+ except Exception as e:
1153
+ original_time = float('inf')
1154
+ original_success = False
1155
+ print(f" Original version failed: {e}")
1156
+
1157
+ # Benchmark optimized version
1158
+ start_time = time.time()
1159
+ try:
1160
+ optimized_results = profile_optimized(df, max_combination_size=3, max_lhs_size=2)
1161
+ optimized_time = time.time() - start_time
1162
+ optimized_success = True
1163
+ except Exception as e:
1164
+ optimized_time = float('inf')
1165
+ optimized_success = False
1166
+ print(f" Optimized version failed: {e}")
1167
+
1168
+ # Verify results are consistent (if both succeeded)
1169
+ consistent = True
1170
+ if original_success and optimized_success:
1171
+ # Compare functional dependencies
1172
+ orig_fds = set(original_results[0])
1173
+ opt_fds = set(optimized_results[0])
1174
+
1175
+ # Compare key findings (just the key type counts)
1176
+ orig_key_types = [result[3] for result in original_results[1]]
1177
+ opt_key_types = [result[3] for result in optimized_results[1]]
1178
+
1179
+ if orig_fds != opt_fds or orig_key_types != opt_key_types:
1180
+ consistent = False
1181
+ print(f" WARNING: Results differ between versions!")
1182
+
1183
+ # Calculate speedup
1184
+ if original_time > 0 and optimized_time > 0:
1185
+ speedup = original_time / optimized_time
1186
+ else:
1187
+ speedup = float('inf') if optimized_time > 0 else 0
1188
+
1189
+ results.append({
1190
+ 'size': size,
1191
+ 'original_time': original_time,
1192
+ 'optimized_time': optimized_time,
1193
+ 'speedup': speedup,
1194
+ 'consistent': consistent,
1195
+ 'original_success': original_success,
1196
+ 'optimized_success': optimized_success
1197
+ })
1198
+
1199
+ print(f" Original: {original_time:.3f}s")
1200
+ print(f" Optimized: {optimized_time:.3f}s")
1201
+ if speedup != float('inf'):
1202
+ print(f" Speedup: {speedup:.2f}x")
1203
+ print(f" Results consistent: {consistent}")
1204
+ print()
1205
+
1206
+ # Print summary
1207
+ print("=== BENCHMARK SUMMARY ===")
1208
+ print(f"{'Size':<6} {'Original':<10} {'Optimized':<10} {'Speedup':<8} {'Consistent'}")
1209
+ print("-" * 50)
1210
+
1211
+ for result in results:
1212
+ size = result['size']
1213
+ orig_time = f"{result['original_time']:.3f}s" if result['original_success'] else "FAILED"
1214
+ opt_time = f"{result['optimized_time']:.3f}s" if result['optimized_success'] else "FAILED"
1215
+ speedup = f"{result['speedup']:.2f}x" if result['speedup'] != float('inf') else "∞"
1216
+ consistent = "✓" if result['consistent'] else "✗"
1217
+
1218
+ print(f"{size:<6} {orig_time:<10} {opt_time:<10} {speedup:<8} {consistent}")
1219
+
1220
+ # Calculate average speedup for successful runs
1221
+ successful_speedups = [r['speedup'] for r in results if r['speedup'] != float('inf') and r['speedup'] > 0]
1222
+ if successful_speedups:
1223
+ avg_speedup = sum(successful_speedups) / len(successful_speedups)
1224
+ print(f"\nAverage speedup: {avg_speedup:.2f}x")
1225
+
1226
+ return results
1227
+
1228
+
1229
+ def create_realistic_test_data(size):
1230
+ """
1231
+ Create realistic test data for benchmarking with known functional dependencies.
1232
+ """
1233
+ random.seed(42) # For reproducibility
1234
+ np.random.seed(42)
1235
+
1236
+ # Create realistic customer-order-product scenario
1237
+ n_customers = min(size // 10, 100) # 10% unique customers, max 100
1238
+ n_products = min(size // 20, 50) # 5% unique products, max 50
1239
+ n_orders = min(size // 5, 200) # 20% unique orders, max 200
1240
+
1241
+ customer_ids = list(range(1, n_customers + 1))
1242
+ customer_names = [f"Customer_{i}" for i in customer_ids]
1243
+ customer_cities = [f"City_{i % 10}" for i in customer_ids] # 10 cities
1244
+
1245
+ product_ids = list(range(1001, 1001 + n_products))
1246
+ product_names = [f"Product_{i}" for i in product_ids]
1247
+ product_categories = [f"Category_{i % 5}" for i in range(n_products)] # 5 categories
1248
+
1249
+ order_ids = list(range(10001, 10001 + n_orders))
1250
+
1251
+ # Generate order line items
1252
+ data = []
1253
+ for i in range(size):
1254
+ customer_id = random.choice(customer_ids)
1255
+ customer_idx = customer_id - 1
1256
+ order_id = random.choice(order_ids)
1257
+ product_id = random.choice(product_ids)
1258
+ product_idx = product_id - 1001
1259
+
1260
+ data.append({
1261
+ 'order_line_id': 100001 + i, # Unique for each row
1262
+ 'customer_id': customer_id,
1263
+ 'customer_name': customer_names[customer_idx], # FD: customer_id -> customer_name
1264
+ 'customer_city': customer_cities[customer_idx], # FD: customer_id -> customer_city
1265
+ 'order_id': order_id,
1266
+ 'product_id': product_id,
1267
+ 'product_name': product_names[product_idx], # FD: product_id -> product_name
1268
+ 'product_category': product_categories[product_idx], # FD: product_id -> product_category
1269
+ 'quantity': random.randint(1, 10),
1270
+ 'unit_price': random.randint(10, 100),
1271
+ 'total_price': 0 # Will be calculated
1272
+ })
1273
+
1274
+ # Calculate total price (FD: quantity, unit_price -> total_price)
1275
+ data[-1]['total_price'] = data[-1]['quantity'] * data[-1]['unit_price']
1276
+
1277
+ df = pd.DataFrame(data)
1278
+
1279
+ # Add some duplicate rows to make it more realistic
1280
+ if size > 100:
1281
+ n_duplicates = size // 20 # 5% duplicates
1282
+ duplicate_indices = np.random.choice(len(df), n_duplicates, replace=True)
1283
+ duplicate_rows = df.iloc[duplicate_indices].copy()
1284
+ duplicate_rows['order_line_id'] = range(200001, 200001 + len(duplicate_rows))
1285
+ df = pd.concat([df, duplicate_rows], ignore_index=True)
1286
+
1287
+ return df
1288
+
1289
+
1290
+ def test_realistic_scenario():
1291
+ """
1292
+ Test the optimized version with a realistic scenario and verify expected results.
1293
+ """
1294
+ print("=== REALISTIC SCENARIO TEST ===\n")
1295
+
1296
+ # Create test data with known structure
1297
+ df = create_realistic_test_data(500)
1298
+
1299
+ print(f"Created test dataset with {len(df)} rows and {len(df.columns)} columns")
1300
+ print("Expected functional dependencies:")
1301
+ print(" - customer_id -> customer_name")
1302
+ print(" - customer_id -> customer_city")
1303
+ print(" - product_id -> product_name")
1304
+ print(" - product_id -> product_category")
1305
+ print(" - (quantity, unit_price) -> total_price")
1306
+ print()
1307
+
1308
+ # Run analysis
1309
+ start_time = time.time()
1310
+ fd_results, key_results, n_rows, cols, max_combo, max_lhs, norm_tables = profile_optimized(
1311
+ df, max_combination_size=3, max_lhs_size=2
1312
+ )
1313
+ analysis_time = time.time() - start_time
1314
+
1315
+ print(f"Analysis completed in {analysis_time:.3f} seconds")
1316
+ print()
1317
+
1318
+ # Display results
1319
+ print("Discovered Functional Dependencies:")
1320
+ if fd_results:
1321
+ for lhs, rhs in fd_results:
1322
+ print(f" {lhs} -> {rhs}")
1323
+ else:
1324
+ print(" None found")
1325
+ print()
1326
+
1327
+ print("Candidate Keys Found:")
1328
+ candidate_keys = [result for result in key_results if "Candidate Key" in result[3]]
1329
+ if candidate_keys:
1330
+ for cols_str, count, ratio, key_type in candidate_keys:
1331
+ print(f" {cols_str} ({ratio} unique)")
1332
+ else:
1333
+ print(" None found")
1334
+ print()
1335
+
1336
+ print("Proposed Normalized Tables:")
1337
+ for i, (table_name, pk, attrs) in enumerate(norm_tables, 1):
1338
+ print(f" {table_name}: PK({pk}) -> {', '.join(attrs)}")
1339
+
1340
+ # Verify expected results
1341
+ print("\n=== VERIFICATION ===")
1342
+ expected_fds = [
1343
+ "customer_id -> customer_name",
1344
+ "customer_id -> customer_city",
1345
+ "product_id -> product_name",
1346
+ "product_id -> product_category"
1347
+ ]
1348
+
1349
+ found_fds = [f"{lhs} -> {rhs}" for lhs, rhs in fd_results]
1350
+
1351
+ print("Expected FDs found:")
1352
+ for expected in expected_fds:
1353
+ found = expected in found_fds
1354
+ status = "✓" if found else "✗"
1355
+ print(f" {status} {expected}")
1356
+
1357
+ # Check for unexpected FDs
1358
+ unexpected_fds = [fd for fd in found_fds if fd not in expected_fds]
1359
+ if unexpected_fds:
1360
+ print("\nUnexpected FDs found:")
1361
+ for fd in unexpected_fds:
1362
+ print(f" {fd}")
1363
+
1364
+ print(f"\nCandidate key found: {'✓' if candidate_keys else '✗'}")
1365
+
1366
+
1367
+ def test_profile_keys(test_size=100):
1368
+ # Generate a dataframe with some realistic examples of a customer-product-order relationship
1369
+ # Create customer data
1370
+ customer_ids = list(range(1, 21)) # 20 customers
1371
+ customer_names = ["John", "Jane", "Alice", "Bob", "Charlie", "Diana", "Edward", "Fiona", "George", "Hannah"]
1372
+
1373
+ # Create product data
1374
+ product_names = ["Apple", "Banana", "Orange", "Grape", "Mango", "Strawberry", "Blueberry", "Kiwi", "Pineapple", "Watermelon"]
1375
+ product_groups = ["Fruit"] * len(product_names)
1376
+
1377
+ # Generate random orders
1378
+ random.seed(42) # For reproducibility
1379
+ df_data = {
1380
+ "customer_id": [random.choice(customer_ids) for _ in range(test_size)],
1381
+ "customer_name": [customer_names[i % len(customer_names)] for i in range(test_size)],
1382
+ "product_name": [random.choice(product_names) for _ in range(test_size)],
1383
+ "product_group": ["Fruit" for _ in range(test_size)],
1384
+ "order_date": [pd.Timestamp("2021-01-01") + pd.Timedelta(days=random.randint(0, 30)) for _ in range(test_size)],
1385
+ "order_amount": [random.randint(100, 1000) for _ in range(test_size)]
1386
+ }
1387
+
1388
+ # Ensure consistent relationships
1389
+ for i in range(test_size):
1390
+ # Ensure customer_name is consistently associated with customer_id
1391
+ customer_idx = df_data["customer_id"][i] % len(customer_names)
1392
+ df_data["customer_name"][i] = customer_names[customer_idx]
1393
+
1394
+ df = pd.DataFrame(df_data)
1395
+
1396
+ # Create and show visualization
1397
+ app = QApplication(sys.argv)
1398
+ window = visualize_profile(df, max_combination_size=3, max_lhs_size=2)
1399
+ sys.exit(app.exec())
1400
+
1401
+
1402
+ def demo_performance_improvements():
1403
+ """
1404
+ Simple demonstration of the performance improvements.
1405
+ """
1406
+ print("=== PROFILE KEYS PERFORMANCE DEMO ===\n")
1407
+
1408
+ # Create a moderately complex dataset
1409
+ df = create_realistic_test_data(1000)
1410
+ print(f"Testing with dataset: {len(df)} rows × {len(df.columns)} columns")
1411
+
1412
+ # Test original version
1413
+ print("\n🐌 Running ORIGINAL version...")
1414
+ start_time = time.time()
1415
+ original_results = profile_original(df, max_combination_size=3, max_lhs_size=2)
1416
+ original_time = time.time() - start_time
1417
+
1418
+ # Test optimized version
1419
+ print("⚡ Running OPTIMIZED version...")
1420
+ start_time = time.time()
1421
+ optimized_results = profile_optimized(df, max_combination_size=3, max_lhs_size=2)
1422
+ optimized_time = time.time() - start_time
1423
+
1424
+ # Show results
1425
+ speedup = original_time / optimized_time
1426
+ print(f"\n📊 RESULTS:")
1427
+ print(f" Original time: {original_time:.3f} seconds")
1428
+ print(f" Optimized time: {optimized_time:.3f} seconds")
1429
+ print(f" Speedup: {speedup:.2f}x faster!")
1430
+
1431
+ # Show discovered insights
1432
+ orig_fds, orig_keys = original_results[0], original_results[1]
1433
+ opt_fds, opt_keys = optimized_results[0], optimized_results[1]
1434
+
1435
+ print(f"\n🔍 FUNCTIONAL DEPENDENCIES FOUND:")
1436
+ print(f" Original: {len(orig_fds)} dependencies")
1437
+ print(f" Optimized: {len(opt_fds)} dependencies")
1438
+
1439
+ candidate_keys_orig = [k for k in orig_keys if "Candidate Key" in k[3]]
1440
+ candidate_keys_opt = [k for k in opt_keys if "Candidate Key" in k[3]]
1441
+
1442
+ print(f"\n🔑 CANDIDATE KEYS FOUND:")
1443
+ print(f" Original: {len(candidate_keys_orig)} keys")
1444
+ print(f" Optimized: {len(candidate_keys_opt)} keys")
1445
+
1446
+ if candidate_keys_opt:
1447
+ print("\n Key(s) discovered:")
1448
+ for cols, count, ratio, key_type in candidate_keys_opt:
1449
+ print(f" • {cols} ({ratio} unique)")
1450
+
1451
+ print(f"\n🎯 Key improvements:")
1452
+ print(f" • Eliminated redundant computations")
1453
+ print(f" • Added smart early termination")
1454
+ print(f" • Optimized pandas operations")
1455
+ print(f" • Better caching strategies")
1456
+ print(f" • Filtered trivial dependencies")
1457
+
1458
+
1459
+ def test_big_data_scenario():
1460
+ """
1461
+ Test with a realistic big data scenario.
1462
+ """
1463
+ print("=== BIG DATA SCENARIO TEST ===\n")
1464
+
1465
+ # Create a 1M row dataset similar to real-world scenarios
1466
+ df = create_stress_test_data(1000000, complexity='complex')
1467
+
1468
+ print(f"Created big data test with {len(df):,} rows and {len(df.columns)} columns")
1469
+ print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")
1470
+
1471
+ # Test the ultra-optimized version
1472
+ print("\n⚡ Running ultra-optimized analysis...")
1473
+ start_time = time.time()
1474
+
1475
+ try:
1476
+ fd_results, key_results, n_rows, cols, max_combo, max_lhs, norm_tables = profile_ultra_optimized(
1477
+ df, max_combination_size=3, max_lhs_size=2
1478
+ )
1479
+ analysis_time = time.time() - start_time
1480
+
1481
+ print(f"\n✅ SUCCESS! Analysis completed in {analysis_time:.2f} seconds")
1482
+ print(f" • Processed {n_rows:,} rows")
1483
+ print(f" • Found {len(fd_results)} functional dependencies")
1484
+ print(f" • Found {len([k for k in key_results if 'Candidate Key' in k[3]])} candidate keys")
1485
+ print(f" • Proposed {len(norm_tables)} normalized tables")
1486
+
1487
+ if fd_results:
1488
+ print(f"\n🔍 Sample functional dependencies:")
1489
+ for i, (lhs, rhs) in enumerate(fd_results[:5]):
1490
+ print(f" • {lhs} → {rhs}")
1491
+ if len(fd_results) > 5:
1492
+ print(f" ... and {len(fd_results) - 5} more")
1493
+
1494
+ candidate_keys = [k for k in key_results if "Candidate Key" in k[3]]
1495
+ if candidate_keys:
1496
+ print(f"\n🔑 Candidate keys found:")
1497
+ for cols_str, count, ratio, key_type in candidate_keys:
1498
+ print(f" • {cols_str} ({ratio} unique)")
1499
+
1500
+ # Performance assessment
1501
+ rows_per_second = n_rows / analysis_time
1502
+ print(f"\n📈 Performance metrics:")
1503
+ print(f" • Processing rate: {rows_per_second:,.0f} rows/second")
1504
+ print(f" • Memory efficiency: {df.memory_usage(deep=True).sum() / 1024 / 1024 / analysis_time:.1f} MB/second")
1505
+
1506
+ if analysis_time < 30:
1507
+ print(" ✅ Excellent performance for big data!")
1508
+ elif analysis_time < 120:
1509
+ print(" ✅ Good performance for big data")
1510
+ else:
1511
+ print(" ⚠️ Acceptable but could be improved")
1512
+
1513
+ except Exception as e:
1514
+ print(f"❌ FAILED: {str(e)}")
1515
+ import traceback
1516
+ traceback.print_exc()
1517
+
1518
+
1519
+ def find_functional_dependencies_hyper_optimized(df: pd.DataFrame, max_lhs_size: int = 2):
1520
+ """
1521
+ Hyper-optimized functional dependency discovery for very large datasets.
1522
+ Uses more aggressive sampling and limits but tries to maintain accuracy.
1523
+ """
1524
+ n_rows = len(df)
1525
+ cols = list(df.columns)
1526
+
1527
+ if n_rows == 0 or len(cols) < 2:
1528
+ return []
1529
+
1530
+ # For very large datasets, use more aggressive sampling
1531
+ if n_rows > 200000:
1532
+ sample_size = min(25000, max(10000, n_rows // 40))  # cap the sample so FD tests stay cheap on huge inputs
1533
+ df, was_sampled = sample_dataframe_intelligently(df, sample_size)
1534
+ n_rows = len(df)
1535
+ print(f" Aggressively sampled {n_rows} rows from original dataset for FD analysis")
1536
+
1537
+ fds = []
1538
+
1539
+ # Pre-compute cardinalities
1540
+ col_cardinalities = {col: df[col].nunique() for col in cols}
1541
+
1542
+ # Use similar but more aggressive filtering than ultra-optimized
1543
+ non_unique_cols = [col for col in cols if 1 < col_cardinalities[col] < n_rows * 0.9]
1544
+
1545
+ if not non_unique_cols:
1546
+ return fds
1547
+
1548
+ # Much more aggressive limits
1549
+ max_lhs_combinations = min(50, len(non_unique_cols))
1550
+ max_total_tests = min(300, len(non_unique_cols) * len(cols))
1551
+
1552
+ # Cache for group operations
1553
+ group_cache = {}
1554
+ tests_performed = 0
1555
+
1556
+ for size in range(1, min(max_lhs_size + 1, 3)): # Cap at size 2 for hyper mode
1557
+ if size > len(non_unique_cols) or tests_performed >= max_total_tests:
1558
+ break
1559
+
1560
+ # Be more selective about combinations
1561
+ if size == 1:
1562
+ lhs_candidates = [(col,) for col in non_unique_cols[:max_lhs_combinations]]
1563
+ else:
1564
+ # For multi-column, be very selective but still thorough
1565
+ all_combos = list(itertools.combinations(non_unique_cols[:15], size))
1566
+ lhs_candidates = sorted(all_combos,
1567
+ key=lambda x: sum(col_cardinalities[col] for col in x))[:30]
1568
+
1569
+ for lhs in lhs_candidates:
1570
+ if tests_performed >= max_total_tests:
1571
+ break
1572
+
1573
+ lhs_tuple = tuple(lhs)
1574
+
1575
+ try:
1576
+ if lhs_tuple not in group_cache:
1577
+ grouped = df.groupby(list(lhs), sort=False, dropna=False)
1578
+ group_sizes = grouped.size()
1579
+ group_cache[lhs_tuple] = (grouped, group_sizes)
1580
+ else:
1581
+ grouped, group_sizes = group_cache[lhs_tuple]
1582
+
1583
+ n_groups = len(group_sizes)
1584
+ if n_groups == n_rows or group_sizes.max() == 1:
1585
+ continue
1586
+
1587
+ # Test RHS candidates with some prioritization
1588
+ for rhs in cols:
1589
+ if rhs in lhs or tests_performed >= max_total_tests:
1590
+ continue
1591
+
1592
+ # Quick heuristic check
1593
+ if col_cardinalities[rhs] > n_groups * 1.2:
1594
+ continue
1595
+
1596
+ try:
1597
+ rhs_per_group = grouped[rhs].nunique()
1598
+ if (rhs_per_group <= 1).all():
1599
+ fds.append((lhs, rhs))
1600
+ tests_performed += 1
1601
+ except Exception:
1602
+ continue
1603
+
1604
+ except Exception:
1605
+ continue
1606
+
1607
+ return fds
1608
+
1609
+
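The `col_cardinalities[rhs] > n_groups * 1.2` shortcut above relies on a simple necessary condition: if lhs -> rhs holds, each of the n_groups LHS groups contributes at most one distinct RHS value, so nunique(rhs) can never exceed n_groups; the 1.2 slack presumably just tolerates sampling noise. A small sketch of that pruning check in isolation (toy data, nothing from this module assumed):

```python
import pandas as pd

df = pd.DataFrame({
    "store":   ["A", "A", "B", "B", "C", "C"],
    "region":  ["N", "N", "N", "N", "S", "S"],
    "receipt": [1, 2, 3, 4, 5, 6],
})

n_groups = df.groupby("store").ngroups      # 3 distinct LHS groups

# region has 2 distinct values <= 3 groups, so store -> region is worth a full test.
print(df["region"].nunique() <= n_groups)   # True

# receipt has 6 distinct values > 3 groups, so store -> receipt can be skipped outright.
print(df["receipt"].nunique() <= n_groups)  # False
```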
1610
+ def find_candidate_keys_hyper_optimized(df: pd.DataFrame, max_combination_size: int = 2):
1611
+ """
1612
+ Hyper-optimized candidate key discovery for very large datasets.
1613
+ """
1614
+ n_rows = len(df)
1615
+ cols = list(df.columns)
1616
+
1617
+ if n_rows == 0:
1618
+ return [], [], []
1619
+
1620
+ # Aggressive sampling for very large datasets
1621
+ if n_rows > 200000:
1622
+ sample_size = min(25000, max(5000, n_rows // 40))
1623
+ df, was_sampled = sample_dataframe_intelligently(df, sample_size)
1624
+ n_rows = len(df)
1625
+ print(f" Aggressively sampled {n_rows} rows from original dataset for key analysis")
1626
+
1627
+ all_keys = []
1628
+
1629
+ # Quick single-column check with early termination
1630
+ single_column_keys = []
1631
+ col_cardinalities = {}
1632
+
1633
+ for col in cols:
1634
+ cardinality = df[col].nunique()
1635
+ col_cardinalities[col] = cardinality
1636
+ if cardinality == n_rows:
1637
+ single_column_keys.append((col,))
1638
+ all_keys.append((col,))
1639
+
1640
+ # For very large datasets, if we have single-column keys, stop there
1641
+ if single_column_keys and n_rows > 100000:
1642
+ return all_keys, single_column_keys, []
1643
+
1644
+ # Very conservative limits for multi-column keys
1645
+ max_combination_size = min(max_combination_size, 2)
1646
+ max_combinations_to_test = min(50, math.comb(len(cols), 2))
1647
+
1648
+ # Only test most promising combinations
1649
+ for size in range(2, max_combination_size + 1):
1650
+ if size > len(cols):
1651
+ break
1652
+
1653
+ # Select only most promising combinations based on cardinality
1654
+ all_combinations = list(itertools.combinations(cols, size))
1655
+
1656
+ # Sort by likelihood of being keys (higher total cardinality first)
1657
+ promising_combinations = sorted(all_combinations,
1658
+ key=lambda x: sum(col_cardinalities.get(col, n_rows) for col in x), reverse=True)
1659
+
1660
+ # Test only top candidates
1661
+ combinations_to_test = promising_combinations[:max_combinations_to_test]
1662
+
1663
+ for combo in combinations_to_test:
1664
+ # Skip if contains single-column key
1665
+ if any((col,) in single_column_keys for col in combo):
1666
+ continue
1667
+
1668
+ # Quick heuristic: if sum of cardinalities is much less than n_rows, unlikely to be key
1669
+ total_card = sum(col_cardinalities.get(col, n_rows) for col in combo)
1670
+ if total_card < n_rows * 0.8:
1671
+ continue
1672
+
1673
+ try:
1674
+ unique_count = len(df[list(combo)].drop_duplicates())
1675
+ if unique_count == n_rows:
1676
+ all_keys.append(combo)
1677
+ except Exception:
1678
+ continue
1679
+
1680
+ # Early termination if we found enough keys
1681
+ if len(all_keys) > 5:
1682
+ break
1683
+
1684
+ # Classify keys
1685
+ candidate_keys = []
1686
+ superkeys = []
1687
+
1688
+ for key in all_keys:
1689
+ is_candidate = True
1690
+ for other_key in all_keys:
1691
+ if len(other_key) < len(key) and set(other_key).issubset(set(key)):
1692
+ is_candidate = False
1693
+ break
1694
+
1695
+ if is_candidate:
1696
+ candidate_keys.append(key)
1697
+ else:
1698
+ superkeys.append(key)
1699
+
1700
+ return all_keys, candidate_keys, superkeys
1701
+
1702
+
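The exact uniqueness test above is just drop_duplicates() over the projected columns. Note that the `total_card < n_rows * 0.8` prefilter is an aggressive speed heuristic: the tight necessary condition for a combination to be a key is that the *product* of its column cardinalities reaches n_rows, so the sum-based filter can occasionally skip a genuine key, which is the completeness trade-off the hyper mode accepts. A minimal sketch of the exact check (toy data only):

```python
import math
import pandas as pd

df = pd.DataFrame({
    "order_id": [1, 1, 2, 2, 3, 3],
    "line_no":  [1, 2, 1, 2, 1, 2],
    "sku":      ["a", "b", "a", "c", "b", "c"],
})
n_rows = len(df)
combo = ["order_id", "line_no"]

# Tight necessary condition: the product of cardinalities must reach n_rows.
print(math.prod(df[c].nunique() for c in combo) >= n_rows)   # True (3 * 2 = 6)

# Exact check: the combination is a key iff its projection contains no duplicate rows.
print(len(df[combo].drop_duplicates()) == n_rows)            # True
```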
1703
+ def profile_hyper_optimized(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
1704
+ """
1705
+ Hyper-optimized profile function for very large datasets (500k+ rows).
1706
+ Sacrifices some completeness for dramatic speed improvements.
1707
+ """
1708
+ start_time = time.time()
1709
+ n_rows = len(df)
1710
+ cols = list(df.columns)
1711
+
1712
+ print(f"Starting HYPER-OPTIMIZED analysis of {n_rows:,} rows × {len(cols)} columns...")
1713
+
1714
+ # Very aggressive parameter limits
1715
+ max_combination_size = min(max_combination_size, 2)
1716
+ max_lhs_size = min(max_lhs_size, 2)
1717
+ print(f" Hyper mode: limiting to max combination size {max_combination_size}")
1718
+
1719
+ # Discover functional dependencies
1720
+ fd_start = time.time()
1721
+ fds = find_functional_dependencies_hyper_optimized(df, max_lhs_size)
1722
+ fd_time = time.time() - fd_start
1723
+ print(f" FD discovery completed in {fd_time:.2f}s - found {len(fds)} dependencies")
1724
+
1725
+ fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]
1726
+
1727
+ # Discover keys
1728
+ key_start = time.time()
1729
+ all_keys, candidate_keys, superkeys = find_candidate_keys_hyper_optimized(df, max_combination_size)
1730
+ key_time = time.time() - key_start
1731
+ print(f" Key discovery completed in {key_time:.2f}s - found {len(candidate_keys)} candidate keys")
1732
+
1733
+ # Minimal result preparation
1734
+ results = []
1735
+ single_col_uniqueness = {col: df[col].nunique() for col in cols}
1736
+
1737
+ # Only process essential combinations
1738
+ max_combinations_total = min(100, len(cols) * 2)
1739
+ combinations_tested = 0
1740
+
1741
+ for size in range(1, max_combination_size + 1):
1742
+ for combo in itertools.combinations(cols, size):
1743
+ if combinations_tested >= max_combinations_total:
1744
+ break
1745
+
1746
+ if len(combo) == 1:
1747
+ unique_count = single_col_uniqueness[combo[0]]
1748
+ elif combo in all_keys:
1749
+ unique_count = n_rows
1750
+ else:
1751
+ # Estimate for larger combinations
1752
+ unique_count = min(n_rows, sum(single_col_uniqueness[col] for col in combo) // len(combo))
1753
+
1754
+ unique_ratio = unique_count / n_rows if n_rows > 0 else 0
1755
+ is_key = combo in all_keys
1756
+ is_candidate = combo in candidate_keys
1757
+ is_superkey = combo in superkeys
1758
+
1759
+ key_type = ""
1760
+ if is_candidate:
1761
+ key_type = "★ Candidate Key"
1762
+ elif is_superkey:
1763
+ key_type = "⊃ Superkey"
1764
+
1765
+ results.append((combo, unique_count, unique_ratio, is_key, key_type))
1766
+ combinations_tested += 1
1767
+
1768
+ # Quick sort
1769
+ results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
1770
+ key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
1771
+ for c, u, _, _, k in results]
1772
+
1773
+ # Simplified normalized tables
1774
+ normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)
1775
+
1776
+ total_time = time.time() - start_time
1777
+ print(f" HYPER-OPTIMIZED analysis completed in {total_time:.2f}s")
1778
+
1779
+ return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables
1780
+
1781
+
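A usage sketch for the hyper-optimized entry point, unpacking the same seven-element tuple returned above. The synthetic DataFrame is built inline rather than with `create_stress_test_data` (defined earlier in this module), and the printed results are indicative only, since the function samples internally:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 600_000
region_id = rng.integers(0, 40, n)
df = pd.DataFrame({
    "order_id":    np.arange(n),                                  # unique -> single-column candidate key
    "customer_id": rng.integers(0, 50_000, n),
    "region_id":   region_id,
    "region_name": "region_" + pd.Series(region_id).astype(str),  # region_id -> region_name holds by construction
    "amount":      rng.integers(1, 1_000, n),
})

fd_results, key_results, n_rows, cols, max_combo, max_lhs, norm_tables = profile_hyper_optimized(
    df, max_combination_size=2, max_lhs_size=2
)

print(fd_results[:5])   # expect pairs such as ('region_id', 'region_name')
print(key_results[0])   # 'order_id' should surface as a ★ Candidate Key at 100.00% uniqueness
```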
1782
+ def test_hyper_optimized_scenario():
1783
+ """
1784
+ Test the hyper-optimized version with extremely large datasets.
1785
+ """
1786
+ print("=== HYPER-OPTIMIZED SCENARIO TEST ===\n")
1787
+
1788
+ # Test different large dataset scenarios
1789
+ test_scenarios = [
1790
+ (500000, 'complex', "500K rows complex"),
1791
+ (1000000, 'complex', "1M rows complex"),
1792
+ (2000000, 'medium', "2M rows medium"),
1793
+ (5000000, 'simple', "5M rows simple")
1794
+ ]
1795
+
1796
+ results = []
1797
+
1798
+ for size, complexity, description in test_scenarios:
1799
+ print(f"\n{'='*60}")
1800
+ print(f"TESTING: {description}")
1801
+ print('='*60)
1802
+
1803
+ try:
1804
+ # Create test data
1805
+ df = create_stress_test_data(size, complexity=complexity)
1806
+ memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
1807
+
1808
+ print(f"Memory usage: {memory_mb:.1f} MB")
1809
+
1810
+ # Test hyper-optimized version
1811
+ start_time = time.time()
1812
+ fd_results, key_results, n_rows, cols, max_combo, max_lhs, norm_tables = profile_hyper_optimized(
1813
+ df, max_combination_size=3, max_lhs_size=2
1814
+ )
1815
+ analysis_time = time.time() - start_time
1816
+
1817
+ candidate_keys = [k for k in key_results if "Candidate Key" in k[3]]
1818
+ rows_per_second = n_rows / analysis_time
1819
+
1820
+ print(f"\n✅ SUCCESS!")
1821
+ print(f" • Analysis time: {analysis_time:.2f} seconds")
1822
+ print(f" • Processing rate: {rows_per_second:,.0f} rows/second")
1823
+ print(f" • Found {len(fd_results)} functional dependencies")
1824
+ print(f" • Found {len(candidate_keys)} candidate keys")
1825
+ print(f" • Memory efficiency: {memory_mb / analysis_time:.1f} MB/second")
1826
+
1827
+ # Performance assessment
1828
+ if analysis_time < 30:
1829
+ performance = "🔥 EXCELLENT"
1830
+ elif analysis_time < 60:
1831
+ performance = "✅ GOOD"
1832
+ elif analysis_time < 180:
1833
+ performance = "⚠️ ACCEPTABLE"
1834
+ else:
1835
+ performance = "❌ NEEDS WORK"
1836
+
1837
+ print(f" • Performance: {performance}")
1838
+
1839
+ results.append({
1840
+ 'size': size,
1841
+ 'complexity': complexity,
1842
+ 'memory_mb': memory_mb,
1843
+ 'time': analysis_time,
1844
+ 'rows_per_sec': rows_per_second,
1845
+ 'fds': len(fd_results),
1846
+ 'keys': len(candidate_keys),
1847
+ 'success': True,
1848
+ 'performance': performance
1849
+ })
1850
+
1851
+ except Exception as e:
1852
+ print(f"❌ FAILED: {str(e)}")
1853
+ results.append({
1854
+ 'size': size,
1855
+ 'complexity': complexity,
1856
+ 'memory_mb': 0,
1857
+ 'time': float('inf'),
1858
+ 'rows_per_sec': 0,
1859
+ 'fds': 0,
1860
+ 'keys': 0,
1861
+ 'success': False,
1862
+ 'performance': "❌ FAILED"
1863
+ })
1864
+
1865
+ # Summary
1866
+ print(f"\n{'='*80}")
1867
+ print("HYPER-OPTIMIZED PERFORMANCE SUMMARY")
1868
+ print('='*80)
1869
+ print(f"{'Dataset':<20} {'Memory':<10} {'Time':<10} {'Rate':<12} {'FDs':<5} {'Keys':<5} {'Performance'}")
1870
+ print("-" * 80)
1871
+
1872
+ for result in results:
1873
+ dataset = f"{result['size']:,} {result['complexity']}"
1874
+ memory = f"{result['memory_mb']:.1f}MB"
1875
+ time_str = f"{result['time']:.1f}s" if result['time'] != float('inf') else "FAIL"
1876
+ rate = f"{result['rows_per_sec']:,.0f}/s" if result['success'] else "N/A"
1877
+ fds = str(result['fds'])
1878
+ keys = str(result['keys'])
1879
+ performance = result['performance']
1880
+
1881
+ print(f"{dataset:<20} {memory:<10} {time_str:<10} {rate:<12} {fds:<5} {keys:<5} {performance}")
1882
+
1883
+ # Analysis
1884
+ successful = [r for r in results if r['success']]
1885
+ if successful:
1886
+ max_size = max(r['size'] for r in successful)
1887
+ avg_rate = np.mean([r['rows_per_sec'] for r in successful])
1888
+ print(f"\n🎯 ANALYSIS:")
1889
+ print(f" • Successfully processed datasets up to {max_size:,} rows")
1890
+ print(f" • Average processing rate: {avg_rate:,.0f} rows/second")
1891
+ print(f"   • Hyper-optimization enables analysis of datasets that would otherwise be impractical to profile")
1892
+
1893
+ return results
1894
+
1895
+
1896
+ def test_small_data_optimizations():
1897
+ """
1898
+ Test optimizations specifically for small datasets to ensure no performance regression.
1899
+ """
1900
+ print("=== SMALL DATA OPTIMIZATION TEST ===\n")
1901
+
1902
+ # Test different small dataset scenarios
1903
+ small_test_configs = [
1904
+ (10, 3, 'tiny'),
1905
+ (50, 4, 'small'),
1906
+ (100, 5, 'small'),
1907
+ (500, 6, 'medium'),
1908
+ (1000, 8, 'medium'),
1909
+ (5000, 10, 'medium'),
1910
+ ]
1911
+
1912
+ results = []
1913
+
1914
+ for size, n_cols, complexity in small_test_configs:
1915
+ print(f"\n{'='*50}")
1916
+ print(f"TESTING: {size:,} rows × {n_cols} columns ({complexity})")
1917
+ print('='*50)
1918
+
1919
+ try:
1920
+ # Create test data
1921
+ df = create_stress_test_data(size, n_cols=n_cols, complexity=complexity)
1922
+ memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
1923
+
1924
+ print(f"Memory usage: {memory_mb:.3f} MB")
1925
+
1926
+ # Test all three optimization levels
1927
+ optimization_results = {}
1928
+
1929
+ # 1. Test original version (for very small datasets only)
1930
+ if size <= 1000:
1931
+ print("\n🐌 Testing ORIGINAL version...")
1932
+ start_time = time.time()
1933
+ orig_results = profile_original(df, max_combination_size=3, max_lhs_size=2)
1934
+ orig_time = time.time() - start_time
1935
+ optimization_results['original'] = {
1936
+ 'time': orig_time,
1937
+ 'fds': len(orig_results[0]),
1938
+ 'keys': len([k for k in orig_results[1] if "Candidate Key" in k[3]])
1939
+ }
1940
+ print(f" Original: {orig_time:.4f}s - {optimization_results['original']['fds']} FDs, {optimization_results['original']['keys']} keys")
1941
+
1942
+ # 2. Test standard optimized version
1943
+ print("\n🔍 Testing STANDARD-OPTIMIZED version...")
1944
+ start_time = time.time()
1945
+ std_results = profile_optimized(df, max_combination_size=3, max_lhs_size=2)
1946
+ std_time = time.time() - start_time
1947
+ optimization_results['standard'] = {
1948
+ 'time': std_time,
1949
+ 'fds': len(std_results[0]),
1950
+ 'keys': len([k for k in std_results[1] if "Candidate Key" in k[3]])
1951
+ }
1952
+ print(f" Standard: {std_time:.4f}s - {optimization_results['standard']['fds']} FDs, {optimization_results['standard']['keys']} keys")
1953
+
1954
+ # 3. Test ultra optimized version
1955
+ print("\n⚡ Testing ULTRA-OPTIMIZED version...")
1956
+ start_time = time.time()
1957
+ ultra_results = profile_ultra_optimized(df, max_combination_size=3, max_lhs_size=2)
1958
+ ultra_time = time.time() - start_time
1959
+ optimization_results['ultra'] = {
1960
+ 'time': ultra_time,
1961
+ 'fds': len(ultra_results[0]),
1962
+ 'keys': len([k for k in ultra_results[1] if "Candidate Key" in k[3]])
1963
+ }
1964
+ print(f" Ultra: {ultra_time:.4f}s - {optimization_results['ultra']['fds']} FDs, {optimization_results['ultra']['keys']} keys")
1965
+
1966
+ # 4. Test automatic selection (should pick standard for small data)
1967
+ print("\n🎯 Testing AUTOMATIC selection...")
1968
+ start_time = time.time()
1969
+ auto_results = profile(df, max_combination_size=3, max_lhs_size=2)
1970
+ auto_time = time.time() - start_time
1971
+ optimization_results['auto'] = {
1972
+ 'time': auto_time,
1973
+ 'fds': len(auto_results[0]),
1974
+ 'keys': len([k for k in auto_results[1] if "Candidate Key" in k[3]])
1975
+ }
1976
+ print(f" Auto: {auto_time:.4f}s - {optimization_results['auto']['fds']} FDs, {optimization_results['auto']['keys']} keys")
1977
+
1978
+ # Analyze results
1979
+ print(f"\n📊 ANALYSIS:")
1980
+
1981
+ # Check consistency
1982
+ fd_counts = [opt['fds'] for opt in optimization_results.values()]
1983
+ key_counts = [opt['keys'] for opt in optimization_results.values()]
1984
+
1985
+ consistent_fds = len(set(fd_counts)) <= 1
1986
+ consistent_keys = len(set(key_counts)) <= 1
1987
+
1988
+ print(f" • FD consistency: {'✅' if consistent_fds else '❌'} ({fd_counts})")
1989
+ print(f" • Key consistency: {'✅' if consistent_keys else '❌'} ({key_counts})")
1990
+
1991
+ # Compare performance
1992
+ if 'original' in optimization_results:
1993
+ std_speedup = optimization_results['original']['time'] / optimization_results['standard']['time']
1994
+ ultra_speedup = optimization_results['original']['time'] / optimization_results['ultra']['time']
1995
+ print(f" • Standard speedup vs original: {std_speedup:.2f}x")
1996
+ print(f" • Ultra speedup vs original: {ultra_speedup:.2f}x")
1997
+
1998
+ # Check if auto selection made good choice
1999
+ fastest_time = min(opt['time'] for opt in optimization_results.values())
2000
+ auto_efficiency = fastest_time / optimization_results['auto']['time']
2001
+ print(f" • Auto selection efficiency: {auto_efficiency:.2f} (1.0 = optimal)")
2002
+
2003
+ # Overall assessment
2004
+ if consistent_fds and consistent_keys and auto_efficiency > 0.8:
2005
+ assessment = "✅ EXCELLENT"
2006
+ elif consistent_fds and consistent_keys:
2007
+ assessment = "✅ GOOD"
2008
+ elif auto_efficiency > 0.8:
2009
+ assessment = "⚠️ INCONSISTENT RESULTS"
2010
+ else:
2011
+ assessment = "❌ POOR PERFORMANCE"
2012
+
2013
+ print(f" • Overall: {assessment}")
2014
+
2015
+ results.append({
2016
+ 'size': size,
2017
+ 'cols': n_cols,
2018
+ 'complexity': complexity,
2019
+ 'memory_mb': memory_mb,
2020
+ 'optimization_results': optimization_results,
2021
+ 'consistent_fds': consistent_fds,
2022
+ 'consistent_keys': consistent_keys,
2023
+ 'auto_efficiency': auto_efficiency,
2024
+ 'assessment': assessment,
2025
+ 'success': True
2026
+ })
2027
+
2028
+ except Exception as e:
2029
+ print(f"❌ FAILED: {str(e)}")
2030
+ results.append({
2031
+ 'size': size,
2032
+ 'cols': n_cols,
2033
+ 'complexity': complexity,
2034
+ 'memory_mb': 0,
2035
+ 'optimization_results': {},
2036
+ 'consistent_fds': False,
2037
+ 'consistent_keys': False,
2038
+ 'auto_efficiency': 0,
2039
+ 'assessment': "❌ FAILED",
2040
+ 'success': False
2041
+ })
2042
+
2043
+ # Comprehensive summary
2044
+ print(f"\n{'='*80}")
2045
+ print("SMALL DATA OPTIMIZATION SUMMARY")
2046
+ print('='*80)
2047
+ print(f"{'Dataset':<15} {'Memory':<8} {'Original':<10} {'Standard':<10} {'Ultra':<10} {'Auto':<10} {'Consistent':<10} {'Assessment'}")
2048
+ print("-" * 80)
2049
+
2050
+ for result in results:
2051
+ if not result['success']:
2052
+ continue
2053
+
2054
+ dataset = f"{result['size']}×{result['cols']}"
2055
+ memory = f"{result['memory_mb']:.2f}MB"
2056
+
2057
+ opt_res = result['optimization_results']
2058
+ orig_time = f"{opt_res.get('original', {}).get('time', 0):.3f}s" if 'original' in opt_res else "N/A"
2059
+ std_time = f"{opt_res['standard']['time']:.3f}s"
2060
+ ultra_time = f"{opt_res['ultra']['time']:.3f}s"
2061
+ auto_time = f"{opt_res['auto']['time']:.3f}s"
2062
+
2063
+ consistent = "✅" if result['consistent_fds'] and result['consistent_keys'] else "❌"
2064
+ assessment = result['assessment'].split()[0] # Just the emoji/symbol
2065
+
2066
+ print(f"{dataset:<15} {memory:<8} {orig_time:<10} {std_time:<10} {ultra_time:<10} {auto_time:<10} {consistent:<10} {assessment}")
2067
+
2068
+ # Performance analysis
2069
+ successful = [r for r in results if r['success']]
2070
+ if successful:
2071
+ print(f"\n🎯 PERFORMANCE ANALYSIS:")
2072
+
2073
+ # Consistency check
2074
+ all_consistent = all(r['consistent_fds'] and r['consistent_keys'] for r in successful)
2075
+ print(f" • Result consistency across optimizations: {'✅' if all_consistent else '❌'}")
2076
+
2077
+ # Auto selection efficiency
2078
+ avg_auto_efficiency = np.mean([r['auto_efficiency'] for r in successful])
2079
+ print(f" • Average auto-selection efficiency: {avg_auto_efficiency:.3f}")
2080
+
2081
+ # Speed comparison for overlapping tests
2082
+ overlap_tests = [r for r in successful if 'original' in r['optimization_results']]
2083
+ if overlap_tests:
2084
+ avg_std_speedup = np.mean([
2085
+ r['optimization_results']['original']['time'] / r['optimization_results']['standard']['time']
2086
+ for r in overlap_tests
2087
+ ])
2088
+ avg_ultra_speedup = np.mean([
2089
+ r['optimization_results']['original']['time'] / r['optimization_results']['ultra']['time']
2090
+ for r in overlap_tests
2091
+ ])
2092
+ print(f" • Average standard optimization speedup: {avg_std_speedup:.2f}x")
2093
+ print(f" • Average ultra optimization speedup: {avg_ultra_speedup:.2f}x")
2094
+
2095
+ # Recommendations
2096
+ print(f"\n💡 RECOMMENDATIONS:")
2097
+ if all_consistent and avg_auto_efficiency > 0.9:
2098
+ print(" ✅ Optimizations are working excellently for small data")
2099
+ elif all_consistent:
2100
+ print(" ✅ Results are consistent, but auto-selection could be improved")
2101
+ else:
2102
+ print(" ⚠️ Some optimization levels produce inconsistent results")
2103
+
2104
+ # Check if any optimization is consistently best for small data
2105
+ fastest_counts = {}
2106
+ for result in successful:
2107
+ if result['optimization_results']:
2108
+ fastest = min(result['optimization_results'].items(), key=lambda x: x[1]['time'])[0]
2109
+ fastest_counts[fastest] = fastest_counts.get(fastest, 0) + 1
2110
+
2111
+ if fastest_counts:
2112
+ best_optimization = max(fastest_counts.items(), key=lambda x: x[1])
2113
+ print(f" 🏆 Most often fastest: {best_optimization[0]} ({best_optimization[1]}/{len(successful)} times)")
2114
+
2115
+ return results
2116
+
2117
+
2118
+ def find_functional_dependencies_high_column_optimized(df: pd.DataFrame, max_lhs_size: int = 2):
2119
+ """
2120
+ Specialized functional dependency discovery for high-column datasets (>50 columns).
2121
+ Uses intelligent column selection and aggressive limits.
2122
+ """
2123
+ try:
2124
+ n_rows = len(df)
2125
+ cols = list(df.columns)
2126
+ n_cols = len(cols)
2127
+
2128
+ if n_rows == 0 or n_cols < 2:
2129
+ return []
2130
+
2131
+ print(f" High-column FD analysis: {n_rows} rows × {n_cols} columns")
2132
+
2133
+ # Always sample for high-column datasets to keep it manageable
2134
+ if n_rows > 2000:
2135
+ sample_size = min(2000, max(500, n_rows // 50))
2136
+ df, was_sampled = sample_dataframe_intelligently(df, sample_size)
2137
+ n_rows = len(df)
2138
+ print(f" Sampled to {n_rows} rows for high-column analysis")
2139
+
2140
+ # Pre-compute column characteristics for intelligent selection
2141
+ col_info = {}
2142
+ for col in cols:
2143
+ try:
2144
+ unique_count = df[col].nunique()
2145
+ col_info[col] = {
2146
+ 'cardinality': unique_count,
2147
+ 'uniqueness_ratio': unique_count / n_rows,
2148
+ 'is_potential_key': unique_count == n_rows,
2149
+ 'is_low_cardinality': unique_count < n_rows * 0.1,
2150
+ 'is_boolean_like': unique_count <= 2
2151
+ }
2152
+ except Exception:
2153
+ # Skip problematic columns
2154
+ col_info[col] = {
2155
+ 'cardinality': 0,
2156
+ 'uniqueness_ratio': 0,
2157
+ 'is_potential_key': False,
2158
+ 'is_low_cardinality': False,
2159
+ 'is_boolean_like': False
2160
+ }
2161
+
2162
+ # Select most promising columns for LHS (determinants)
2163
+ # Focus on columns that are likely to be good determinants
2164
+ lhs_candidates = []
2165
+
2166
+ # Add potential keys first (high cardinality)
2167
+ potential_keys = [col for col, info in col_info.items() if info['uniqueness_ratio'] > 0.8]
2168
+ lhs_candidates.extend(potential_keys[:10]) # Top 10 potential keys
2169
+
2170
+ # Add low-cardinality columns (good for grouping)
2171
+ low_card_cols = sorted([col for col, info in col_info.items() if info['is_low_cardinality']],
2172
+ key=lambda x: col_info[x]['cardinality'])
2173
+ lhs_candidates.extend(low_card_cols[:15]) # Top 15 low-cardinality
2174
+
2175
+ # Add some medium-cardinality columns
2176
+ medium_card_cols = [col for col, info in col_info.items()
2177
+ if 0.1 <= info['uniqueness_ratio'] <= 0.8]
2178
+ medium_card_cols = sorted(medium_card_cols, key=lambda x: col_info[x]['cardinality'])
2179
+ lhs_candidates.extend(medium_card_cols[:10]) # Top 10 medium-cardinality
2180
+
2181
+ # Remove duplicates while preserving order and ensure they exist in dataframe
2182
+ seen = set()
2183
+ lhs_candidates = [col for col in lhs_candidates
2184
+ if col in df.columns and not (col in seen or seen.add(col))]
2185
+
2186
+ # Limit to top 30 LHS candidates to keep it manageable
2187
+ lhs_candidates = lhs_candidates[:30]
2188
+
2189
+ print(f" Selected {len(lhs_candidates)} promising LHS candidates from {n_cols} columns")
2190
+
2191
+ fds = []
2192
+ group_cache = {}
2193
+
2194
+ # Very aggressive limits for high-column datasets
2195
+ max_tests = 200 # Maximum total FD tests
2196
+ tests_performed = 0
2197
+
2198
+ for size in range(1, min(max_lhs_size + 1, 3)): # Cap at size 2 for high-column
2199
+ if tests_performed >= max_tests or not lhs_candidates:
2200
+ break
2201
+
2202
+ if size == 1:
2203
+ # Single column determinants
2204
+ candidates = lhs_candidates[:20] # Top 20 for single-column
2205
+ else:
2206
+ # Multi-column determinants - be very selective
2207
+ try:
2208
+ candidates = list(itertools.combinations(lhs_candidates[:15], size))[:30]
2209
+ except Exception:
2210
+ candidates = []
2211
+
2212
+ for lhs in candidates:
2213
+ if tests_performed >= max_tests:
2214
+ break
2215
+
2216
+ lhs_tuple = tuple(lhs) if isinstance(lhs, (list, tuple)) else (lhs,)
2217
+
2218
+ try:
2219
+ # Ensure all columns in lhs_tuple exist in dataframe
2220
+ if not all(col in df.columns for col in lhs_tuple):
2221
+ continue
2222
+
2223
+ if lhs_tuple not in group_cache:
2224
+ grouped = df.groupby(list(lhs_tuple), sort=False, dropna=False)
2225
+ group_sizes = grouped.size()
2226
+ group_cache[lhs_tuple] = (grouped, group_sizes)
2227
+ else:
2228
+ grouped, group_sizes = group_cache[lhs_tuple]
2229
+
2230
+ n_groups = len(group_sizes)
2231
+ if n_groups == n_rows or group_sizes.max() == 1:
2232
+ continue
2233
+
2234
+ # Test only most promising RHS candidates
2235
+ rhs_candidates = []
2236
+
2237
+ # Add high-cardinality columns as RHS candidates
2238
+ high_card_rhs = [col for col, info in col_info.items()
2239
+ if info['uniqueness_ratio'] > 0.5 and col not in lhs_tuple and col in df.columns]
2240
+ rhs_candidates.extend(high_card_rhs[:10])
2241
+
2242
+ # Add some other columns
2243
+ other_rhs = [col for col in cols if col not in lhs_tuple and col not in rhs_candidates and col in df.columns]
2244
+ rhs_candidates.extend(other_rhs[:10])
2245
+
2246
+ for rhs in rhs_candidates:
2247
+ if tests_performed >= max_tests:
2248
+ break
2249
+
2250
+ # Quick heuristic check
2251
+ if col_info.get(rhs, {}).get('cardinality', 0) > n_groups * 1.5:
2252
+ continue
2253
+
2254
+ try:
2255
+ rhs_per_group = grouped[rhs].nunique()
2256
+ if (rhs_per_group <= 1).all():
2257
+ fds.append((lhs_tuple, rhs))
2258
+ tests_performed += 1
2259
+ except Exception:
2260
+ continue
2261
+
2262
+ except Exception:
2263
+ continue
2264
+
2265
+ print(f" Performed {tests_performed} FD tests (limit: {max_tests})")
2266
+ return fds
2267
+
2268
+ except Exception as e:
2269
+ print(f" Error in high-column FD analysis: {e}")
2270
+ return [] # Return empty list on error
2271
+
2272
+
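The LHS selection above amounts to bucketing columns by uniqueness ratio and keeping the most promising members of each bucket. A compact sketch of that bucketing on a toy frame, with the thresholds copied from the function (0.8 for potential keys, 0.1 for low cardinality):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
n = 1_000
df = pd.DataFrame({
    "id":      np.arange(n),                        # uniqueness ratio 1.0  -> potential key
    "country": rng.choice(["DE", "FR", "US"], n),   # ratio ~0.003          -> low cardinality
    "city":    rng.integers(0, 300, n),             # ratio ~0.3            -> medium cardinality
})

ratios = {c: df[c].nunique() / n for c in df.columns}
potential_keys = [c for c, r in ratios.items() if r > 0.8]
low_card       = [c for c, r in ratios.items() if r < 0.1]
medium_card    = [c for c, r in ratios.items() if 0.1 <= r <= 0.8]

print(potential_keys, low_card, medium_card)        # ['id'] ['country'] ['city']
```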
2273
+ def find_candidate_keys_high_column_optimized(df: pd.DataFrame, max_combination_size: int = 2):
2274
+ """
2275
+ Specialized candidate key discovery for high-column datasets (>50 columns).
2276
+ Uses intelligent column selection and aggressive limits.
2277
+ """
2278
+ try:
2279
+ n_rows = len(df)
2280
+ cols = list(df.columns)
2281
+ n_cols = len(cols)
2282
+
2283
+ if n_rows == 0:
2284
+ return [], [], []
2285
+
2286
+ print(f" High-column key analysis: {n_rows} rows × {n_cols} columns")
2287
+
2288
+ # Always sample for high-column datasets
2289
+ if n_rows > 2000:
2290
+ sample_size = min(2000, max(500, n_rows // 50))
2291
+ df, was_sampled = sample_dataframe_intelligently(df, sample_size)
2292
+ n_rows = len(df)
2293
+ print(f" Sampled to {n_rows} rows for high-column key analysis")
2294
+
2295
+ all_keys = []
2296
+
2297
+ # Quick single-column check with cardinality-based prioritization
2298
+ col_cardinalities = {}
2299
+ potential_single_keys = []
2300
+
2301
+ # Sort columns by cardinality (descending) to check most promising first
2302
+ for col in cols:
2303
+ try:
2304
+ if col in df.columns:
2305
+ cardinality = df[col].nunique()
2306
+ col_cardinalities[col] = cardinality
2307
+ if cardinality == n_rows:
2308
+ potential_single_keys.append((col,))
2309
+ all_keys.append((col,))
2310
+ else:
2311
+ col_cardinalities[col] = 0
2312
+ except Exception:
2313
+ col_cardinalities[col] = 0
2314
+
2315
+ print(f" Found {len(potential_single_keys)} single-column keys")
2316
+
2317
+ # For high-column datasets, if we have single-column keys, be very conservative about multi-column
2318
+ if potential_single_keys and n_cols > 80:
2319
+ print(f" Stopping early due to high column count ({n_cols}) and existing single-column keys")
2320
+ return all_keys, potential_single_keys, []
2321
+
2322
+ # Select most promising columns for multi-column key testing
2323
+ # Sort by cardinality (highest first) and take top candidates
2324
+ try:
2325
+ sorted_cols = sorted([col for col in cols if col in df.columns],
2326
+ key=lambda x: col_cardinalities.get(x, 0), reverse=True)
2327
+ except Exception:
2328
+ sorted_cols = [col for col in cols if col in df.columns]
2329
+
2330
+ # Take top candidates based on cardinality
2331
+ if n_cols > 80:
2332
+ promising_cols = sorted_cols[:15] # Very selective for >80 columns
2333
+ elif n_cols > 60:
2334
+ promising_cols = sorted_cols[:20] # Selective for >60 columns
2335
+ else:
2336
+ promising_cols = sorted_cols[:25] # Less selective for 50-60 columns
2337
+
2338
+ print(f" Selected {len(promising_cols)} promising columns for multi-column key testing")
2339
+
2340
+ # Very conservative multi-column key testing
2341
+ max_combination_size = min(max_combination_size, 2) # Cap at 2 for high-column
2342
+ max_combinations_to_test = 50 # Hard limit
2343
+
2344
+ for size in range(2, max_combination_size + 1):
2345
+ if size > len(promising_cols):
2346
+ break
2347
+
2348
+ # Generate combinations from promising columns only
2349
+ try:
2350
+ combinations = list(itertools.combinations(promising_cols, size))
2351
+ except Exception:
2352
+ combinations = []
2353
+
2354
+ # Sort by total cardinality (higher is more likely to be a key)
2355
+ try:
2356
+ combinations = sorted(combinations,
2357
+ key=lambda x: sum(col_cardinalities.get(col, 0) for col in x),
2358
+ reverse=True)
2359
+ except Exception:
2360
+ pass # Keep original order if sorting fails
2361
+
2362
+ # Test only top combinations
2363
+ combinations_to_test = combinations[:max_combinations_to_test]
2364
+
2365
+ tested_count = 0
2366
+ for combo in combinations_to_test:
2367
+ try:
2368
+ # Skip if contains single-column key
2369
+ if any((col,) in potential_single_keys for col in combo):
2370
+ continue
2371
+
2372
+ # Ensure all columns in combo exist in dataframe
2373
+ if not all(col in df.columns for col in combo):
2374
+ continue
2375
+
2376
+ # Quick heuristic: if sum of cardinalities is much less than n_rows, skip
2377
+ total_card = sum(col_cardinalities.get(col, 0) for col in combo)
2378
+ if total_card < n_rows * 0.7:
2379
+ continue
2380
+
2381
+ try:
2382
+ unique_count = len(df[list(combo)].drop_duplicates())
2383
+ if unique_count == n_rows:
2384
+ all_keys.append(combo)
2385
+ except Exception:
2386
+ continue # Skip problematic combinations
2387
+
2388
+ tested_count += 1
2389
+
2390
+ # Early termination for high-column datasets
2391
+ if tested_count >= 20: # Test at most 20 combinations per size
2392
+ break
2393
+
2394
+ except Exception:
2395
+ continue
2396
+
2397
+ print(f" Tested {tested_count} combinations of size {size}")
2398
+
2399
+ # Early termination if we found keys and this is a very high-column dataset
2400
+ if all_keys and n_cols > 80:
2401
+ break
2402
+
2403
+ # Classify keys
2404
+ candidate_keys = []
2405
+ superkeys = []
2406
+
2407
+ for key in all_keys:
2408
+ try:
2409
+ is_candidate = True
2410
+ for other_key in all_keys:
2411
+ if len(other_key) < len(key) and set(other_key).issubset(set(key)):
2412
+ is_candidate = False
2413
+ break
2414
+
2415
+ if is_candidate:
2416
+ candidate_keys.append(key)
2417
+ else:
2418
+ superkeys.append(key)
2419
+ except Exception:
2420
+ # If classification fails, treat as candidate key
2421
+ candidate_keys.append(key)
2422
+
2423
+ return all_keys, candidate_keys, superkeys
2424
+
2425
+ except Exception as e:
2426
+ print(f" Error in high-column key analysis: {e}")
2427
+ return [], [], [] # Return empty lists on error
2428
+
2429
+
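The candidate-key versus superkey split above is a plain minimality check: a discovered key stays a candidate key only if no strictly smaller discovered key is a subset of it. A tiny sketch of that classification in isolation:

```python
# Keys discovered for a hypothetical table; ('order_id',) makes the wider key redundant.
all_keys = [("order_id",), ("order_id", "line_no")]

candidate_keys, superkeys = [], []
for key in all_keys:
    minimal = not any(
        len(other) < len(key) and set(other).issubset(key)
        for other in all_keys
    )
    (candidate_keys if minimal else superkeys).append(key)

print(candidate_keys)   # [('order_id',)]
print(superkeys)        # [('order_id', 'line_no')]
```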
2430
+ def profile_high_column_optimized(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
2431
+ """
2432
+ Specialized profile function for high-column datasets (>50 columns).
2433
+ Uses aggressive optimization and intelligent column selection.
2434
+ """
2435
+ try:
2436
+ start_time = time.time()
2437
+ n_rows = len(df)
2438
+ cols = list(df.columns)
2439
+ n_cols = len(cols)
2440
+
2441
+ print(f"Starting HIGH-COLUMN analysis of {n_rows:,} rows × {n_cols} columns...")
2442
+
2443
+ # Very aggressive parameter limits for high-column datasets
2444
+ max_combination_size = min(max_combination_size, 2)
2445
+ max_lhs_size = min(max_lhs_size, 2)
2446
+ print(f" High-column mode: limiting to max combination size {max_combination_size}")
2447
+
2448
+ # Discover functional dependencies
2449
+ fd_start = time.time()
2450
+ try:
2451
+ fds = find_functional_dependencies_high_column_optimized(df, max_lhs_size)
2452
+ except Exception as e:
2453
+ print(f" Error in FD discovery: {e}")
2454
+ fds = []
2455
+ fd_time = time.time() - fd_start
2456
+ print(f" FD discovery completed in {fd_time:.2f}s - found {len(fds)} dependencies")
2457
+
2458
+ fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]
2459
+
2460
+ # Discover keys
2461
+ key_start = time.time()
2462
+ try:
2463
+ all_keys, candidate_keys, superkeys = find_candidate_keys_high_column_optimized(df, max_combination_size)
2464
+ except Exception as e:
2465
+ print(f" Error in key discovery: {e}")
2466
+ all_keys, candidate_keys, superkeys = [], [], []
2467
+ key_time = time.time() - key_start
2468
+ print(f" Key discovery completed in {key_time:.2f}s - found {len(candidate_keys)} candidate keys")
2469
+
2470
+ # Minimal result preparation for high-column datasets
2471
+ results = []
2472
+
2473
+ # Pre-compute single column uniqueness for efficiency
2474
+ single_col_uniqueness = {}
2475
+ print(" Computing column uniqueness...")
2476
+ try:
2477
+ for col in cols:
2478
+ if col in df.columns:
2479
+ try:
2480
+ single_col_uniqueness[col] = df[col].nunique()
2481
+ except Exception:
2482
+ single_col_uniqueness[col] = 0
2483
+ else:
2484
+ single_col_uniqueness[col] = 0
2485
+ except Exception as e:
2486
+ print(f" Error computing column uniqueness: {e}")
2487
+ # Set default values
2488
+ single_col_uniqueness = {col: 0 for col in cols}
2489
+
2490
+ # Only process essential combinations for high-column datasets
2491
+ max_combinations_total = min(100, n_cols * 2) # Very conservative
2492
+ combinations_tested = 0
2493
+
2494
+ print(f" Preparing results (testing max {max_combinations_total} combinations)...")
2495
+
2496
+ # Process single columns first (most important)
2497
+ try:
2498
+ for col in cols:
2499
+ if combinations_tested >= max_combinations_total:
2500
+ break
2501
+
2502
+ if col not in df.columns:
2503
+ continue
2504
+
2505
+ combo = (col,)
2506
+ unique_count = single_col_uniqueness.get(col, 0)
2507
+ unique_ratio = unique_count / n_rows if n_rows > 0 else 0
2508
+ is_key = combo in all_keys
2509
+ is_candidate = combo in candidate_keys
2510
+ is_superkey = combo in superkeys
2511
+
2512
+ key_type = ""
2513
+ if is_candidate:
2514
+ key_type = "★ Candidate Key"
2515
+ elif is_superkey:
2516
+ key_type = "⊃ Superkey"
2517
+
2518
+ results.append((combo, unique_count, unique_ratio, is_key, key_type))
2519
+ combinations_tested += 1
2520
+ except Exception as e:
2521
+ print(f" Error processing single columns: {e}")
2522
+
2523
+ # Process only the most promising multi-column combinations
2524
+ try:
2525
+ if combinations_tested < max_combinations_total and max_combination_size > 1:
2526
+ # Sort columns by uniqueness (highest first) for better multi-column candidates
2527
+ try:
2528
+ sorted_cols = sorted([col for col in cols if col in df.columns],
2529
+ key=lambda x: single_col_uniqueness.get(x, 0), reverse=True)
2530
+ top_cols = sorted_cols[:min(20, len(sorted_cols))] # Top 20 most unique columns
2531
+ except Exception:
2532
+ top_cols = [col for col in cols if col in df.columns][:20]
2533
+
2534
+ for size in range(2, min(max_combination_size + 1, 3)):
2535
+ if combinations_tested >= max_combinations_total:
2536
+ break
2537
+
2538
+ try:
2539
+ for combo in itertools.combinations(top_cols, size):
2540
+ if combinations_tested >= max_combinations_total:
2541
+ break
2542
+
2543
+ # Ensure all columns exist
2544
+ if not all(col in df.columns for col in combo):
2545
+ continue
2546
+
2547
+ if combo in all_keys:
2548
+ unique_count = n_rows
2549
+ else:
2550
+ # For non-keys, estimate uniqueness
2551
+ unique_count = min(n_rows, sum(single_col_uniqueness.get(col, 0) for col in combo) // len(combo))
2552
+
2553
+ unique_ratio = unique_count / n_rows if n_rows > 0 else 0
2554
+ is_key = combo in all_keys
2555
+ is_candidate = combo in candidate_keys
2556
+ is_superkey = combo in superkeys
2557
+
2558
+ key_type = ""
2559
+ if is_candidate:
2560
+ key_type = "★ Candidate Key"
2561
+ elif is_superkey:
2562
+ key_type = "⊃ Superkey"
2563
+
2564
+ results.append((combo, unique_count, unique_ratio, is_key, key_type))
2565
+ combinations_tested += 1
2566
+ except Exception as e:
2567
+ print(f" Error processing size {size} combinations: {e}")
2568
+ continue
2569
+ except Exception as e:
2570
+ print(f" Error processing multi-column combinations: {e}")
2571
+
2572
+ # Quick sort
2573
+ try:
2574
+ results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
2575
+ key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
2576
+ for c, u, _, _, k in results]
2577
+ except Exception as e:
2578
+ print(f" Error sorting results: {e}")
2579
+ key_results = []
2580
+
2581
+ # Simplified normalized tables
2582
+ try:
2583
+ normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)
2584
+ except Exception as e:
2585
+ print(f" Error creating normalized tables: {e}")
2586
+ normalized_tables = []
2587
+
2588
+ total_time = time.time() - start_time
2589
+ print(f" HIGH-COLUMN analysis completed in {total_time:.2f}s")
2590
+
2591
+ return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables
2592
+
2593
+ except Exception as e:
2594
+ print(f" Critical error in HIGH-COLUMN analysis: {e}")
2595
+ import traceback
2596
+ traceback.print_exc()
2597
+ # Return safe defaults
2598
+ return [], [], len(df), list(df.columns), max_combination_size, max_lhs_size, []
2599
+
2600
+
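A usage sketch for the high-column entry point on a wide frame of the kind the next test generates; the function is assumed to be in scope (for example after importing this module), and the exact output depends on the internal sampling:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
n = 16_000
data = {"id": np.arange(n)}                 # unique identifier column
for i in range(99):                         # 99 filler columns -> 100 columns in total
    data[f"col_{i}"] = rng.integers(0, 50, n)
df = pd.DataFrame(data)

fd_results, key_results, n_rows, cols, max_combo, max_lhs, norm_tables = profile_high_column_optimized(
    df, max_combination_size=2, max_lhs_size=2
)

print(len(cols), n_rows)   # 100 16000
print(key_results[0])      # 'id' should surface as a ★ Candidate Key
```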
2601
+ def test_high_column_scenario():
2602
+ """
2603
+ Test the high-column optimization with scenarios similar to the user-reported 16K×100 case.
2604
+ """
2605
+ print("=== HIGH-COLUMN SCENARIO TEST ===\n")
2606
+
2607
+ # Test different high-column scenarios
2608
+ test_scenarios = [
2609
+ (1000, 60, "1K×60 columns"),
2610
+ (5000, 80, "5K×80 columns"),
2611
+ (16000, 100, "16K×100 columns (user scenario)"),
2612
+ (10000, 120, "10K×120 columns"),
2613
+ (50000, 200, "50K×200 columns (extreme)")
2614
+ ]
2615
+
2616
+ results = []
2617
+
2618
+ for n_rows, n_cols, description in test_scenarios:
2619
+ print(f"\n{'='*60}")
2620
+ print(f"TESTING: {description}")
2621
+ print('='*60)
2622
+
2623
+ try:
2624
+ # Create test data with many columns
2625
+ print(f"Creating test dataset with {n_rows:,} rows and {n_cols} columns...")
2626
+
2627
+ # Create a realistic high-column dataset
2628
+ np.random.seed(42)
2629
+ random.seed(42)
2630
+
2631
+ data = {}
2632
+
2633
+ # Add ID column (primary key)
2634
+ data['id'] = range(1, n_rows + 1)
2635
+
2636
+ # Add categorical columns of various cardinalities
2637
+ for i in range(min(20, n_cols - 1)):
2638
+ if i < 5:
2639
+ # Low cardinality categorical
2640
+ cardinality = min(10, n_rows // 100)
2641
+ elif i < 10:
2642
+ # Medium cardinality categorical
2643
+ cardinality = min(100, n_rows // 10)
2644
+ else:
2645
+ # Higher cardinality categorical
2646
+ cardinality = min(1000, n_rows // 5)
2647
+
2648
+ data[f'cat_{i}'] = [f'cat_{i}_val_{j % cardinality}' for j in range(n_rows)]
2649
+
2650
+ # Add numeric columns
2651
+ remaining_cols = n_cols - len(data)
2652
+ for i in range(remaining_cols):
2653
+ if i % 4 == 0:
2654
+ # Integer columns
2655
+ data[f'num_{i}'] = np.random.randint(1, 1000, n_rows)
2656
+ elif i % 4 == 1:
2657
+ # Float columns
2658
+ data[f'float_{i}'] = np.random.uniform(0, 100, n_rows)
2659
+ elif i % 4 == 2:
2660
+ # Boolean-like columns
2661
+ data[f'bool_{i}'] = np.random.choice([0, 1], n_rows)
2662
+ else:
2663
+ # Text columns
2664
+ data[f'text_{i}'] = [f'text_{j % 50}' for j in range(n_rows)]
2665
+
2666
+ df = pd.DataFrame(data)
2667
+
2668
+ # Ensure we have the right number of columns
2669
+ if len(df.columns) != n_cols:
2670
+ print(f" Adjusting columns: created {len(df.columns)}, target {n_cols}")
2671
+ while len(df.columns) < n_cols:
2672
+ col_name = f'extra_{len(df.columns)}'
2673
+ df[col_name] = np.random.randint(1, 100, n_rows)
2674
+ df = df.iloc[:, :n_cols] # Trim if too many
2675
+
2676
+ memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
2677
+ print(f"Memory usage: {memory_mb:.1f} MB")
2678
+
2679
+ # Test the high-column optimized version
2680
+ start_time = time.time()
2681
+
2682
+ print(f"\n🏗️ Running HIGH-COLUMN-OPTIMIZED analysis...")
2683
+ fd_results, key_results, n_rows_result, cols, max_combo, max_lhs, norm_tables = profile_high_column_optimized(
2684
+ df, max_combination_size=3, max_lhs_size=2
2685
+ )
2686
+
2687
+ analysis_time = time.time() - start_time
2688
+
2689
+ candidate_keys = [k for k in key_results if "Candidate Key" in k[3]]
2690
+
2691
+ print(f"\n✅ SUCCESS!")
2692
+ print(f" • Analysis time: {analysis_time:.2f} seconds")
2693
+ print(f" • Memory usage: {memory_mb:.1f} MB")
2694
+ print(f" • Processing rate: {n_rows / analysis_time:,.0f} rows/second")
2695
+ print(f" • Column processing rate: {n_cols / analysis_time:.1f} columns/second")
2696
+ print(f" • Found {len(fd_results)} functional dependencies")
2697
+ print(f" • Found {len(candidate_keys)} candidate keys")
2698
+
2699
+ # Performance assessment
2700
+ if analysis_time < 10:
2701
+ performance = "🔥 EXCELLENT"
2702
+ elif analysis_time < 30:
2703
+ performance = "✅ GOOD"
2704
+ elif analysis_time < 120:
2705
+ performance = "⚠️ ACCEPTABLE"
2706
+ else:
2707
+ performance = "❌ TOO SLOW"
2708
+
2709
+ print(f" • Performance: {performance}")
2710
+
2711
+ # Show some sample results
2712
+ if fd_results:
2713
+ print(f"\n🔍 Sample functional dependencies found:")
2714
+ for i, (lhs, rhs) in enumerate(fd_results[:3]):
2715
+ print(f" • {lhs} → {rhs}")
2716
+ if len(fd_results) > 3:
2717
+ print(f" ... and {len(fd_results) - 3} more")
2718
+
2719
+ if candidate_keys:
2720
+ print(f"\n🔑 Candidate keys found:")
2721
+ for cols_str, count, ratio, key_type in candidate_keys[:3]:
2722
+ print(f" • {cols_str} ({ratio} unique)")
2723
+ if len(candidate_keys) > 3:
2724
+ print(f" ... and {len(candidate_keys) - 3} more")
2725
+
2726
+ results.append({
2727
+ 'scenario': description,
2728
+ 'rows': n_rows,
2729
+ 'cols': n_cols,
2730
+ 'memory_mb': memory_mb,
2731
+ 'time': analysis_time,
2732
+ 'rows_per_sec': n_rows / analysis_time,
2733
+ 'cols_per_sec': n_cols / analysis_time,
2734
+ 'fds': len(fd_results),
2735
+ 'keys': len(candidate_keys),
2736
+ 'performance': performance,
2737
+ 'success': True
2738
+ })
2739
+
2740
+ except Exception as e:
2741
+ print(f"❌ FAILED: {str(e)}")
2742
+ import traceback
2743
+ traceback.print_exc()
2744
+
2745
+ results.append({
2746
+ 'scenario': description,
2747
+ 'rows': n_rows,
2748
+ 'cols': n_cols,
2749
+ 'memory_mb': 0,
2750
+ 'time': float('inf'),
2751
+ 'rows_per_sec': 0,
2752
+ 'cols_per_sec': 0,
2753
+ 'fds': 0,
2754
+ 'keys': 0,
2755
+ 'performance': "❌ FAILED",
2756
+ 'success': False
2757
+ })
2758
+
2759
+ # Summary
2760
+ print(f"\n{'='*80}")
2761
+ print("HIGH-COLUMN OPTIMIZATION SUMMARY")
2762
+ print('='*80)
2763
+ print(f"{'Scenario':<25} {'Memory':<8} {'Time':<8} {'Rows/s':<8} {'Cols/s':<8} {'FDs':<4} {'Keys':<4} {'Performance'}")
2764
+ print("-" * 80)
2765
+
2766
+ for result in results:
2767
+ scenario = result['scenario'][:24]
2768
+ memory = f"{result['memory_mb']:.1f}MB"
2769
+ time_str = f"{result['time']:.1f}s" if result['time'] != float('inf') else "FAIL"
2770
+ rows_rate = f"{result['rows_per_sec']:,.0f}" if result['success'] else "N/A"
2771
+ cols_rate = f"{result['cols_per_sec']:.1f}" if result['success'] else "N/A"
2772
+ fds = str(result['fds'])
2773
+ keys = str(result['keys'])
2774
+ performance = result['performance'].split()[0] # Just the emoji
2775
+
2776
+ print(f"{scenario:<25} {memory:<8} {time_str:<8} {rows_rate:<8} {cols_rate:<8} {fds:<4} {keys:<4} {performance}")
2777
+
2778
+ # Analysis
2779
+ successful = [r for r in results if r['success']]
2780
+ if successful:
2781
+ print(f"\n🎯 PERFORMANCE ANALYSIS:")
2782
+
2783
+ # Check if user scenario (16K×100) was successful
2784
+ user_scenario = next((r for r in successful if '16K×100' in r['scenario']), None)
2785
+ if user_scenario:
2786
+ print(f" ✅ User scenario (16K×100 columns) completed in {user_scenario['time']:.1f} seconds")
2787
+ if user_scenario['time'] < 30:
2788
+ print(f" 🎉 This should be much faster on your smaller machine!")
2789
+ elif user_scenario['time'] < 120:
2790
+ print(f" 👍 This should provide reasonable performance on your smaller machine")
2791
+ else:
2792
+ print(f" ⚠️ May still be slow on smaller machines - consider further optimization")
2793
+
2794
+ avg_time = np.mean([r['time'] for r in successful])
2795
+ avg_cols_per_sec = np.mean([r['cols_per_sec'] for r in successful])
2796
+
2797
+ print(f" • Average analysis time: {avg_time:.1f} seconds")
2798
+ print(f" • Average column processing rate: {avg_cols_per_sec:.1f} columns/second")
2799
+ print(f" • Successfully handled datasets up to {max(r['cols'] for r in successful)} columns")
2800
+
2801
+ # Specific optimizations applied
2802
+ print(f"\n💡 HIGH-COLUMN OPTIMIZATIONS APPLIED:")
2803
+ print(f" • Intelligent column selection (top 30 LHS candidates)")
2804
+ print(f" • Aggressive sampling (max 2000 rows for analysis)")
2805
+ print(f" • Limited combination testing (max 200 FD tests)")
2806
+ print(f" • Prioritized high-cardinality columns for keys")
2807
+ print(f" • Early termination for very wide datasets (>80 columns)")
2808
+
2809
+ return results
2810
+
2811
+
2812
+ # Test functions to run when script is executed directly
2813
+ if __name__ == "__main__":
2814
+ if len(sys.argv) > 1:
2815
+ if sys.argv[1] == "benchmark":
2816
+ benchmark_performance()
2817
+ elif sys.argv[1] == "comprehensive":
2818
+ comprehensive_benchmark()
2819
+ elif sys.argv[1] == "small":
2820
+ test_small_data_optimizations()
2821
+ elif sys.argv[1] == "hyper":
2822
+ test_hyper_optimized_scenario()
2823
+ elif sys.argv[1] == "bigdata":
2824
+ test_big_data_scenario()
2825
+ elif sys.argv[1] == "test":
2826
+ test_realistic_scenario()
2827
+ elif sys.argv[1] == "demo":
2828
+ demo_performance_improvements()
2829
+ elif sys.argv[1] == "highcol":
2830
+ test_high_column_scenario()
2831
+ else:
2832
+ test_profile_keys()
2833
+ else:
2834
+ test_profile_keys()
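The dispatcher accepts benchmark, comprehensive, small, hyper, bigdata, test, demo and highcol as the first command-line argument, and every scenario can also be driven programmatically. A minimal sketch, assuming the functions defined above are in scope (for example after importing this module):

```python
# Programmatic equivalent of the "highcol" command-line mode dispatched above.
results = test_high_column_scenario()
completed = [r for r in results if r["success"]]
print(f"{len(completed)}/{len(results)} high-column scenarios completed")
```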