sqlshell-0.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
sqlshell/utils/profile_keys.py
@@ -0,0 +1,2834 @@
import sys
import itertools
import pandas as pd
import numpy as np
import random
import time
import math
from collections import defaultdict
from PyQt6.QtWidgets import (
    QApplication, QWidget, QVBoxLayout, QLabel, QTableWidget, QTableWidgetItem, QHeaderView, QTabWidget, QMainWindow
)
from PyQt6.QtCore import Qt


def estimate_computation_cost(n_rows, n_cols, max_combination_size, max_lhs_size):
    """
    Estimate computational cost to decide on sampling strategy.
    Returns (estimated_seconds, should_sample, sample_size)
    """
    # Special handling for high-column datasets - these are computationally expensive
    if n_cols > 50:
        # Very aggressive limits for high-column datasets
        print(f" High-column dataset detected ({n_cols} columns) - using aggressive optimization")
        return float('inf'), True, min(5000, max(1000, n_rows // 20))

    # Base cost factors
    fd_combinations = sum(math.comb(n_cols, i) for i in range(1, max_lhs_size + 1))
    key_combinations = sum(math.comb(n_cols, i) for i in range(1, max_combination_size + 1))

    # Rough estimate: each combination costs O(n_rows * log(n_rows)) for groupby
    fd_cost = fd_combinations * n_rows * math.log(max(n_rows, 2)) * 1e-6
    key_cost = key_combinations * n_rows * math.log(max(n_rows, 2)) * 1e-6

    total_cost = fd_cost + key_cost

    # Sampling thresholds
    if total_cost > 30:  # More than 30 seconds estimated
        return total_cost, True, min(50000, max(10000, n_rows // 10))
    elif total_cost > 10:  # More than 10 seconds estimated
        return total_cost, True, min(100000, max(20000, n_rows // 5))
    else:
        return total_cost, False, n_rows
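
# ---------------------------------------------------------------------------
# Editor's note: the sketch below is an illustrative usage example added for
# this review; it is NOT part of the released wheel. The table shape is a
# hypothetical assumption, chosen so the rough estimate crosses the 30-second
# threshold defined above and sampling is recommended.
def _demo_estimate_computation_cost():
    cost, should_sample, sample_size = estimate_computation_cost(
        n_rows=2_000_000, n_cols=12, max_combination_size=2, max_lhs_size=2
    )
    print(f"estimated cost: {cost:.1f}s, sample: {should_sample}, size: {sample_size}")
# ---------------------------------------------------------------------------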


def sample_dataframe_intelligently(df, sample_size, random_state=42):
    """
    Sample dataframe while preserving data characteristics for key analysis.
    """
    try:
        if len(df) <= sample_size:
            return df, False

        # Ensure sample_size is valid
        sample_size = min(sample_size, len(df))
        if sample_size <= 0:
            return df.head(100) if len(df) > 100 else df, True

        # Strategy: Take a mix of random sample and important patterns
        np.random.seed(random_state)

        # 1. Take a random sample (80% of sample)
        random_sample_size = max(1, int(sample_size * 0.8))
        random_sample_size = min(random_sample_size, len(df))

        try:
            random_indices = np.random.choice(len(df), size=random_sample_size, replace=False)
        except ValueError:
            # Fallback if numpy choice fails
            random_indices = np.random.permutation(len(df))[:random_sample_size]

        # 2. Add unique value representatives (20% of sample)
        remaining_sample = sample_size - random_sample_size
        unique_representatives = []

        if remaining_sample > 0:
            for col in df.columns:
                if len(unique_representatives) >= remaining_sample:
                    break
                try:
                    # Get indices of unique values not already in random sample
                    unique_values = df[col].drop_duplicates()
                    unique_indices = unique_values.index.tolist()
                    new_indices = [i for i in unique_indices if i not in random_indices and i < len(df)]
                    unique_representatives.extend(new_indices[:remaining_sample - len(unique_representatives)])
                except Exception:
                    continue  # Skip problematic columns

        # Combine samples and ensure all indices are valid
        all_indices = list(set(random_indices) | set(unique_representatives))
        all_indices = [i for i in all_indices if 0 <= i < len(df)]  # Bounds check
        all_indices = all_indices[:sample_size]  # Limit to sample size

        if not all_indices:
            # Fallback: just take first sample_size rows
            return df.head(sample_size), True

        try:
            sampled_df = df.iloc[all_indices].reset_index(drop=True)
            return sampled_df, True
        except (IndexError, KeyError):
            # Final fallback: simple head sampling
            return df.head(sample_size), True

    except Exception as e:
        print(f"Warning: Error in intelligent sampling: {e}. Using simple head sampling.")
        safe_sample_size = min(sample_size, len(df))
        return df.head(safe_sample_size), True
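
# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, NOT part of the released wheel.
# It shows the (sampled_df, was_sampled) return contract on a toy frame with a
# default RangeIndex, which is the case the positional index logic above
# assumes.
def _demo_sample_dataframe_intelligently():
    demo_df = pd.DataFrame({"id": range(1000), "group": [i % 7 for i in range(1000)]})
    sampled_df, was_sampled = sample_dataframe_intelligently(demo_df, sample_size=100)
    print(len(sampled_df), was_sampled)  # about 100 rows, True
# ---------------------------------------------------------------------------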


def find_functional_dependencies_ultra_optimized(df: pd.DataFrame, max_lhs_size: int = 2):
    """
    Ultra-optimized functional dependency discovery for large datasets.
    Maintains correctness while improving performance through smart sampling and caching.
    """
    n_rows = len(df)
    cols = list(df.columns)

    if n_rows == 0 or len(cols) < 2:
        return []

    # Only sample for very large datasets to maintain accuracy for smaller ones
    original_df = df
    was_sampled = False
    if n_rows > 50000:  # Only sample for very large datasets
        cost, should_sample, sample_size = estimate_computation_cost(n_rows, len(cols), 3, max_lhs_size)
        if should_sample:
            df, was_sampled = sample_dataframe_intelligently(df, sample_size)
            n_rows = len(df)
            print(f" Sampled {n_rows} rows from {len(original_df)} for FD analysis")

    fds = []

    # Pre-compute all cardinalities once
    col_cardinalities = {col: df[col].nunique() for col in cols}

    # Use the same filtering logic as the original but with pre-computed cardinalities
    # Don't be too aggressive with filtering to maintain consistency
    non_unique_cols = [col for col in cols if col_cardinalities[col] < n_rows]

    # Group cache for efficient reuse
    group_cache = {}

    # Apply combination limits only for very large datasets
    if n_rows > 100000:
        max_combinations_per_size = {1: min(100, len(cols)), 2: min(200, len(cols) ** 2)}
    else:
        max_combinations_per_size = {1: len(cols), 2: len(cols) ** 2}  # No limits for smaller datasets

    for size in range(1, max_lhs_size + 1):
        # Use same logic as optimized version for consistency
        lhs_candidates = non_unique_cols if size == 1 else cols

        lhs_combinations = list(itertools.combinations(lhs_candidates, size))

        # Only limit combinations for very large datasets
        if n_rows > 100000:
            max_combos = max_combinations_per_size.get(size, len(lhs_combinations))
            if len(lhs_combinations) > max_combos:
                # Prioritize by cardinality (lower cardinality = more likely to be determinant)
                lhs_combinations = sorted(lhs_combinations,
                                          key=lambda x: sum(col_cardinalities[col] for col in x))[:max_combos]

        for lhs in lhs_combinations:
            lhs_tuple = tuple(lhs)

            # Use cached groupby if available
            if lhs_tuple not in group_cache:
                try:
                    grouped = df.groupby(list(lhs), sort=False, dropna=False)
                    group_sizes = grouped.size()
                    group_cache[lhs_tuple] = (grouped, group_sizes)
                except Exception:
                    continue  # Skip problematic groupings
            else:
                grouped, group_sizes = group_cache[lhs_tuple]

            # Use same logic as optimized version
            n_groups = len(group_sizes)
            if group_sizes.max() == 1:
                continue  # No interesting dependencies possible

            # Test all RHS candidates like the original, but with early termination heuristics
            for rhs in cols:
                if rhs in lhs:
                    continue

                # Only apply early termination for large datasets
                if n_rows > 100000 and col_cardinalities[rhs] > n_groups:
                    continue

                try:
                    # Check FD using same logic as optimized version
                    rhs_per_group = grouped[rhs].nunique()
                    if (rhs_per_group <= 1).all():
                        fds.append((lhs, rhs))
                except Exception:
                    continue  # Skip problematic columns

    return fds
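
# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, NOT part of the released wheel.
# On a toy frame where each customer_id always maps to the same city, the
# result contains (('customer_id',), 'city') - i.e. customer_id functionally
# determines city (and, symmetrically, city determines customer_id).
def _demo_find_functional_dependencies():
    demo_df = pd.DataFrame({
        "customer_id": [1, 1, 2, 2],
        "city": ["Oslo", "Oslo", "Bergen", "Bergen"],
        "amount": [10, 20, 30, 40],
    })
    fds = find_functional_dependencies_ultra_optimized(demo_df, max_lhs_size=1)
    print(fds)  # e.g. [(('customer_id',), 'city'), (('city',), 'customer_id')]
# ---------------------------------------------------------------------------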


def find_candidate_keys_ultra_optimized(df: pd.DataFrame, max_combination_size: int = 2):
    """
    Ultra-optimized candidate key discovery for large datasets.
    Maintains correctness while improving performance.
    """
    n_rows = len(df)
    cols = list(df.columns)

    if n_rows == 0:
        return [], [], []

    # Only sample for very large datasets
    original_df = df
    was_sampled = False
    if n_rows > 50000:  # Only sample for very large datasets
        cost, should_sample, sample_size = estimate_computation_cost(n_rows, len(cols), max_combination_size, 2)
        if should_sample:
            df, was_sampled = sample_dataframe_intelligently(df, sample_size)
            n_rows = len(df)
            print(f" Sampled {n_rows} rows from {len(original_df)} for key analysis")

    all_keys = []

    # Check single columns first (same as optimized version)
    single_column_keys = []
    col_cardinalities = {}

    for col in cols:
        cardinality = df[col].nunique()
        col_cardinalities[col] = cardinality
        if cardinality == n_rows:
            single_column_keys.append((col,))
            all_keys.append((col,))

    # Early termination only for single-column case if we have keys
    if single_column_keys and max_combination_size == 1:
        return all_keys, single_column_keys, []

    # Apply conservative limits only for very large datasets
    if n_rows > 100000:
        max_combination_size = min(max_combination_size, 3)
        max_combinations_to_test = min(500, math.comb(len(cols), 2))
    else:
        max_combinations_to_test = float('inf')  # No limits for smaller datasets

    # Multi-column key discovery
    for size in range(2, max_combination_size + 1):
        if size > len(cols):
            break

        combinations = list(itertools.combinations(cols, size))

        # Only limit and prioritize for very large datasets
        if n_rows > 100000 and len(combinations) > max_combinations_to_test:
            # Prioritize combinations by likelihood of being keys
            combinations = sorted(combinations,
                                  key=lambda x: sum(col_cardinalities.get(col, n_rows) for col in x))
            combinations = combinations[:max_combinations_to_test]

        size_keys = []
        tested_count = 0

        for combo in combinations:
            # Skip if contains single-column key
            if any((col,) in single_column_keys for col in combo):
                continue

            # Skip if subset is already a key (same logic as optimized)
            is_superkey = False
            for subset_size in range(1, size):
                for subset in itertools.combinations(combo, subset_size):
                    if subset in all_keys:
                        is_superkey = True
                        break
                if is_superkey:
                    break

            if is_superkey:
                continue

            # Check uniqueness using same method as optimized
            try:
                unique_count = len(df[list(combo)].drop_duplicates())
                if unique_count == n_rows:
                    size_keys.append(combo)
                    all_keys.append(combo)
            except Exception:
                continue  # Skip problematic combinations

            tested_count += 1
            # Only apply testing limits for very large datasets
            if n_rows > 100000 and tested_count >= max_combinations_to_test // (size * size):
                break

        # Early termination if no keys found and we have smaller keys
        if not size_keys and all_keys:
            break

    # Classify keys (same logic as optimized)
    candidate_keys = []
    superkeys = []

    for key in all_keys:
        is_candidate = True
        for other_key in all_keys:
            if len(other_key) < len(key) and set(other_key).issubset(set(key)):
                is_candidate = False
                break

        if is_candidate:
            candidate_keys.append(key)
        else:
            superkeys.append(key)

    return all_keys, candidate_keys, superkeys
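
# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, NOT part of the released wheel.
# order_id alone is unique, so it is reported as a single-column candidate
# key; pairs containing it are pruned up front, while (customer_id, line_no)
# is discovered as a minimal two-column key.
def _demo_find_candidate_keys():
    demo_df = pd.DataFrame({
        "order_id": [1, 2, 3, 4],
        "customer_id": [10, 10, 20, 20],
        "line_no": [1, 2, 1, 2],
    })
    all_keys, candidate_keys, superkeys = find_candidate_keys_ultra_optimized(
        demo_df, max_combination_size=2
    )
    print(candidate_keys)  # [('order_id',), ('customer_id', 'line_no')]
# ---------------------------------------------------------------------------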


def profile_ultra_optimized(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
    """
    Ultra-optimized profile function for large datasets.
    """
    start_time = time.time()
    n_rows = len(df)
    cols = list(df.columns)

    print(f"Starting analysis of {n_rows:,} rows × {len(cols)} columns...")

    # Intelligent parameter adjustment based on data size
    if n_rows > 100000:
        max_combination_size = min(max_combination_size, 2)
        max_lhs_size = min(max_lhs_size, 2)
        print(f" Large dataset detected - limiting analysis to combinations of size {max_combination_size}")
    elif n_rows > 50000:
        max_combination_size = min(max_combination_size, 3)
        max_lhs_size = min(max_lhs_size, 2)

    # Discover functional dependencies
    fd_start = time.time()
    fds = find_functional_dependencies_ultra_optimized(df, max_lhs_size)
    fd_time = time.time() - fd_start
    print(f" FD discovery completed in {fd_time:.2f}s - found {len(fds)} dependencies")

    fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]

    # Discover keys
    key_start = time.time()
    all_keys, candidate_keys, superkeys = find_candidate_keys_ultra_optimized(df, max_combination_size)
    key_time = time.time() - key_start
    print(f" Key discovery completed in {key_time:.2f}s - found {len(candidate_keys)} candidate keys")

    # Efficient result preparation
    results = []
    single_col_uniqueness = {col: df[col].nunique() for col in cols}

    # Process results with smart computation limiting
    combinations_tested = 0
    max_combinations_total = min(1000, sum(math.comb(len(cols), i) for i in range(1, max_combination_size + 1)))

    for size in range(1, max_combination_size + 1):
        for combo in itertools.combinations(cols, size):
            if combinations_tested >= max_combinations_total:
                break

            if len(combo) == 1:
                unique_count = single_col_uniqueness[combo[0]]
            elif combo in all_keys:
                # For keys, we know they're unique
                unique_count = n_rows
            elif size <= 2:  # Only compute for small combinations
                try:
                    unique_count = len(df[list(combo)].drop_duplicates())
                except Exception:
                    unique_count = min(n_rows, sum(single_col_uniqueness[col] for col in combo) // len(combo))
            else:
                # Estimate for larger combinations
                unique_count = min(n_rows, sum(single_col_uniqueness[col] for col in combo) // len(combo))

            unique_ratio = unique_count / n_rows if n_rows > 0 else 0
            is_key = combo in all_keys
            is_candidate = combo in candidate_keys
            is_superkey = combo in superkeys

            key_type = ""
            if is_candidate:
                key_type = "★ Candidate Key"
            elif is_superkey:
                key_type = "⊃ Superkey"

            results.append((combo, unique_count, unique_ratio, is_key, key_type))
            combinations_tested += 1

    # Sort efficiently
    results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
    key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
                   for c, u, _, _, k in results]

    # Generate normalized tables
    normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)

    total_time = time.time() - start_time
    print(f" Total analysis completed in {total_time:.2f}s")

    return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables


def create_stress_test_data(size, n_cols=None, complexity='medium'):
    """
    Create stress test data with different complexity levels.
    """
    random.seed(42)
    np.random.seed(42)

    if n_cols is None:
        if complexity == 'simple':
            n_cols = min(8, max(4, int(math.log10(size))))
        elif complexity == 'medium':
            n_cols = min(15, max(6, int(math.log10(size) * 1.5)))
        else:  # complex
            n_cols = min(25, max(10, int(math.log10(size) * 2)))

    print(f"Creating stress test data: {size:,} rows × {n_cols} columns ({complexity} complexity)")

    data = {}

    # Create ID column (always unique)
    data['id'] = range(1, size + 1)

    # Create categorical columns with different cardinalities
    if complexity == 'simple':
        cardinalities = [10, 20, 50, min(100, size // 10)]
    elif complexity == 'medium':
        cardinalities = [5, 10, 25, 50, 100, min(200, size // 10), min(500, size // 5)]
    else:  # complex
        cardinalities = [3, 5, 10, 20, 50, 100, 200, min(500, size // 10), min(1000, size // 5)]

    for i in range(1, min(n_cols, len(cardinalities) + 1)):
        card = cardinalities[min(i - 1, len(cardinalities) - 1)]
        data[f'cat_{i}'] = [f'cat_{i}_val_{j % card}' for j in range(size)]

    # Add some functional dependencies
    if n_cols > 4:
        # category -> subcategory
        data['category'] = [f'Category_{i % 5}' for i in range(size)]
        data['subcategory'] = [f'Sub_{data["category"][i]}_{i % 3}' for i in range(size)]

    if n_cols > 6:
        # Create some numeric columns with dependencies
        data['price'] = [random.randint(10, 1000) for _ in range(size)]
        data['tax_rate'] = [0.1 if data['category'][i] == 'Category_0' else 0.15 for i in range(size)]
        data['total_price'] = [int(data['price'][i] * (1 + data['tax_rate'][i])) for i in range(size)]

    # Fill remaining columns with random data
    remaining_cols = n_cols - len(data)
    for i in range(remaining_cols):
        col_name = f'random_{i}'
        data[col_name] = [random.randint(1, min(1000, size // 2)) for _ in range(size)]

    return pd.DataFrame(data)


def comprehensive_benchmark():
    """
    Comprehensive benchmark for large dataset performance.
    """
    print("=== COMPREHENSIVE LARGE DATA BENCHMARK ===\n")

    # Test different dataset sizes and complexities
    test_configs = [
        (1000, 'simple'),
        (5000, 'simple'),
        (10000, 'medium'),
        (50000, 'medium'),
        (100000, 'medium'),
        (500000, 'complex'),
        (1000000, 'complex'),
    ]

    results = []

    for size, complexity in test_configs:
        print(f"\n{'='*60}")
        print(f"TESTING: {size:,} rows with {complexity} complexity")
        print('='*60)

        try:
            # Create test data
            df = create_stress_test_data(size, complexity=complexity)

            # Test ultra-optimized version
            print("\n⚡ Running ULTRA-OPTIMIZED version...")
            start_time = time.time()
            ultra_results = profile_ultra_optimized(df, max_combination_size=3, max_lhs_size=2)
            ultra_time = time.time() - start_time

            # Test old optimized version for comparison (only for smaller datasets)
            if size <= 10000:
                print("\n🐌 Running OLD-OPTIMIZED version...")
                start_time = time.time()
                old_results = profile_optimized(df, max_combination_size=3, max_lhs_size=2)
                old_time = time.time() - start_time
                speedup = old_time / ultra_time if ultra_time > 0 else float('inf')
            else:
                print("\n⏭️ Skipping old version (too slow for large data)")
                old_time = None
                speedup = None

            # Memory usage estimation
            memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024

            results.append({
                'size': size,
                'complexity': complexity,
                'columns': len(df.columns),
                'memory_mb': memory_mb,
                'ultra_time': ultra_time,
                'old_time': old_time,
                'speedup': speedup,
                'fds_found': len(ultra_results[0]),
                'keys_found': len([k for k in ultra_results[1] if "Candidate Key" in k[3]]),
                'success': True
            })

            print(f"\n📊 RESULTS:")
            print(f" Dataset: {size:,} rows × {len(df.columns)} cols ({memory_mb:.1f} MB)")
            print(f" Ultra-optimized: {ultra_time:.3f} seconds")
            if old_time:
                print(f" Old optimized: {old_time:.3f} seconds")
                print(f" Speedup: {speedup:.2f}x")
            print(f" Found: {len(ultra_results[0])} FDs, {len([k for k in ultra_results[1] if 'Candidate Key' in k[3]])} keys")

            # Performance targets
            if ultra_time < 5:
                print(" ✅ Excellent performance")
            elif ultra_time < 15:
                print(" ✅ Good performance")
            elif ultra_time < 60:
                print(" ⚠️ Acceptable performance")
            else:
                print(" ❌ Needs further optimization")

        except Exception as e:
            print(f" ❌ FAILED: {e}")
            results.append({
                'size': size,
                'complexity': complexity,
                'columns': '?',
                'memory_mb': 0,
                'ultra_time': float('inf'),
                'old_time': None,
                'speedup': None,
                'fds_found': 0,
                'keys_found': 0,
                'success': False
            })

    # Print comprehensive summary
    print(f"\n{'='*80}")
    print("COMPREHENSIVE BENCHMARK SUMMARY")
    print('='*80)
    print(f"{'Size':<8} {'Complexity':<10} {'Cols':<5} {'Memory':<8} {'Time':<8} {'Speedup':<8} {'FDs':<4} {'Keys':<4} {'Status'}")
    print("-" * 80)

    for result in results:
        size = f"{result['size']:,}"
        complexity = result['complexity']
        cols = str(result['columns'])
        memory = f"{result['memory_mb']:.1f}MB"
        time_str = f"{result['ultra_time']:.2f}s" if result['ultra_time'] != float('inf') else "FAIL"
        speedup = f"{result['speedup']:.1f}x" if result['speedup'] else "N/A"
        fds = str(result['fds_found'])
        keys = str(result['keys_found'])
        status = "✅" if result['success'] else "❌"

        print(f"{size:<8} {complexity:<10} {cols:<5} {memory:<8} {time_str:<8} {speedup:<8} {fds:<4} {keys:<4} {status}")

    # Performance analysis
    successful_results = [r for r in results if r['success']]
    if successful_results:
        print(f"\n🎯 PERFORMANCE ANALYSIS:")
        print(f" • Successfully processed up to {max(r['size'] for r in successful_results):,} rows")
        print(f" • Average time for datasets under 100K: {np.mean([r['ultra_time'] for r in successful_results if r['size'] < 100000]):.2f}s")
        print(f" • Largest dataset processed: {max(r['memory_mb'] for r in successful_results):.1f} MB")

        # Speed improvements
        speed_improvements = [r['speedup'] for r in successful_results if r['speedup'] and r['speedup'] != float('inf')]
        if speed_improvements:
            print(f" • Average speedup over old version: {np.mean(speed_improvements):.1f}x")

    return results


def find_functional_dependencies_optimized(df: pd.DataFrame, max_lhs_size: int = 2):
    """
    Highly optimized functional dependency discovery.
    Main optimizations:
    1. Early termination for trivial cases
    2. Efficient groupby operations
    3. Smart filtering to avoid checking impossible FDs
    """
    fds = []
    cols = list(df.columns)
    n_rows = len(df)

    if n_rows == 0 or len(cols) < 2:
        return fds

    # Pre-compute column cardinalities
    col_cardinalities = {col: df[col].nunique() for col in cols}

    # Skip columns that are unique (they trivially determine everything)
    non_unique_cols = [col for col in cols if col_cardinalities[col] < n_rows]

    # Cache groupby results to avoid recomputation
    groupby_cache = {}

    for size in range(1, max_lhs_size + 1):
        # Only consider non-unique columns for LHS
        lhs_candidates = non_unique_cols if size == 1 else cols

        for lhs in itertools.combinations(lhs_candidates, size):
            lhs_tuple = tuple(lhs)

            # Use cached groupby if available
            if lhs_tuple in groupby_cache:
                grouped = groupby_cache[lhs_tuple]
            else:
                # Use pandas groupby which is highly optimized
                grouped = df.groupby(list(lhs), sort=False, dropna=False)
                groupby_cache[lhs_tuple] = grouped

            # Get group info efficiently
            group_info = grouped.size()
            n_groups = len(group_info)

            # If all groups have size 1, skip (no interesting FDs)
            if group_info.max() == 1:
                continue

            for rhs in cols:
                if rhs in lhs:
                    continue

                # Remove the overly aggressive early termination that was filtering out valid FDs
                # The original algorithm doesn't have this filter, so we shouldn't either

                # Check if RHS is functionally determined by LHS
                # Count unique RHS values per group
                try:
                    rhs_per_group = grouped[rhs].nunique()

                    # FD holds if every group has at most 1 unique RHS value
                    if (rhs_per_group <= 1).all():
                        fds.append((lhs, rhs))
                except Exception:
                    continue  # Skip problematic columns

    return fds


def find_candidate_keys_optimized(df: pd.DataFrame, max_combination_size: int = 2):
    """
    Highly optimized candidate key discovery.
    Main optimizations:
    1. Early termination when smaller keys are found
    2. Efficient uniqueness checking with drop_duplicates
    3. Smart pruning of superkey candidates
    """
    n_rows = len(df)
    cols = list(df.columns)

    if n_rows == 0:
        return [], [], []

    all_keys = []

    # Check single columns first (most common case)
    single_column_keys = []
    for col in cols:
        if df[col].nunique() == n_rows:
            single_column_keys.append((col,))
            all_keys.append((col,))

    # If we found single-column keys, we can stop here for many use cases
    # Multi-column keys would be superkeys
    if single_column_keys and max_combination_size == 1:
        return all_keys, single_column_keys, []

    # For multi-column combinations, use efficient approach
    for size in range(2, max_combination_size + 1):
        size_keys = []

        for combo in itertools.combinations(cols, size):
            # Skip if any single column in combo is already a key
            if any((col,) in single_column_keys for col in combo):
                continue

            # Skip if any smaller subset is already a key
            is_superkey = False
            for subset_size in range(1, size):
                for subset in itertools.combinations(combo, subset_size):
                    if subset in all_keys:
                        is_superkey = True
                        break
                if is_superkey:
                    break

            if is_superkey:
                continue

            # Check uniqueness using efficient drop_duplicates
            if len(df[list(combo)].drop_duplicates()) == n_rows:
                size_keys.append(combo)
                all_keys.append(combo)

        # If no keys found at this size and we have smaller keys, we can stop
        if not size_keys and all_keys:
            break

    # Separate candidate keys from superkeys
    candidate_keys = []
    superkeys = []

    for key in all_keys:
        is_candidate = True
        for other_key in all_keys:
            if len(other_key) < len(key) and set(other_key).issubset(set(key)):
                is_candidate = False
                break

        if is_candidate:
            candidate_keys.append(key)
        else:
            superkeys.append(key)

    return all_keys, candidate_keys, superkeys


def profile_optimized(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
    """
    Highly optimized profile function.
    Main optimizations:
    1. Reduced redundant computations
    2. Early termination strategies
    3. Efficient pandas operations
    """
    n_rows = len(df)
    cols = list(df.columns)

    # Use optimized algorithms
    fds = find_functional_dependencies_optimized(df, max_lhs_size)
    fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]

    all_keys, candidate_keys, superkeys = find_candidate_keys_optimized(df, max_combination_size)

    # Prepare results efficiently
    results = []

    # Pre-compute uniqueness for single columns
    single_col_uniqueness = {col: df[col].nunique() for col in cols}

    for size in range(1, max_combination_size + 1):
        for combo in itertools.combinations(cols, size):
            if len(combo) == 1:
                unique_count = single_col_uniqueness[combo[0]]
            else:
                # Only compute for combinations we need
                if combo in all_keys or size <= 2:  # Always compute for size 1,2
                    unique_count = len(df[list(combo)].drop_duplicates())
                else:
                    # For larger non-keys, we can estimate or skip
                    unique_count = min(n_rows,
                                       sum(single_col_uniqueness[col] for col in combo) // len(combo))

            unique_ratio = unique_count / n_rows if n_rows > 0 else 0
            is_key = combo in all_keys
            is_candidate = combo in candidate_keys
            is_superkey = combo in superkeys

            key_type = ""
            if is_candidate:
                key_type = "★ Candidate Key"
            elif is_superkey:
                key_type = "⊃ Superkey"

            results.append((combo, unique_count, unique_ratio, is_key, key_type))

    results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
    key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
                   for c, u, _, _, k in results]

    normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)

    return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables


def propose_normalized_tables(cols, candidate_keys, fds):
    """
    Propose a set of normalized tables based on functional dependencies.
    Uses a simplified approach to create 3NF tables.

    Parameters:
    - cols: list of all columns
    - candidate_keys: list of candidate keys
    - fds: list of functional dependencies as (lhs, rhs) tuples

    Returns:
    - List of proposed tables as (table_name, primary_key, attributes) tuples
    """
    # Start with a set of all attributes
    all_attrs = set(cols)
    proposed_tables = []

    # Group FDs by their determinants (LHS)
    determinant_groups = {}
    for lhs, rhs in fds:
        lhs_key = tuple(sorted(lhs))
        if lhs_key not in determinant_groups:
            determinant_groups[lhs_key] = []
        determinant_groups[lhs_key].append(rhs)

    # Create tables for each determinant group
    table_counter = 1
    for lhs, rhs_list in determinant_groups.items():
        table_attrs = set(lhs) | set(rhs_list)
        if table_attrs:  # Skip empty tables
            table_name = f"Table_{table_counter}"
            primary_key = ", ".join(lhs)
            attributes = list(table_attrs)
            proposed_tables.append((table_name, primary_key, attributes))
            table_counter += 1

    # Create a table for any remaining attributes not in any FD
    # or create a table with a candidate key if none exists yet
    used_attrs = set()
    for _, _, attrs in proposed_tables:
        used_attrs.update(attrs)

    remaining_attrs = all_attrs - used_attrs
    if remaining_attrs:
        # If we have a candidate key, use it for remaining attributes
        for key in candidate_keys:
            key_set = set(key)
            if key_set & remaining_attrs:  # If key has overlap with remaining attrs
                table_name = f"Table_{table_counter}"
                primary_key = ", ".join(key)
                attributes = list(remaining_attrs | key_set)
                proposed_tables.append((table_name, primary_key, attributes))
                break
        else:  # No suitable candidate key
            table_name = f"Table_{table_counter}"
            primary_key = "id (suggested)"
            attributes = list(remaining_attrs)
            proposed_tables.append((table_name, primary_key, attributes))

    return proposed_tables
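
# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, NOT part of the released wheel.
# Given one FD (customer_id -> customer_name) and order_id as a candidate key,
# the helper proposes a customer lookup table plus a table for the remaining
# attribute keyed by order_id.
def _demo_propose_normalized_tables():
    cols = ["order_id", "customer_id", "customer_name"]
    candidate_keys = [("order_id",)]
    fds = [(("customer_id",), "customer_name")]
    for name, pk, attrs in propose_normalized_tables(cols, candidate_keys, fds):
        print(name, "| PK:", pk, "| attrs:", sorted(attrs))
# ---------------------------------------------------------------------------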


# Keep the original functions for comparison
def find_functional_dependencies(df: pd.DataFrame, max_lhs_size: int = 2):
    """
    Original functional dependency discovery function (for comparison).
    """
    fds = []
    cols = list(df.columns)
    n_rows = len(df)

    for size in range(1, max_lhs_size + 1):
        for lhs in itertools.combinations(cols, size):
            # for each potential dependent attribute not in lhs
            lhs_df = df[list(lhs)]
            # group by lhs and count distinct values of each other column
            grouped = df.groupby(list(lhs))
            for rhs in cols:
                if rhs in lhs:
                    continue
                # Check if for each group, rhs has only one distinct value
                distinct_counts = grouped[rhs].nunique(dropna=False)
                if (distinct_counts <= 1).all():
                    fds.append((lhs, rhs))
    return fds


def profile_original(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
    """
    Original profile function (for comparison).
    """
    n_rows = len(df)
    cols = list(df.columns)

    # Discover functional dependencies
    fds = find_functional_dependencies(df, max_lhs_size)

    # Prepare FD results
    fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]

    # Profile keys (by uniqueness)
    all_keys = []
    for size in range(1, max_combination_size + 1):
        for combo in itertools.combinations(cols, size):
            unique_count = df.drop_duplicates(subset=combo).shape[0]
            unique_ratio = unique_count / n_rows
            is_key = unique_count == n_rows
            if is_key:
                all_keys.append(combo)

    # Distinguish between candidate keys and superkeys
    candidate_keys = []
    superkeys = []

    for key in all_keys:
        is_candidate = True
        # Check if any proper subset of this key is also a key
        for i in range(1, len(key)):
            for subset in itertools.combinations(key, i):
                if subset in all_keys:
                    is_candidate = False
                    break
            if not is_candidate:
                break

        if is_candidate:
            candidate_keys.append(key)
        else:
            superkeys.append(key)

    # Prepare results for all keys (both candidate keys and superkeys)
    results = []
    for size in range(1, max_combination_size + 1):
        for combo in itertools.combinations(cols, size):
            unique_count = df.drop_duplicates(subset=combo).shape[0]
            unique_ratio = unique_count / n_rows
            is_key = combo in all_keys
            is_candidate = combo in candidate_keys
            is_superkey = combo in superkeys

            # Use icons for different key types
            key_type = ""
            if is_candidate:
                key_type = "★ Candidate Key"  # Star for candidate keys
            elif is_superkey:
                key_type = "⊃ Superkey"  # Superset symbol for superkeys

            results.append((combo, unique_count, unique_ratio, is_key, key_type))

    results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
    key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
                   for c, u, _, _, k in results]

    # Propose normalized tables
    normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)

    return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables


# Update the main profile function to use the optimized version
def profile(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
    """
    Analyze a pandas DataFrame to suggest candidate keys and discover functional dependencies.
    Automatically selects the best optimization level based on dataset size and characteristics.

    Parameters:
    - df: pandas.DataFrame to analyze.
    - max_combination_size: max size of column combos to test for keys.
    - max_lhs_size: max size of LHS in discovered FDs.

    Returns:
    - Tuple of (fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables)
    """
    n_rows, n_cols = len(df), len(df.columns)

    # Choose optimization level based on dataset characteristics
    if n_cols > 50:
        # High-column datasets get special treatment regardless of row count
        print("🏗️ Using HIGH-COLUMN-OPTIMIZED mode for wide dataset")
        return profile_high_column_optimized(df, max_combination_size, max_lhs_size)
    elif n_rows > 500000 or (n_rows > 100000 and n_cols > 15):
        print("🚀 Using HYPER-OPTIMIZED mode for very large dataset")
        return profile_hyper_optimized(df, max_combination_size, max_lhs_size)
    elif n_rows > 10000 or n_cols > 10:
        print("⚡ Using ULTRA-OPTIMIZED mode for large dataset")
        return profile_ultra_optimized(df, max_combination_size, max_lhs_size)
    else:
        print("🔍 Using STANDARD-OPTIMIZED mode for small dataset")
        return profile_optimized(df, max_combination_size, max_lhs_size)
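
# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, NOT part of the released wheel.
# For a small frame (<= 10,000 rows and <= 10 columns) profile() dispatches to
# profile_optimized(); the wide/very-large branches rely on
# profile_high_column_optimized / profile_hyper_optimized defined elsewhere in
# this module.
def _demo_profile():
    demo_df = pd.DataFrame({
        "order_id": range(100),
        "customer_id": [i % 10 for i in range(100)],
        "customer_name": [f"Customer_{i % 10}" for i in range(100)],
    })
    fd_results, key_results, *_, normalized_tables = profile(demo_df)
    print(fd_results[:3])   # e.g. ('customer_id', 'customer_name'), ...
    print(key_results[0])   # the strongest key row, here ('order_id', ...)
# ---------------------------------------------------------------------------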


def visualize_profile(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
    """
    Create a visual representation of the key profile for a dataframe.

    Parameters:
    - df: pandas.DataFrame to analyze.
    - max_combination_size: max size of column combos to test for keys.
    - max_lhs_size: max size of LHS in discovered FDs.

    Returns:
    - QMainWindow: The visualization window
    """
    # Get profile results
    fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables = profile(
        df, max_combination_size, max_lhs_size
    )

    # Create main window
    window = QMainWindow()
    window.setWindowTitle("Table Profile: Keys & Dependencies")
    window.resize(900, 700)

    # Create central widget and layout
    central_widget = QWidget()
    window.setCentralWidget(central_widget)
    layout = QVBoxLayout(central_widget)

    # Add header
    header = QLabel(f"Analyzed {n_rows} rows × {len(cols)} columns; key combos up to size {max_combination_size}, FDs up to LHS size {max_lhs_size}")
    header.setAlignment(Qt.AlignmentFlag.AlignCenter)
    header.setStyleSheet("font-size: 14pt; font-weight: bold; margin: 10px;")
    layout.addWidget(header)

    # Add description
    description = QLabel(
        "This profile helps identify candidate keys and functional dependencies in your data. "
        "★ Candidate keys are minimal combinations of columns that uniquely identify rows. "
        "⊃ Superkeys are non-minimal column sets that uniquely identify rows. "
        "Functional dependencies indicate when one column's values determine another's."
    )
    description.setAlignment(Qt.AlignmentFlag.AlignCenter)
    description.setWordWrap(True)
    description.setStyleSheet("margin-bottom: 10px;")
    layout.addWidget(description)

    # Add key for icons
    icons_key = QLabel("Key: ★ = Minimal Candidate Key | ⊃ = Non-minimal Superkey")
    icons_key.setAlignment(Qt.AlignmentFlag.AlignCenter)
    icons_key.setStyleSheet("font-style: italic; margin-bottom: 15px;")
    layout.addWidget(icons_key)

    # Create tabs
    tabs = QTabWidget()

    # Tab for Candidate Keys
    key_tab = QWidget()
    key_layout = QVBoxLayout()

    key_header = QLabel("Keys (Column Combinations that Uniquely Identify Rows)")
    key_header.setStyleSheet("font-weight: bold;")
    key_layout.addWidget(key_header)

    key_table = QTableWidget(len(key_results), 4)
    key_table.setHorizontalHeaderLabels(["Columns", "Unique Count", "Uniqueness Ratio", "Key Type"])
    key_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
    for row, (cols_str, count, ratio, key_type) in enumerate(key_results):
        key_table.setItem(row, 0, QTableWidgetItem(cols_str))
        key_table.setItem(row, 1, QTableWidgetItem(str(count)))
        key_table.setItem(row, 2, QTableWidgetItem(ratio))

        # Create item with appropriate styling
        type_item = QTableWidgetItem(key_type)
        if "Candidate Key" in key_type:
            type_item.setForeground(Qt.GlobalColor.darkGreen)
        elif "Superkey" in key_type:
            type_item.setForeground(Qt.GlobalColor.darkBlue)
        key_table.setItem(row, 3, type_item)

    key_layout.addWidget(key_table)
    key_tab.setLayout(key_layout)
    tabs.addTab(key_tab, "Keys")

    # Tab for FDs
    fd_tab = QWidget()
    fd_layout = QVBoxLayout()

    fd_header = QLabel("Functional Dependencies (When Values in One Set of Columns Determine Another Column)")
    fd_header.setStyleSheet("font-weight: bold;")
    fd_layout.addWidget(fd_header)

    fd_table = QTableWidget(len(fd_results), 2)
    fd_table.setHorizontalHeaderLabels(["Determinant (LHS)", "Dependent (RHS)"])
    fd_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
    for i, (lhs, rhs) in enumerate(fd_results):
        lhs_item = QTableWidgetItem(lhs)
        lhs_item.setFlags(lhs_item.flags() ^ Qt.ItemFlag.ItemIsEditable)
        fd_table.setItem(i, 0, lhs_item)
        fd_table.setItem(i, 1, QTableWidgetItem(rhs))
    fd_layout.addWidget(fd_table)
    fd_tab.setLayout(fd_layout)
    tabs.addTab(fd_tab, "Functional Dependencies")

    # Tab for Normalized Tables
    norm_tab = QWidget()
    norm_layout = QVBoxLayout()

    norm_header = QLabel("Proposed Normalized Tables (Based on Functional Dependencies)")
    norm_header.setStyleSheet("font-weight: bold;")
    norm_layout.addWidget(norm_header)

    norm_description = QLabel(
        "These tables represent a proposed normalized schema based on the discovered functional dependencies. "
        "Each table includes attributes that are functionally dependent on its primary key. "
        "This is an approximate 3NF decomposition and may need further refinement."
    )
    norm_description.setWordWrap(True)
    norm_description.setStyleSheet("margin-bottom: 10px;")
    norm_layout.addWidget(norm_description)

    norm_table = QTableWidget(len(normalized_tables), 3)
    norm_table.setHorizontalHeaderLabels(["Table Name", "Primary Key", "Attributes"])
    norm_table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
    for i, (table_name, primary_key, attributes) in enumerate(normalized_tables):
        norm_table.setItem(i, 0, QTableWidgetItem(table_name))

        pk_item = QTableWidgetItem(primary_key)
        pk_item.setForeground(Qt.GlobalColor.darkGreen)
        norm_table.setItem(i, 1, pk_item)

        norm_table.setItem(i, 2, QTableWidgetItem(", ".join(attributes)))

    norm_layout.addWidget(norm_table)
    norm_tab.setLayout(norm_layout)
    tabs.addTab(norm_tab, "Normalized Tables")

    layout.addWidget(tabs)

    # Show the window
    window.show()
    return window
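
# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, NOT part of the released wheel.
# visualize_profile() builds Qt widgets, so a QApplication must exist before
# it is called and the event loop must be started afterwards.
def _demo_visualize_profile():
    app = QApplication(sys.argv)
    demo_df = pd.DataFrame({
        "order_id": range(50),
        "customer_id": [i % 5 for i in range(50)],
        "customer_name": [f"Customer_{i % 5}" for i in range(50)],
    })
    window = visualize_profile(demo_df)  # keep a reference so the window is not garbage-collected
    sys.exit(app.exec())
# ---------------------------------------------------------------------------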


def benchmark_performance():
    """
    Benchmark the performance improvements of the optimized version.
    """
    print("=== PROFILE KEYS PERFORMANCE BENCHMARK ===\n")

    # Create realistic test datasets of varying sizes
    test_sizes = [100, 500, 1000, 2000]
    results = []

    for size in test_sizes:
        print(f"Testing with {size} rows...")

        # Create realistic test data
        df = create_realistic_test_data(size)

        # Benchmark original version
        start_time = time.time()
        try:
            original_results = profile_original(df, max_combination_size=3, max_lhs_size=2)
            original_time = time.time() - start_time
            original_success = True
        except Exception as e:
            original_time = float('inf')
            original_success = False
            print(f" Original version failed: {e}")

        # Benchmark optimized version
        start_time = time.time()
        try:
            optimized_results = profile_optimized(df, max_combination_size=3, max_lhs_size=2)
            optimized_time = time.time() - start_time
            optimized_success = True
        except Exception as e:
            optimized_time = float('inf')
            optimized_success = False
            print(f" Optimized version failed: {e}")

        # Verify results are consistent (if both succeeded)
        consistent = True
        if original_success and optimized_success:
            # Compare functional dependencies
            orig_fds = set(original_results[0])
            opt_fds = set(optimized_results[0])

            # Compare key findings (just the key type counts)
            orig_key_types = [result[3] for result in original_results[1]]
            opt_key_types = [result[3] for result in optimized_results[1]]

            if orig_fds != opt_fds or orig_key_types != opt_key_types:
                consistent = False
                print(f" WARNING: Results differ between versions!")

        # Calculate speedup
        if original_time > 0 and optimized_time > 0:
            speedup = original_time / optimized_time
        else:
            speedup = float('inf') if optimized_time > 0 else 0

        results.append({
            'size': size,
            'original_time': original_time,
            'optimized_time': optimized_time,
            'speedup': speedup,
            'consistent': consistent,
            'original_success': original_success,
            'optimized_success': optimized_success
        })

        print(f" Original: {original_time:.3f}s")
        print(f" Optimized: {optimized_time:.3f}s")
        if speedup != float('inf'):
            print(f" Speedup: {speedup:.2f}x")
        print(f" Results consistent: {consistent}")
        print()

    # Print summary
    print("=== BENCHMARK SUMMARY ===")
    print(f"{'Size':<6} {'Original':<10} {'Optimized':<10} {'Speedup':<8} {'Consistent'}")
    print("-" * 50)

    for result in results:
        size = result['size']
        orig_time = f"{result['original_time']:.3f}s" if result['original_success'] else "FAILED"
        opt_time = f"{result['optimized_time']:.3f}s" if result['optimized_success'] else "FAILED"
        speedup = f"{result['speedup']:.2f}x" if result['speedup'] != float('inf') else "∞"
        consistent = "✓" if result['consistent'] else "✗"

        print(f"{size:<6} {orig_time:<10} {opt_time:<10} {speedup:<8} {consistent}")

    # Calculate average speedup for successful runs
    successful_speedups = [r['speedup'] for r in results if r['speedup'] != float('inf') and r['speedup'] > 0]
    if successful_speedups:
        avg_speedup = sum(successful_speedups) / len(successful_speedups)
        print(f"\nAverage speedup: {avg_speedup:.2f}x")

    return results


def create_realistic_test_data(size):
    """
    Create realistic test data for benchmarking with known functional dependencies.
    """
    random.seed(42)  # For reproducibility
    np.random.seed(42)

    # Create realistic customer-order-product scenario
    n_customers = min(size // 10, 100)  # 10% unique customers, max 100
    n_products = min(size // 20, 50)  # 5% unique products, max 50
    n_orders = min(size // 5, 200)  # 20% unique orders, max 200

    customer_ids = list(range(1, n_customers + 1))
    customer_names = [f"Customer_{i}" for i in customer_ids]
    customer_cities = [f"City_{i % 10}" for i in customer_ids]  # 10 cities

    product_ids = list(range(1001, 1001 + n_products))
    product_names = [f"Product_{i}" for i in product_ids]
    product_categories = [f"Category_{i % 5}" for i in range(n_products)]  # 5 categories

    order_ids = list(range(10001, 10001 + n_orders))

    # Generate order line items
    data = []
    for i in range(size):
        customer_id = random.choice(customer_ids)
        customer_idx = customer_id - 1
        order_id = random.choice(order_ids)
        product_id = random.choice(product_ids)
        product_idx = product_id - 1001

        data.append({
            'order_line_id': 100001 + i,  # Unique for each row
            'customer_id': customer_id,
            'customer_name': customer_names[customer_idx],  # FD: customer_id -> customer_name
            'customer_city': customer_cities[customer_idx],  # FD: customer_id -> customer_city
            'order_id': order_id,
            'product_id': product_id,
            'product_name': product_names[product_idx],  # FD: product_id -> product_name
            'product_category': product_categories[product_idx],  # FD: product_id -> product_category
            'quantity': random.randint(1, 10),
            'unit_price': random.randint(10, 100),
            'total_price': 0  # Will be calculated
        })

        # Calculate total price (FD: quantity, unit_price -> total_price)
        data[-1]['total_price'] = data[-1]['quantity'] * data[-1]['unit_price']

    df = pd.DataFrame(data)

    # Add some duplicate rows to make it more realistic
    if size > 100:
        n_duplicates = size // 20  # 5% duplicates
        duplicate_indices = np.random.choice(len(df), n_duplicates, replace=True)
        duplicate_rows = df.iloc[duplicate_indices].copy()
        duplicate_rows['order_line_id'] = range(200001, 200001 + len(duplicate_rows))
        df = pd.concat([df, duplicate_rows], ignore_index=True)

    return df
|
|
1288
|
+
|
|
1289
|
+
|
|
1290
|
+
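# --- Illustrative sketch (not part of the original module) ---
# One way to spot-check a planted dependency in the generated data: an FD such as
# customer_id -> customer_name holds when every customer_id group carries exactly one
# distinct customer_name. The helper name below is hypothetical and purely illustrative.
def _example_verify_planted_fd():
    sample = create_realistic_test_data(200)
    names_per_customer = sample.groupby('customer_id')['customer_name'].nunique()
    print("customer_id -> customer_name holds:", bool((names_per_customer <= 1).all()))
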
def test_realistic_scenario():
    """
    Test the optimized version with a realistic scenario and verify expected results.
    """
    print("=== REALISTIC SCENARIO TEST ===\n")

    # Create test data with known structure
    df = create_realistic_test_data(500)

    print(f"Created test dataset with {len(df)} rows and {len(df.columns)} columns")
    print("Expected functional dependencies:")
    print(" - customer_id -> customer_name")
    print(" - customer_id -> customer_city")
    print(" - product_id -> product_name")
    print(" - product_id -> product_category")
    print(" - (quantity, unit_price) -> total_price")
    print()

    # Run analysis
    start_time = time.time()
    fd_results, key_results, n_rows, cols, max_combo, max_lhs, norm_tables = profile_optimized(
        df, max_combination_size=3, max_lhs_size=2
    )
    analysis_time = time.time() - start_time

    print(f"Analysis completed in {analysis_time:.3f} seconds")
    print()

    # Display results
    print("Discovered Functional Dependencies:")
    if fd_results:
        for lhs, rhs in fd_results:
            print(f" {lhs} -> {rhs}")
    else:
        print(" None found")
    print()

    print("Candidate Keys Found:")
    candidate_keys = [result for result in key_results if "Candidate Key" in result[3]]
    if candidate_keys:
        for cols_str, count, ratio, key_type in candidate_keys:
            print(f" {cols_str} ({ratio} unique)")
    else:
        print(" None found")
    print()

    print("Proposed Normalized Tables:")
    for i, (table_name, pk, attrs) in enumerate(norm_tables, 1):
        print(f" {table_name}: PK({pk}) -> {', '.join(attrs)}")

    # Verify expected results
    print("\n=== VERIFICATION ===")
    expected_fds = [
        "customer_id -> customer_name",
        "customer_id -> customer_city",
        "product_id -> product_name",
        "product_id -> product_category"
    ]

    found_fds = [f"{lhs} -> {rhs}" for lhs, rhs in fd_results]

    print("Expected FDs found:")
    for expected in expected_fds:
        found = expected in found_fds
        status = "✓" if found else "✗"
        print(f" {status} {expected}")

    # Check for unexpected FDs
    unexpected_fds = [fd for fd in found_fds if fd not in expected_fds]
    if unexpected_fds:
        print("\nUnexpected FDs found:")
        for fd in unexpected_fds:
            print(f" {fd}")

    print(f"\nCandidate key found: {'✓' if candidate_keys else '✗'}")

def test_profile_keys(test_size=100):
    # Generate a dataframe with some realistic examples of a customer-product-order relationship
    # Create customer data
    customer_ids = list(range(1, 21))  # 20 customers
    customer_names = ["John", "Jane", "Alice", "Bob", "Charlie", "Diana", "Edward", "Fiona", "George", "Hannah"]

    # Create product data
    product_names = ["Apple", "Banana", "Orange", "Grape", "Mango", "Strawberry", "Blueberry", "Kiwi", "Pineapple", "Watermelon"]
    product_groups = ["Fruit"] * len(product_names)

    # Generate random orders
    random.seed(42)  # For reproducibility
    df_data = {
        "customer_id": [random.choice(customer_ids) for _ in range(test_size)],
        "customer_name": [customer_names[i % len(customer_names)] for i in range(test_size)],
        "product_name": [random.choice(product_names) for _ in range(test_size)],
        "product_group": ["Fruit" for _ in range(test_size)],
        "order_date": [pd.Timestamp("2021-01-01") + pd.Timedelta(days=random.randint(0, 30)) for _ in range(test_size)],
        "order_amount": [random.randint(100, 1000) for _ in range(test_size)]
    }

    # Ensure consistent relationships
    for i in range(test_size):
        # Ensure customer_name is consistently associated with customer_id
        customer_idx = df_data["customer_id"][i] % len(customer_names)
        df_data["customer_name"][i] = customer_names[customer_idx]

    df = pd.DataFrame(df_data)

    # Create and show visualization
    app = QApplication(sys.argv)
    window = visualize_profile(df, max_combination_size=3, max_lhs_size=2)
    sys.exit(app.exec())

def demo_performance_improvements():
    """
    Simple demonstration of the performance improvements.
    """
    print("=== PROFILE KEYS PERFORMANCE DEMO ===\n")

    # Create a moderately complex dataset
    df = create_realistic_test_data(1000)
    print(f"Testing with dataset: {len(df)} rows × {len(df.columns)} columns")

    # Test original version
    print("\n🐌 Running ORIGINAL version...")
    start_time = time.time()
    original_results = profile_original(df, max_combination_size=3, max_lhs_size=2)
    original_time = time.time() - start_time

    # Test optimized version
    print("⚡ Running OPTIMIZED version...")
    start_time = time.time()
    optimized_results = profile_optimized(df, max_combination_size=3, max_lhs_size=2)
    optimized_time = time.time() - start_time

    # Show results
    speedup = original_time / optimized_time
    print(f"\n📊 RESULTS:")
    print(f" Original time: {original_time:.3f} seconds")
    print(f" Optimized time: {optimized_time:.3f} seconds")
    print(f" Speedup: {speedup:.2f}x faster!")

    # Show discovered insights
    orig_fds, orig_keys = original_results[0], original_results[1]
    opt_fds, opt_keys = optimized_results[0], optimized_results[1]

    print(f"\n🔍 FUNCTIONAL DEPENDENCIES FOUND:")
    print(f" Original: {len(orig_fds)} dependencies")
    print(f" Optimized: {len(opt_fds)} dependencies")

    candidate_keys_orig = [k for k in orig_keys if "Candidate Key" in k[3]]
    candidate_keys_opt = [k for k in opt_keys if "Candidate Key" in k[3]]

    print(f"\n🔑 CANDIDATE KEYS FOUND:")
    print(f" Original: {len(candidate_keys_orig)} keys")
    print(f" Optimized: {len(candidate_keys_opt)} keys")

    if candidate_keys_opt:
        print("\n Key(s) discovered:")
        for cols, count, ratio, key_type in candidate_keys_opt:
            print(f" • {cols} ({ratio} unique)")

    print(f"\n🎯 Key improvements:")
    print(f" • Eliminated redundant computations")
    print(f" • Added smart early termination")
    print(f" • Optimized pandas operations")
    print(f" • Better caching strategies")
    print(f" • Filtered trivial dependencies")

def test_big_data_scenario():
    """
    Test with a realistic big data scenario.
    """
    print("=== BIG DATA SCENARIO TEST ===\n")

    # Create a 1M row dataset similar to real-world scenarios
    df = create_stress_test_data(1000000, complexity='complex')

    print(f"Created big data test with {len(df):,} rows and {len(df.columns)} columns")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

    # Test the ultra-optimized version
    print("\n⚡ Running ultra-optimized analysis...")
    start_time = time.time()

    try:
        fd_results, key_results, n_rows, cols, max_combo, max_lhs, norm_tables = profile_ultra_optimized(
            df, max_combination_size=3, max_lhs_size=2
        )
        analysis_time = time.time() - start_time

        print(f"\n✅ SUCCESS! Analysis completed in {analysis_time:.2f} seconds")
        print(f" • Processed {n_rows:,} rows")
        print(f" • Found {len(fd_results)} functional dependencies")
        print(f" • Found {len([k for k in key_results if 'Candidate Key' in k[3]])} candidate keys")
        print(f" • Proposed {len(norm_tables)} normalized tables")

        if fd_results:
            print(f"\n🔍 Sample functional dependencies:")
            for i, (lhs, rhs) in enumerate(fd_results[:5]):
                print(f" • {lhs} → {rhs}")
            if len(fd_results) > 5:
                print(f" ... and {len(fd_results) - 5} more")

        candidate_keys = [k for k in key_results if "Candidate Key" in k[3]]
        if candidate_keys:
            print(f"\n🔑 Candidate keys found:")
            for cols_str, count, ratio, key_type in candidate_keys:
                print(f" • {cols_str} ({ratio} unique)")

        # Performance assessment
        rows_per_second = n_rows / analysis_time
        print(f"\n📈 Performance metrics:")
        print(f" • Processing rate: {rows_per_second:,.0f} rows/second")
        print(f" • Memory efficiency: {df.memory_usage(deep=True).sum() / 1024 / 1024 / analysis_time:.1f} MB/second")

        if analysis_time < 30:
            print(" ✅ Excellent performance for big data!")
        elif analysis_time < 120:
            print(" ✅ Good performance for big data")
        else:
            print(" ⚠️ Acceptable but could be improved")

    except Exception as e:
        print(f"❌ FAILED: {str(e)}")
        import traceback
        traceback.print_exc()

def find_functional_dependencies_hyper_optimized(df: pd.DataFrame, max_lhs_size: int = 2):
    """
    Hyper-optimized functional dependency discovery for very large datasets.
    Uses more aggressive sampling and limits but tries to maintain accuracy.
    """
    n_rows = len(df)
    cols = list(df.columns)

    if n_rows == 0 or len(cols) < 2:
        return []

    # For very large datasets, use more aggressive sampling
    if n_rows > 200000:
        sample_size = min(25000, max(10000, n_rows // 40))  # More conservative sampling
        df, was_sampled = sample_dataframe_intelligently(df, sample_size)
        n_rows = len(df)
        print(f" Aggressively sampled {n_rows} rows from original dataset for FD analysis")

    fds = []

    # Pre-compute cardinalities
    col_cardinalities = {col: df[col].nunique() for col in cols}

    # Use similar but more aggressive filtering than ultra-optimized
    non_unique_cols = [col for col in cols if 1 < col_cardinalities[col] < n_rows * 0.9]

    if not non_unique_cols:
        return fds

    # Much more aggressive limits
    max_lhs_combinations = min(50, len(non_unique_cols))
    max_total_tests = min(300, len(non_unique_cols) * len(cols))

    # Cache for group operations
    group_cache = {}
    tests_performed = 0

    for size in range(1, min(max_lhs_size + 1, 3)):  # Cap at size 2 for hyper mode
        if size > len(non_unique_cols) or tests_performed >= max_total_tests:
            break

        # Be more selective about combinations
        if size == 1:
            lhs_candidates = [(col,) for col in non_unique_cols[:max_lhs_combinations]]
        else:
            # For multi-column, be very selective but still thorough
            all_combos = list(itertools.combinations(non_unique_cols[:15], size))[:30]
            lhs_candidates = sorted(all_combos,
                                    key=lambda x: sum(col_cardinalities[col] for col in x))[:30]

        for lhs in lhs_candidates:
            if tests_performed >= max_total_tests:
                break

            lhs_tuple = tuple(lhs)

            try:
                if lhs_tuple not in group_cache:
                    grouped = df.groupby(list(lhs), sort=False, dropna=False)
                    group_sizes = grouped.size()
                    group_cache[lhs_tuple] = (grouped, group_sizes)
                else:
                    grouped, group_sizes = group_cache[lhs_tuple]

                n_groups = len(group_sizes)
                if n_groups == n_rows or group_sizes.max() == 1:
                    continue

                # Test RHS candidates with some prioritization
                for rhs in cols:
                    if rhs in lhs or tests_performed >= max_total_tests:
                        continue

                    # Quick heuristic check
                    if col_cardinalities[rhs] > n_groups * 1.2:
                        continue

                    try:
                        rhs_per_group = grouped[rhs].nunique()
                        if (rhs_per_group <= 1).all():
                            fds.append((lhs, rhs))
                        tests_performed += 1
                    except Exception:
                        continue

            except Exception:
                continue

    return fds

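# --- Illustrative sketch (not part of the original module) ---
# A hedged example of invoking the hyper-optimized FD search on a tiny frame; the toy
# DataFrame below is hypothetical and only meant to show the (lhs_tuple, rhs_column)
# shape of the returned dependencies.
def _example_hyper_fd_usage():
    toy = pd.DataFrame({
        'dept_id': [1, 1, 2, 2, 3],
        'dept_name': ['HR', 'HR', 'IT', 'IT', 'Ops'],
        'employee': ['a', 'b', 'c', 'd', 'e'],
    })
    for lhs, rhs in find_functional_dependencies_hyper_optimized(toy, max_lhs_size=1):
        print(f"{', '.join(lhs)} -> {rhs}")  # e.g. dept_id -> dept_name
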
def find_candidate_keys_hyper_optimized(df: pd.DataFrame, max_combination_size: int = 2):
    """
    Hyper-optimized candidate key discovery for very large datasets.
    """
    n_rows = len(df)
    cols = list(df.columns)

    if n_rows == 0:
        return [], [], []

    # Aggressive sampling for very large datasets
    if n_rows > 200000:
        sample_size = min(25000, max(5000, n_rows // 40))
        df, was_sampled = sample_dataframe_intelligently(df, sample_size)
        n_rows = len(df)
        print(f" Aggressively sampled {n_rows} rows from original dataset for key analysis")

    all_keys = []

    # Quick single-column check with early termination
    single_column_keys = []
    col_cardinalities = {}

    for col in cols:
        cardinality = df[col].nunique()
        col_cardinalities[col] = cardinality
        if cardinality == n_rows:
            single_column_keys.append((col,))
            all_keys.append((col,))

    # For very large datasets, if we have single-column keys, stop there
    if single_column_keys and n_rows > 100000:
        return all_keys, single_column_keys, []

    # Very conservative limits for multi-column keys
    max_combination_size = min(max_combination_size, 2)
    max_combinations_to_test = min(50, math.comb(len(cols), 2))

    # Only test most promising combinations
    for size in range(2, max_combination_size + 1):
        if size > len(cols):
            break

        # Select only most promising combinations based on cardinality
        all_combinations = list(itertools.combinations(cols, size))

        # Sort by likelihood of being keys (lower total cardinality)
        promising_combinations = sorted(all_combinations,
                                        key=lambda x: sum(col_cardinalities.get(col, n_rows) for col in x))

        # Test only top candidates
        combinations_to_test = promising_combinations[:max_combinations_to_test]

        for combo in combinations_to_test:
            # Skip if contains single-column key
            if any((col,) in single_column_keys for col in combo):
                continue

            # Quick heuristic: if sum of cardinalities is much less than n_rows, unlikely to be key
            total_card = sum(col_cardinalities.get(col, n_rows) for col in combo)
            if total_card < n_rows * 0.8:
                continue

            try:
                unique_count = len(df[list(combo)].drop_duplicates())
                if unique_count == n_rows:
                    all_keys.append(combo)
            except Exception:
                continue

            # Early termination if we found enough keys
            if len(all_keys) > 5:
                break

    # Classify keys
    candidate_keys = []
    superkeys = []

    for key in all_keys:
        is_candidate = True
        for other_key in all_keys:
            if len(other_key) < len(key) and set(other_key).issubset(set(key)):
                is_candidate = False
                break

        if is_candidate:
            candidate_keys.append(key)
        else:
            superkeys.append(key)

    return all_keys, candidate_keys, superkeys

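# --- Illustrative sketch (not part of the original module) ---
# The uniqueness test at the heart of the key search: a column combination uniquely
# identifies rows when dropping duplicate value tuples leaves as many rows as the frame
# itself. The helper name is hypothetical.
def _example_is_unique_combination(frame: pd.DataFrame, combo: tuple) -> bool:
    return len(frame[list(combo)].drop_duplicates()) == len(frame)
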
def profile_hyper_optimized(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
    """
    Hyper-optimized profile function for very large datasets (500k+ rows).
    Sacrifices some completeness for dramatic speed improvements.
    """
    start_time = time.time()
    n_rows = len(df)
    cols = list(df.columns)

    print(f"Starting HYPER-OPTIMIZED analysis of {n_rows:,} rows × {len(cols)} columns...")

    # Very aggressive parameter limits
    max_combination_size = min(max_combination_size, 2)
    max_lhs_size = min(max_lhs_size, 2)
    print(f" Hyper mode: limiting to max combination size {max_combination_size}")

    # Discover functional dependencies
    fd_start = time.time()
    fds = find_functional_dependencies_hyper_optimized(df, max_lhs_size)
    fd_time = time.time() - fd_start
    print(f" FD discovery completed in {fd_time:.2f}s - found {len(fds)} dependencies")

    fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]

    # Discover keys
    key_start = time.time()
    all_keys, candidate_keys, superkeys = find_candidate_keys_hyper_optimized(df, max_combination_size)
    key_time = time.time() - key_start
    print(f" Key discovery completed in {key_time:.2f}s - found {len(candidate_keys)} candidate keys")

    # Minimal result preparation
    results = []
    single_col_uniqueness = {col: df[col].nunique() for col in cols}

    # Only process essential combinations
    max_combinations_total = min(100, len(cols) * 2)
    combinations_tested = 0

    for size in range(1, max_combination_size + 1):
        for combo in itertools.combinations(cols, size):
            if combinations_tested >= max_combinations_total:
                break

            if len(combo) == 1:
                unique_count = single_col_uniqueness[combo[0]]
            elif combo in all_keys:
                unique_count = n_rows
            else:
                # Estimate for larger combinations
                unique_count = min(n_rows, sum(single_col_uniqueness[col] for col in combo) // len(combo))

            unique_ratio = unique_count / n_rows if n_rows > 0 else 0
            is_key = combo in all_keys
            is_candidate = combo in candidate_keys
            is_superkey = combo in superkeys

            key_type = ""
            if is_candidate:
                key_type = "★ Candidate Key"
            elif is_superkey:
                key_type = "⊃ Superkey"

            results.append((combo, unique_count, unique_ratio, is_key, key_type))
            combinations_tested += 1

    # Quick sort
    results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
    key_results = [(", ".join(c), u, f"{u/n_rows:.2%}", k)
                   for c, u, _, _, k in results]

    # Simplified normalized tables
    normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)

    total_time = time.time() - start_time
    print(f" HYPER-OPTIMIZED analysis completed in {total_time:.2f}s")

    return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables

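# --- Illustrative sketch (not part of the original module) ---
# The hyper-optimized profiler returns the same 7-tuple shape as the other profile_* variants,
# so callers can unpack it uniformly; `big_df` stands in for a hypothetical large DataFrame.
def _example_profile_hyper_usage(big_df: pd.DataFrame):
    (fd_results, key_results, n_rows, cols,
     max_combo, max_lhs, norm_tables) = profile_hyper_optimized(big_df)
    print(f"{len(fd_results)} FDs and {len(key_results)} ranked combinations over {n_rows:,} rows")
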
def test_hyper_optimized_scenario():
    """
    Test the hyper-optimized version with extremely large datasets.
    """
    print("=== HYPER-OPTIMIZED SCENARIO TEST ===\n")

    # Test different large dataset scenarios
    test_scenarios = [
        (500000, 'complex', "500K rows complex"),
        (1000000, 'complex', "1M rows complex"),
        (2000000, 'medium', "2M rows medium"),
        (5000000, 'simple', "5M rows simple")
    ]

    results = []

    for size, complexity, description in test_scenarios:
        print(f"\n{'='*60}")
        print(f"TESTING: {description}")
        print('='*60)

        try:
            # Create test data
            df = create_stress_test_data(size, complexity=complexity)
            memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024

            print(f"Memory usage: {memory_mb:.1f} MB")

            # Test hyper-optimized version
            start_time = time.time()
            fd_results, key_results, n_rows, cols, max_combo, max_lhs, norm_tables = profile_hyper_optimized(
                df, max_combination_size=3, max_lhs_size=2
            )
            analysis_time = time.time() - start_time

            candidate_keys = [k for k in key_results if "Candidate Key" in k[3]]
            rows_per_second = n_rows / analysis_time

            print(f"\n✅ SUCCESS!")
            print(f" • Analysis time: {analysis_time:.2f} seconds")
            print(f" • Processing rate: {rows_per_second:,.0f} rows/second")
            print(f" • Found {len(fd_results)} functional dependencies")
            print(f" • Found {len(candidate_keys)} candidate keys")
            print(f" • Memory efficiency: {memory_mb / analysis_time:.1f} MB/second")

            # Performance assessment
            if analysis_time < 30:
                performance = "🔥 EXCELLENT"
            elif analysis_time < 60:
                performance = "✅ GOOD"
            elif analysis_time < 180:
                performance = "⚠️ ACCEPTABLE"
            else:
                performance = "❌ NEEDS WORK"

            print(f" • Performance: {performance}")

            results.append({
                'size': size,
                'complexity': complexity,
                'memory_mb': memory_mb,
                'time': analysis_time,
                'rows_per_sec': rows_per_second,
                'fds': len(fd_results),
                'keys': len(candidate_keys),
                'success': True,
                'performance': performance
            })

        except Exception as e:
            print(f"❌ FAILED: {str(e)}")
            results.append({
                'size': size,
                'complexity': complexity,
                'memory_mb': 0,
                'time': float('inf'),
                'rows_per_sec': 0,
                'fds': 0,
                'keys': 0,
                'success': False,
                'performance': "❌ FAILED"
            })

    # Summary
    print(f"\n{'='*80}")
    print("HYPER-OPTIMIZED PERFORMANCE SUMMARY")
    print('='*80)
    print(f"{'Dataset':<20} {'Memory':<10} {'Time':<10} {'Rate':<12} {'FDs':<5} {'Keys':<5} {'Performance'}")
    print("-" * 80)

    for result in results:
        dataset = f"{result['size']:,} {result['complexity']}"
        memory = f"{result['memory_mb']:.1f}MB"
        time_str = f"{result['time']:.1f}s" if result['time'] != float('inf') else "FAIL"
        rate = f"{result['rows_per_sec']:,.0f}/s" if result['success'] else "N/A"
        fds = str(result['fds'])
        keys = str(result['keys'])
        performance = result['performance']

        print(f"{dataset:<20} {memory:<10} {time_str:<10} {rate:<12} {fds:<5} {keys:<5} {performance}")

    # Analysis
    successful = [r for r in results if r['success']]
    if successful:
        max_size = max(r['size'] for r in successful)
        avg_rate = np.mean([r['rows_per_sec'] for r in successful])
        print(f"\n🎯 ANALYSIS:")
        print(f" • Successfully processed datasets up to {max_size:,} rows")
        print(f" • Average processing rate: {avg_rate:,.0f} rows/second")
        print(f" • Hyper-optimization enables analysis of datasets that would be impossible otherwise")

    return results

def test_small_data_optimizations():
    """
    Test optimizations specifically for small datasets to ensure no performance regression.
    """
    print("=== SMALL DATA OPTIMIZATION TEST ===\n")

    # Test different small dataset scenarios
    small_test_configs = [
        (10, 3, 'tiny'),
        (50, 4, 'small'),
        (100, 5, 'small'),
        (500, 6, 'medium'),
        (1000, 8, 'medium'),
        (5000, 10, 'medium'),
    ]

    results = []

    for size, n_cols, complexity in small_test_configs:
        print(f"\n{'='*50}")
        print(f"TESTING: {size:,} rows × {n_cols} columns ({complexity})")
        print('='*50)

        try:
            # Create test data
            df = create_stress_test_data(size, n_cols=n_cols, complexity=complexity)
            memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024

            print(f"Memory usage: {memory_mb:.3f} MB")

            # Test all three optimization levels
            optimization_results = {}

            # 1. Test original version (for very small datasets only)
            if size <= 1000:
                print("\n🐌 Testing ORIGINAL version...")
                start_time = time.time()
                orig_results = profile_original(df, max_combination_size=3, max_lhs_size=2)
                orig_time = time.time() - start_time
                optimization_results['original'] = {
                    'time': orig_time,
                    'fds': len(orig_results[0]),
                    'keys': len([k for k in orig_results[1] if "Candidate Key" in k[3]])
                }
                print(f" Original: {orig_time:.4f}s - {optimization_results['original']['fds']} FDs, {optimization_results['original']['keys']} keys")

            # 2. Test standard optimized version
            print("\n🔍 Testing STANDARD-OPTIMIZED version...")
            start_time = time.time()
            std_results = profile_optimized(df, max_combination_size=3, max_lhs_size=2)
            std_time = time.time() - start_time
            optimization_results['standard'] = {
                'time': std_time,
                'fds': len(std_results[0]),
                'keys': len([k for k in std_results[1] if "Candidate Key" in k[3]])
            }
            print(f" Standard: {std_time:.4f}s - {optimization_results['standard']['fds']} FDs, {optimization_results['standard']['keys']} keys")

            # 3. Test ultra optimized version
            print("\n⚡ Testing ULTRA-OPTIMIZED version...")
            start_time = time.time()
            ultra_results = profile_ultra_optimized(df, max_combination_size=3, max_lhs_size=2)
            ultra_time = time.time() - start_time
            optimization_results['ultra'] = {
                'time': ultra_time,
                'fds': len(ultra_results[0]),
                'keys': len([k for k in ultra_results[1] if "Candidate Key" in k[3]])
            }
            print(f" Ultra: {ultra_time:.4f}s - {optimization_results['ultra']['fds']} FDs, {optimization_results['ultra']['keys']} keys")

            # 4. Test automatic selection (should pick standard for small data)
            print("\n🎯 Testing AUTOMATIC selection...")
            start_time = time.time()
            auto_results = profile(df, max_combination_size=3, max_lhs_size=2)
            auto_time = time.time() - start_time
            optimization_results['auto'] = {
                'time': auto_time,
                'fds': len(auto_results[0]),
                'keys': len([k for k in auto_results[1] if "Candidate Key" in k[3]])
            }
            print(f" Auto: {auto_time:.4f}s - {optimization_results['auto']['fds']} FDs, {optimization_results['auto']['keys']} keys")

            # Analyze results
            print(f"\n📊 ANALYSIS:")

            # Check consistency
            fd_counts = [opt['fds'] for opt in optimization_results.values()]
            key_counts = [opt['keys'] for opt in optimization_results.values()]

            consistent_fds = len(set(fd_counts)) <= 1
            consistent_keys = len(set(key_counts)) <= 1

            print(f" • FD consistency: {'✅' if consistent_fds else '❌'} ({fd_counts})")
            print(f" • Key consistency: {'✅' if consistent_keys else '❌'} ({key_counts})")

            # Compare performance
            if 'original' in optimization_results:
                std_speedup = optimization_results['original']['time'] / optimization_results['standard']['time']
                ultra_speedup = optimization_results['original']['time'] / optimization_results['ultra']['time']
                print(f" • Standard speedup vs original: {std_speedup:.2f}x")
                print(f" • Ultra speedup vs original: {ultra_speedup:.2f}x")

            # Check if auto selection made good choice
            fastest_time = min(opt['time'] for opt in optimization_results.values())
            auto_efficiency = fastest_time / optimization_results['auto']['time']
            print(f" • Auto selection efficiency: {auto_efficiency:.2f} (1.0 = optimal)")

            # Overall assessment
            if consistent_fds and consistent_keys and auto_efficiency > 0.8:
                assessment = "✅ EXCELLENT"
            elif consistent_fds and consistent_keys:
                assessment = "✅ GOOD"
            elif auto_efficiency > 0.8:
                assessment = "⚠️ INCONSISTENT RESULTS"
            else:
                assessment = "❌ POOR PERFORMANCE"

            print(f" • Overall: {assessment}")

            results.append({
                'size': size,
                'cols': n_cols,
                'complexity': complexity,
                'memory_mb': memory_mb,
                'optimization_results': optimization_results,
                'consistent_fds': consistent_fds,
                'consistent_keys': consistent_keys,
                'auto_efficiency': auto_efficiency,
                'assessment': assessment,
                'success': True
            })

        except Exception as e:
            print(f"❌ FAILED: {str(e)}")
            results.append({
                'size': size,
                'cols': n_cols,
                'complexity': complexity,
                'memory_mb': 0,
                'optimization_results': {},
                'consistent_fds': False,
                'consistent_keys': False,
                'auto_efficiency': 0,
                'assessment': "❌ FAILED",
                'success': False
            })

    # Comprehensive summary
    print(f"\n{'='*80}")
    print("SMALL DATA OPTIMIZATION SUMMARY")
    print('='*80)
    print(f"{'Dataset':<15} {'Memory':<8} {'Original':<10} {'Standard':<10} {'Ultra':<10} {'Auto':<10} {'Consistent':<10} {'Assessment'}")
    print("-" * 80)

    for result in results:
        if not result['success']:
            continue

        dataset = f"{result['size']}×{result['cols']}"
        memory = f"{result['memory_mb']:.2f}MB"

        opt_res = result['optimization_results']
        orig_time = f"{opt_res.get('original', {}).get('time', 0):.3f}s" if 'original' in opt_res else "N/A"
        std_time = f"{opt_res['standard']['time']:.3f}s"
        ultra_time = f"{opt_res['ultra']['time']:.3f}s"
        auto_time = f"{opt_res['auto']['time']:.3f}s"

        consistent = "✅" if result['consistent_fds'] and result['consistent_keys'] else "❌"
        assessment = result['assessment'].split()[0]  # Just the emoji/symbol

        print(f"{dataset:<15} {memory:<8} {orig_time:<10} {std_time:<10} {ultra_time:<10} {auto_time:<10} {consistent:<10} {assessment}")

    # Performance analysis
    successful = [r for r in results if r['success']]
    if successful:
        print(f"\n🎯 PERFORMANCE ANALYSIS:")

        # Consistency check
        all_consistent = all(r['consistent_fds'] and r['consistent_keys'] for r in successful)
        print(f" • Result consistency across optimizations: {'✅' if all_consistent else '❌'}")

        # Auto selection efficiency
        avg_auto_efficiency = np.mean([r['auto_efficiency'] for r in successful])
        print(f" • Average auto-selection efficiency: {avg_auto_efficiency:.3f}")

        # Speed comparison for overlapping tests
        overlap_tests = [r for r in successful if 'original' in r['optimization_results']]
        if overlap_tests:
            avg_std_speedup = np.mean([
                r['optimization_results']['original']['time'] / r['optimization_results']['standard']['time']
                for r in overlap_tests
            ])
            avg_ultra_speedup = np.mean([
                r['optimization_results']['original']['time'] / r['optimization_results']['ultra']['time']
                for r in overlap_tests
            ])
            print(f" • Average standard optimization speedup: {avg_std_speedup:.2f}x")
            print(f" • Average ultra optimization speedup: {avg_ultra_speedup:.2f}x")

        # Recommendations
        print(f"\n💡 RECOMMENDATIONS:")
        if all_consistent and avg_auto_efficiency > 0.9:
            print(" ✅ Optimizations are working excellently for small data")
        elif all_consistent:
            print(" ✅ Results are consistent, but auto-selection could be improved")
        else:
            print(" ⚠️ Some optimization levels produce inconsistent results")

        # Check if any optimization is consistently best for small data
        fastest_counts = {}
        for result in successful:
            if result['optimization_results']:
                fastest = min(result['optimization_results'].items(), key=lambda x: x[1]['time'])[0]
                fastest_counts[fastest] = fastest_counts.get(fastest, 0) + 1

        if fastest_counts:
            best_optimization = max(fastest_counts.items(), key=lambda x: x[1])
            print(f" 🏆 Most often fastest: {best_optimization[0]} ({best_optimization[1]}/{len(successful)} times)")

    return results

def find_functional_dependencies_high_column_optimized(df: pd.DataFrame, max_lhs_size: int = 2):
    """
    Specialized functional dependency discovery for high-column datasets (>50 columns).
    Uses intelligent column selection and aggressive limits.
    """
    try:
        n_rows = len(df)
        cols = list(df.columns)
        n_cols = len(cols)

        if n_rows == 0 or n_cols < 2:
            return []

        print(f" High-column FD analysis: {n_rows} rows × {n_cols} columns")

        # Always sample for high-column datasets to keep it manageable
        if n_rows > 2000:
            sample_size = min(2000, max(500, n_rows // 50))
            df, was_sampled = sample_dataframe_intelligently(df, sample_size)
            n_rows = len(df)
            print(f" Sampled to {n_rows} rows for high-column analysis")

        # Pre-compute column characteristics for intelligent selection
        col_info = {}
        for col in cols:
            try:
                unique_count = df[col].nunique()
                col_info[col] = {
                    'cardinality': unique_count,
                    'uniqueness_ratio': unique_count / n_rows,
                    'is_potential_key': unique_count == n_rows,
                    'is_low_cardinality': unique_count < n_rows * 0.1,
                    'is_boolean_like': unique_count <= 2
                }
            except Exception:
                # Skip problematic columns
                col_info[col] = {
                    'cardinality': 0,
                    'uniqueness_ratio': 0,
                    'is_potential_key': False,
                    'is_low_cardinality': False,
                    'is_boolean_like': False
                }

        # Select most promising columns for LHS (determinants)
        # Focus on columns that are likely to be good determinants
        lhs_candidates = []

        # Add potential keys first (high cardinality)
        potential_keys = [col for col, info in col_info.items() if info['uniqueness_ratio'] > 0.8]
        lhs_candidates.extend(potential_keys[:10])  # Top 10 potential keys

        # Add low-cardinality columns (good for grouping)
        low_card_cols = sorted([col for col, info in col_info.items() if info['is_low_cardinality']],
                               key=lambda x: col_info[x]['cardinality'])
        lhs_candidates.extend(low_card_cols[:15])  # Top 15 low-cardinality

        # Add some medium-cardinality columns
        medium_card_cols = [col for col, info in col_info.items()
                            if 0.1 <= info['uniqueness_ratio'] <= 0.8]
        medium_card_cols = sorted(medium_card_cols, key=lambda x: col_info[x]['cardinality'])
        lhs_candidates.extend(medium_card_cols[:10])  # Top 10 medium-cardinality

        # Remove duplicates while preserving order and ensure they exist in dataframe
        seen = set()
        lhs_candidates = [col for col in lhs_candidates
                          if col in df.columns and not (col in seen or seen.add(col))]

        # Limit to top 30 LHS candidates to keep it manageable
        lhs_candidates = lhs_candidates[:30]

        print(f" Selected {len(lhs_candidates)} promising LHS candidates from {n_cols} columns")

        fds = []
        group_cache = {}

        # Very aggressive limits for high-column datasets
        max_tests = 200  # Maximum total FD tests
        tests_performed = 0

        for size in range(1, min(max_lhs_size + 1, 3)):  # Cap at size 2 for high-column
            if tests_performed >= max_tests or not lhs_candidates:
                break

            if size == 1:
                # Single column determinants
                candidates = lhs_candidates[:20]  # Top 20 for single-column
            else:
                # Multi-column determinants - be very selective
                try:
                    candidates = list(itertools.combinations(lhs_candidates[:15], size))[:30]
                except Exception:
                    candidates = []

            for lhs in candidates:
                if tests_performed >= max_tests:
                    break

                lhs_tuple = tuple(lhs) if isinstance(lhs, (list, tuple)) else (lhs,)

                try:
                    # Ensure all columns in lhs_tuple exist in dataframe
                    if not all(col in df.columns for col in lhs_tuple):
                        continue

                    if lhs_tuple not in group_cache:
                        grouped = df.groupby(list(lhs_tuple), sort=False, dropna=False)
                        group_sizes = grouped.size()
                        group_cache[lhs_tuple] = (grouped, group_sizes)
                    else:
                        grouped, group_sizes = group_cache[lhs_tuple]

                    n_groups = len(group_sizes)
                    if n_groups == n_rows or group_sizes.max() == 1:
                        continue

                    # Test only most promising RHS candidates
                    rhs_candidates = []

                    # Add high-cardinality columns as RHS candidates
                    high_card_rhs = [col for col, info in col_info.items()
                                     if info['uniqueness_ratio'] > 0.5 and col not in lhs_tuple and col in df.columns]
                    rhs_candidates.extend(high_card_rhs[:10])

                    # Add some other columns
                    other_rhs = [col for col in cols if col not in lhs_tuple and col not in rhs_candidates and col in df.columns]
                    rhs_candidates.extend(other_rhs[:10])

                    for rhs in rhs_candidates:
                        if tests_performed >= max_tests:
                            break

                        # Quick heuristic check
                        if col_info.get(rhs, {}).get('cardinality', 0) > n_groups * 1.5:
                            continue

                        try:
                            rhs_per_group = grouped[rhs].nunique()
                            if (rhs_per_group <= 1).all():
                                fds.append((lhs_tuple, rhs))
                            tests_performed += 1
                        except Exception:
                            continue

                except Exception:
                    continue

        print(f" Performed {tests_performed} FD tests (limit: {max_tests})")
        return fds

    except Exception as e:
        print(f" Error in high-column FD analysis: {e}")
        return []  # Return empty list on error

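# --- Illustrative sketch (not part of the original module) ---
# The column triage above reduces to a per-column uniqueness ratio; this standalone helper
# (hypothetical, for illustration only) applies the same thresholds used above to any frame.
def _example_column_triage(frame: pd.DataFrame) -> dict:
    n = len(frame)
    triage = {}
    for col in frame.columns:
        ratio = frame[col].nunique() / n if n else 0
        triage[col] = ('potential_key' if ratio > 0.8
                       else 'low_cardinality' if ratio < 0.1
                       else 'medium_cardinality')
    return triage
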
def find_candidate_keys_high_column_optimized(df: pd.DataFrame, max_combination_size: int = 2):
    """
    Specialized candidate key discovery for high-column datasets (>50 columns).
    Uses intelligent column selection and aggressive limits.
    """
    try:
        n_rows = len(df)
        cols = list(df.columns)
        n_cols = len(cols)

        if n_rows == 0:
            return [], [], []

        print(f" High-column key analysis: {n_rows} rows × {n_cols} columns")

        # Always sample for high-column datasets
        if n_rows > 2000:
            sample_size = min(2000, max(500, n_rows // 50))
            df, was_sampled = sample_dataframe_intelligently(df, sample_size)
            n_rows = len(df)
            print(f" Sampled to {n_rows} rows for high-column key analysis")

        all_keys = []

        # Quick single-column check with cardinality-based prioritization
        col_cardinalities = {}
        potential_single_keys = []

        # Sort columns by cardinality (descending) to check most promising first
        for col in cols:
            try:
                if col in df.columns:
                    cardinality = df[col].nunique()
                    col_cardinalities[col] = cardinality
                    if cardinality == n_rows:
                        potential_single_keys.append((col,))
                        all_keys.append((col,))
                else:
                    col_cardinalities[col] = 0
            except Exception:
                col_cardinalities[col] = 0

        print(f" Found {len(potential_single_keys)} single-column keys")

        # For high-column datasets, if we have single-column keys, be very conservative about multi-column
        if potential_single_keys and n_cols > 80:
            print(f" Stopping early due to high column count ({n_cols}) and existing single-column keys")
            return all_keys, potential_single_keys, []

        # Select most promising columns for multi-column key testing
        # Sort by cardinality (highest first) and take top candidates
        try:
            sorted_cols = sorted([col for col in cols if col in df.columns],
                                 key=lambda x: col_cardinalities.get(x, 0), reverse=True)
        except Exception:
            sorted_cols = [col for col in cols if col in df.columns]

        # Take top candidates based on cardinality
        if n_cols > 80:
            promising_cols = sorted_cols[:15]  # Very selective for >80 columns
        elif n_cols > 60:
            promising_cols = sorted_cols[:20]  # Selective for >60 columns
        else:
            promising_cols = sorted_cols[:25]  # Less selective for 50-60 columns

        print(f" Selected {len(promising_cols)} promising columns for multi-column key testing")

        # Very conservative multi-column key testing
        max_combination_size = min(max_combination_size, 2)  # Cap at 2 for high-column
        max_combinations_to_test = 50  # Hard limit

        for size in range(2, max_combination_size + 1):
            if size > len(promising_cols):
                break

            # Generate combinations from promising columns only
            try:
                combinations = list(itertools.combinations(promising_cols, size))
            except Exception:
                combinations = []

            # Sort by total cardinality (higher is more likely to be a key)
            try:
                combinations = sorted(combinations,
                                      key=lambda x: sum(col_cardinalities.get(col, 0) for col in x),
                                      reverse=True)
            except Exception:
                pass  # Keep original order if sorting fails

            # Test only top combinations
            combinations_to_test = combinations[:max_combinations_to_test]

            tested_count = 0
            for combo in combinations_to_test:
                try:
                    # Skip if contains single-column key
                    if any((col,) in potential_single_keys for col in combo):
                        continue

                    # Ensure all columns in combo exist in dataframe
                    if not all(col in df.columns for col in combo):
                        continue

                    # Quick heuristic: if sum of cardinalities is much less than n_rows, skip
                    total_card = sum(col_cardinalities.get(col, 0) for col in combo)
                    if total_card < n_rows * 0.7:
                        continue

                    try:
                        unique_count = len(df[list(combo)].drop_duplicates())
                        if unique_count == n_rows:
                            all_keys.append(combo)
                    except Exception:
                        continue  # Skip problematic combinations

                    tested_count += 1

                    # Early termination for high-column datasets
                    if tested_count >= 20:  # Test at most 20 combinations per size
                        break

                except Exception:
                    continue

            print(f" Tested {tested_count} combinations of size {size}")

            # Early termination if we found keys and this is a very high-column dataset
            if all_keys and n_cols > 80:
                break

        # Classify keys
        candidate_keys = []
        superkeys = []

        for key in all_keys:
            try:
                is_candidate = True
                for other_key in all_keys:
                    if len(other_key) < len(key) and set(other_key).issubset(set(key)):
                        is_candidate = False
                        break

                if is_candidate:
                    candidate_keys.append(key)
                else:
                    superkeys.append(key)
            except Exception:
                # If classification fails, treat as candidate key
                candidate_keys.append(key)

        return all_keys, candidate_keys, superkeys

    except Exception as e:
        print(f" Error in high-column key analysis: {e}")
        return [], [], []  # Return empty lists on error

def profile_high_column_optimized(df: pd.DataFrame, max_combination_size: int = 2, max_lhs_size: int = 2):
|
|
2431
|
+
"""
|
|
2432
|
+
Specialized profile function for high-column datasets (>50 columns).
|
|
2433
|
+
Uses aggressive optimization and intelligent column selection.
|
|
2434
|
+
"""
|
|
2435
|
+
try:
|
|
2436
|
+
start_time = time.time()
|
|
2437
|
+
n_rows = len(df)
|
|
2438
|
+
cols = list(df.columns)
|
|
2439
|
+
n_cols = len(cols)
|
|
2440
|
+
|
|
2441
|
+
print(f"Starting HIGH-COLUMN analysis of {n_rows:,} rows × {n_cols} columns...")
|
|
2442
|
+
|
|
2443
|
+
# Very aggressive parameter limits for high-column datasets
|
|
2444
|
+
max_combination_size = min(max_combination_size, 2)
|
|
2445
|
+
max_lhs_size = min(max_lhs_size, 2)
|
|
2446
|
+
print(f" High-column mode: limiting to max combination size {max_combination_size}")
|
|
2447
|
+
|
|
2448
|
+
# Discover functional dependencies
|
|
2449
|
+
fd_start = time.time()
|
|
2450
|
+
try:
|
|
2451
|
+
fds = find_functional_dependencies_high_column_optimized(df, max_lhs_size)
|
|
2452
|
+
except Exception as e:
|
|
2453
|
+
print(f" Error in FD discovery: {e}")
|
|
2454
|
+
fds = []
|
|
2455
|
+
fd_time = time.time() - fd_start
|
|
2456
|
+
print(f" FD discovery completed in {fd_time:.2f}s - found {len(fds)} dependencies")
|
|
2457
|
+
|
|
2458
|
+
fd_results = [(", ".join(lhs), rhs) for lhs, rhs in fds]
|
|
2459
|
+
|
|
2460
|
+
# Discover keys
|
|
2461
|
+
key_start = time.time()
|
|
2462
|
+
try:
|
|
2463
|
+
all_keys, candidate_keys, superkeys = find_candidate_keys_high_column_optimized(df, max_combination_size)
|
|
2464
|
+
except Exception as e:
|
|
2465
|
+
print(f" Error in key discovery: {e}")
|
|
2466
|
+
all_keys, candidate_keys, superkeys = [], [], []
|
|
2467
|
+
key_time = time.time() - key_start
|
|
2468
|
+
print(f" Key discovery completed in {key_time:.2f}s - found {len(candidate_keys)} candidate keys")
|
|
2469
|
+
|
|
2470
|
+
# Minimal result preparation for high-column datasets
|
|
2471
|
+
results = []
|
|
2472
|
+
|
|
2473
|
+
# Pre-compute single column uniqueness for efficiency
|
|
2474
|
+
single_col_uniqueness = {}
|
|
2475
|
+
print(" Computing column uniqueness...")
|
|
2476
|
+
try:
|
|
2477
|
+
for col in cols:
|
|
2478
|
+
if col in df.columns:
|
|
2479
|
+
try:
|
|
2480
|
+
single_col_uniqueness[col] = df[col].nunique()
|
|
2481
|
+
except Exception:
|
|
2482
|
+
single_col_uniqueness[col] = 0
|
|
2483
|
+
else:
|
|
2484
|
+
single_col_uniqueness[col] = 0
|
|
2485
|
+
except Exception as e:
|
|
2486
|
+
print(f" Error computing column uniqueness: {e}")
|
|
2487
|
+
# Set default values
|
|
2488
|
+
single_col_uniqueness = {col: 0 for col in cols}
|
|
2489
|
+
|
|
2490
|
+
# Only process essential combinations for high-column datasets
|
|
2491
|
+
max_combinations_total = min(100, n_cols * 2) # Very conservative
|
|
2492
|
+
combinations_tested = 0
|
|
2493
|
+
|
|
2494
|
+
print(f" Preparing results (testing max {max_combinations_total} combinations)...")
|
|
2495
|
+
|
|
2496
|
+
# Process single columns first (most important)
|
|
2497
|
+
try:
|
|
2498
|
+
for col in cols:
|
|
2499
|
+
if combinations_tested >= max_combinations_total:
|
|
2500
|
+
break
|
|
2501
|
+
|
|
2502
|
+
if col not in df.columns:
|
|
2503
|
+
continue
|
|
2504
|
+
|
|
2505
|
+
combo = (col,)
|
|
2506
|
+
unique_count = single_col_uniqueness.get(col, 0)
|
|
2507
|
+
unique_ratio = unique_count / n_rows if n_rows > 0 else 0
|
|
2508
|
+
is_key = combo in all_keys
|
|
2509
|
+
is_candidate = combo in candidate_keys
|
|
2510
|
+
is_superkey = combo in superkeys
|
|
2511
|
+
|
|
2512
|
+
key_type = ""
|
|
2513
|
+
if is_candidate:
|
|
2514
|
+
key_type = "★ Candidate Key"
|
|
2515
|
+
elif is_superkey:
|
|
2516
|
+
key_type = "⊃ Superkey"
|
|
2517
|
+
|
|
2518
|
+
results.append((combo, unique_count, unique_ratio, is_key, key_type))
|
|
2519
|
+
combinations_tested += 1
|
|
2520
|
+
except Exception as e:
|
|
2521
|
+
print(f" Error processing single columns: {e}")

        # Process only the most promising multi-column combinations
        try:
            if combinations_tested < max_combinations_total and max_combination_size > 1:
                # Sort columns by uniqueness (highest first) for better multi-column candidates
                try:
                    sorted_cols = sorted([col for col in cols if col in df.columns],
                                         key=lambda x: single_col_uniqueness.get(x, 0), reverse=True)
                    top_cols = sorted_cols[:min(20, len(sorted_cols))]  # Top 20 most unique columns
                except Exception:
                    top_cols = [col for col in cols if col in df.columns][:20]

                for size in range(2, min(max_combination_size + 1, 3)):
                    if combinations_tested >= max_combinations_total:
                        break

                    try:
                        for combo in itertools.combinations(top_cols, size):
                            if combinations_tested >= max_combinations_total:
                                break

                            # Ensure all columns exist
                            if not all(col in df.columns for col in combo):
                                continue

                            if combo in all_keys:
                                unique_count = n_rows
                            else:
                                # For non-keys, estimate uniqueness
                                unique_count = min(n_rows, sum(single_col_uniqueness.get(col, 0) for col in combo) // len(combo))

                            unique_ratio = unique_count / n_rows if n_rows > 0 else 0
                            is_key = combo in all_keys
                            is_candidate = combo in candidate_keys
                            is_superkey = combo in superkeys

                            key_type = ""
                            if is_candidate:
                                key_type = "★ Candidate Key"
                            elif is_superkey:
                                key_type = "⊃ Superkey"

                            results.append((combo, unique_count, unique_ratio, is_key, key_type))
                            combinations_tested += 1
                    except Exception as e:
                        print(f"  Error processing size {size} combinations: {e}")
                        continue
        except Exception as e:
            print(f"  Error processing multi-column combinations: {e}")

        # Quick sort
        try:
            results.sort(key=lambda x: (not x[3], -x[2], len(x[0])))
            key_results = [(", ".join(c), u, f"{u/n_rows:.2%}" if n_rows > 0 else "0.00%", k)
                           for c, u, _, _, k in results]
        except Exception as e:
            print(f"  Error sorting results: {e}")
            key_results = []
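        # Editor's note (illustrative, not part of the original module): the sort key
        # (not x[3], -x[2], len(x[0])) orders key combinations first (is_key True sorts
        # before False), then by descending uniqueness ratio, then by fewer columns.
        # Each key_results row is (columns_str, unique_count, ratio_str, key_type),
        # e.g. a hypothetical ("id", 16000, "100.00%", "★ Candidate Key").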

        # Simplified normalized tables
        try:
            normalized_tables = propose_normalized_tables(cols, candidate_keys, fds)
        except Exception as e:
            print(f"  Error creating normalized tables: {e}")
            normalized_tables = []

        total_time = time.time() - start_time
        print(f"  HIGH-COLUMN analysis completed in {total_time:.2f}s")

        return fd_results, key_results, n_rows, cols, max_combination_size, max_lhs_size, normalized_tables

    except Exception as e:
        print(f"  Critical error in HIGH-COLUMN analysis: {e}")
        import traceback
        traceback.print_exc()
        # Return safe defaults
        return [], [], len(df), list(df.columns), max_combination_size, max_lhs_size, []
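

# Editor's sketch (illustrative, not part of the original module): a minimal example of
# how the profiler above might be called on a wide DataFrame and how its 7-tuple return
# value could be unpacked. The synthetic frame below is hypothetical; only
# profile_high_column_optimized and its signature are taken from this file. The helper
# is defined but never invoked automatically.
def _example_profile_wide_frame():
    """Illustrative only: profile a small synthetic wide frame and print any keys."""
    rng = np.random.default_rng(0)
    data = {'id': range(500)}  # unique by construction, so an expected candidate key
    data.update({f'col_{i}': rng.integers(0, 50, size=500) for i in range(60)})
    demo = pd.DataFrame(data)

    fds, keys, n_rows, cols, max_combo, max_lhs, tables = profile_high_column_optimized(
        demo, max_combination_size=2, max_lhs_size=2
    )
    for columns_str, unique_count, ratio_str, key_type in keys:
        if key_type:
            print(f"{columns_str}: {ratio_str} unique ({key_type})")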


def test_high_column_scenario():
    """
    Test the high-column optimization with scenarios similar to the user's 16K×100 case.
    """
    print("=== HIGH-COLUMN SCENARIO TEST ===\n")

    # Test different high-column scenarios
    test_scenarios = [
        (1000, 60, "1K×60 columns"),
        (5000, 80, "5K×80 columns"),
        (16000, 100, "16K×100 columns (user scenario)"),
        (10000, 120, "10K×120 columns"),
        (50000, 200, "50K×200 columns (extreme)")
    ]

    results = []

    for n_rows, n_cols, description in test_scenarios:
        print(f"\n{'='*60}")
        print(f"TESTING: {description}")
        print('='*60)

        try:
            # Create test data with many columns
            print(f"Creating test dataset with {n_rows:,} rows and {n_cols} columns...")

            # Create a realistic high-column dataset
            np.random.seed(42)
            random.seed(42)

            data = {}

            # Add ID column (primary key)
            data['id'] = range(1, n_rows + 1)

            # Add categorical columns of various cardinalities
            for i in range(min(20, n_cols - 1)):
                if i < 5:
                    # Low cardinality categorical
                    cardinality = min(10, n_rows // 100)
                elif i < 10:
                    # Medium cardinality categorical
                    cardinality = min(100, n_rows // 10)
                else:
                    # Higher cardinality categorical
                    cardinality = min(1000, n_rows // 5)

                data[f'cat_{i}'] = [f'cat_{i}_val_{j % cardinality}' for j in range(n_rows)]

            # Add numeric columns
            remaining_cols = n_cols - len(data)
            for i in range(remaining_cols):
                if i % 4 == 0:
                    # Integer columns
                    data[f'num_{i}'] = np.random.randint(1, 1000, n_rows)
                elif i % 4 == 1:
                    # Float columns
                    data[f'float_{i}'] = np.random.uniform(0, 100, n_rows)
                elif i % 4 == 2:
                    # Boolean-like columns
                    data[f'bool_{i}'] = np.random.choice([0, 1], n_rows)
                else:
                    # Text columns
                    data[f'text_{i}'] = [f'text_{j % 50}' for j in range(n_rows)]

            df = pd.DataFrame(data)

            # Ensure we have the right number of columns
            if len(df.columns) != n_cols:
                print(f"  Adjusting columns: created {len(df.columns)}, target {n_cols}")
                while len(df.columns) < n_cols:
                    col_name = f'extra_{len(df.columns)}'
                    df[col_name] = np.random.randint(1, 100, n_rows)
                df = df.iloc[:, :n_cols]  # Trim if too many

            memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
            print(f"Memory usage: {memory_mb:.1f} MB")

            # Test the high-column optimized version
            start_time = time.time()

            print(f"\n🏗️ Running HIGH-COLUMN-OPTIMIZED analysis...")
            fd_results, key_results, n_rows_result, cols, max_combo, max_lhs, norm_tables = profile_high_column_optimized(
                df, max_combination_size=3, max_lhs_size=2
            )

            analysis_time = time.time() - start_time

            candidate_keys = [k for k in key_results if "Candidate Key" in k[3]]
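            # Editor's note (illustrative, not part of the original module): key_results
            # rows are (columns_str, unique_count, ratio_str, key_type) tuples, so k[3]
            # filters on the key_type label produced by the profiler above.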

            print(f"\n✅ SUCCESS!")
            print(f"  • Analysis time: {analysis_time:.2f} seconds")
            print(f"  • Memory usage: {memory_mb:.1f} MB")
            print(f"  • Processing rate: {n_rows / analysis_time:,.0f} rows/second")
            print(f"  • Column processing rate: {n_cols / analysis_time:.1f} columns/second")
            print(f"  • Found {len(fd_results)} functional dependencies")
            print(f"  • Found {len(candidate_keys)} candidate keys")

            # Performance assessment
            if analysis_time < 10:
                performance = "🔥 EXCELLENT"
            elif analysis_time < 30:
                performance = "✅ GOOD"
            elif analysis_time < 120:
                performance = "⚠️ ACCEPTABLE"
            else:
                performance = "❌ TOO SLOW"

            print(f"  • Performance: {performance}")

            # Show some sample results
            if fd_results:
                print(f"\n🔍 Sample functional dependencies found:")
                for i, (lhs, rhs) in enumerate(fd_results[:3]):
                    print(f"  • {lhs} → {rhs}")
                if len(fd_results) > 3:
                    print(f"  ... and {len(fd_results) - 3} more")

            if candidate_keys:
                print(f"\n🔑 Candidate keys found:")
                for cols_str, count, ratio, key_type in candidate_keys[:3]:
                    print(f"  • {cols_str} ({ratio} unique)")
                if len(candidate_keys) > 3:
                    print(f"  ... and {len(candidate_keys) - 3} more")

            results.append({
                'scenario': description,
                'rows': n_rows,
                'cols': n_cols,
                'memory_mb': memory_mb,
                'time': analysis_time,
                'rows_per_sec': n_rows / analysis_time,
                'cols_per_sec': n_cols / analysis_time,
                'fds': len(fd_results),
                'keys': len(candidate_keys),
                'performance': performance,
                'success': True
            })

        except Exception as e:
            print(f"❌ FAILED: {str(e)}")
            import traceback
            traceback.print_exc()

            results.append({
                'scenario': description,
                'rows': n_rows,
                'cols': n_cols,
                'memory_mb': 0,
                'time': float('inf'),
                'rows_per_sec': 0,
                'cols_per_sec': 0,
                'fds': 0,
                'keys': 0,
                'performance': "❌ FAILED",
                'success': False
            })

    # Summary
    print(f"\n{'='*80}")
    print("HIGH-COLUMN OPTIMIZATION SUMMARY")
    print('='*80)
    print(f"{'Scenario':<25} {'Memory':<8} {'Time':<8} {'Rows/s':<8} {'Cols/s':<8} {'FDs':<4} {'Keys':<4} {'Performance'}")
    print("-" * 80)

    for result in results:
        scenario = result['scenario'][:24]
        memory = f"{result['memory_mb']:.1f}MB"
        time_str = f"{result['time']:.1f}s" if result['time'] != float('inf') else "FAIL"
        rows_rate = f"{result['rows_per_sec']:,.0f}" if result['success'] else "N/A"
        cols_rate = f"{result['cols_per_sec']:.1f}" if result['success'] else "N/A"
        fds = str(result['fds'])
        keys = str(result['keys'])
        performance = result['performance'].split()[0]  # Just the emoji

        print(f"{scenario:<25} {memory:<8} {time_str:<8} {rows_rate:<8} {cols_rate:<8} {fds:<4} {keys:<4} {performance}")

    # Analysis
    successful = [r for r in results if r['success']]
    if successful:
        print(f"\n🎯 PERFORMANCE ANALYSIS:")

        # Check if user scenario (16K×100) was successful
        user_scenario = next((r for r in successful if '16K×100' in r['scenario']), None)
        if user_scenario:
            print(f"  ✅ User scenario (16K×100 columns) completed in {user_scenario['time']:.1f} seconds")
            if user_scenario['time'] < 30:
                print(f"  🎉 This should be much faster on your smaller machine!")
            elif user_scenario['time'] < 120:
                print(f"  👍 This should provide reasonable performance on your smaller machine")
            else:
                print(f"  ⚠️ May still be slow on smaller machines - consider further optimization")

        avg_time = np.mean([r['time'] for r in successful])
        avg_cols_per_sec = np.mean([r['cols_per_sec'] for r in successful])

        print(f"  • Average analysis time: {avg_time:.1f} seconds")
        print(f"  • Average column processing rate: {avg_cols_per_sec:.1f} columns/second")
        print(f"  • Successfully handled datasets up to {max(r['cols'] for r in successful)} columns")

    # Specific optimizations applied
    print(f"\n💡 HIGH-COLUMN OPTIMIZATIONS APPLIED:")
    print(f"  • Intelligent column selection (top 30 LHS candidates)")
    print(f"  • Aggressive sampling (max 2000 rows for analysis)")
    print(f"  • Limited combination testing (max 200 FD tests)")
    print(f"  • Prioritized high-cardinality columns for keys")
    print(f"  • Early termination for very wide datasets (>80 columns)")

    return results


# Test functions to run when script is executed directly
if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "benchmark":
            benchmark_performance()
        elif sys.argv[1] == "comprehensive":
            comprehensive_benchmark()
        elif sys.argv[1] == "small":
            test_small_data_optimizations()
        elif sys.argv[1] == "hyper":
            test_hyper_optimized_scenario()
        elif sys.argv[1] == "bigdata":
            test_big_data_scenario()
        elif sys.argv[1] == "test":
            test_realistic_scenario()
        elif sys.argv[1] == "demo":
            demo_performance_improvements()
        elif sys.argv[1] == "highcol":
            test_high_column_scenario()
        else:
            test_profile_keys()
    else:
        test_profile_keys()
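
# Editor's note (illustrative, not part of the original module): assuming the package is
# installed so that this file is importable as sqlshell.utils.profile_keys, the dispatch
# above could be exercised with, for example:
#
#   python -m sqlshell.utils.profile_keys highcol    # run test_high_column_scenario()
#   python -m sqlshell.utils.profile_keys benchmark  # run benchmark_performance()
#   python -m sqlshell.utils.profile_keys            # run test_profile_keys()
#
# The exact invocation depends on how the package is installed; these commands are a
# hedged sketch, not documented sqlshell CLI entry points.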