superquantx 0.1.0__py3-none-any.whl
- superquantx/__init__.py +321 -0
- superquantx/algorithms/__init__.py +55 -0
- superquantx/algorithms/base_algorithm.py +413 -0
- superquantx/algorithms/hybrid_classifier.py +628 -0
- superquantx/algorithms/qaoa.py +406 -0
- superquantx/algorithms/quantum_agents.py +1006 -0
- superquantx/algorithms/quantum_kmeans.py +575 -0
- superquantx/algorithms/quantum_nn.py +544 -0
- superquantx/algorithms/quantum_pca.py +499 -0
- superquantx/algorithms/quantum_svm.py +346 -0
- superquantx/algorithms/vqe.py +553 -0
- superquantx/algorithms.py +863 -0
- superquantx/backends/__init__.py +265 -0
- superquantx/backends/base_backend.py +321 -0
- superquantx/backends/braket_backend.py +420 -0
- superquantx/backends/cirq_backend.py +466 -0
- superquantx/backends/ocean_backend.py +491 -0
- superquantx/backends/pennylane_backend.py +419 -0
- superquantx/backends/qiskit_backend.py +451 -0
- superquantx/backends/simulator_backend.py +455 -0
- superquantx/backends/tket_backend.py +519 -0
- superquantx/circuits.py +447 -0
- superquantx/cli/__init__.py +28 -0
- superquantx/cli/commands.py +528 -0
- superquantx/cli/main.py +254 -0
- superquantx/client.py +298 -0
- superquantx/config.py +326 -0
- superquantx/exceptions.py +287 -0
- superquantx/gates.py +588 -0
- superquantx/logging_config.py +347 -0
- superquantx/measurements.py +702 -0
- superquantx/ml.py +936 -0
- superquantx/noise.py +760 -0
- superquantx/utils/__init__.py +83 -0
- superquantx/utils/benchmarking.py +523 -0
- superquantx/utils/classical_utils.py +575 -0
- superquantx/utils/feature_mapping.py +467 -0
- superquantx/utils/optimization.py +410 -0
- superquantx/utils/quantum_utils.py +456 -0
- superquantx/utils/visualization.py +654 -0
- superquantx/version.py +33 -0
- superquantx-0.1.0.dist-info/METADATA +365 -0
- superquantx-0.1.0.dist-info/RECORD +46 -0
- superquantx-0.1.0.dist-info/WHEEL +4 -0
- superquantx-0.1.0.dist-info/entry_points.txt +2 -0
- superquantx-0.1.0.dist-info/licenses/LICENSE +21 -0
superquantx/utils/classical_utils.py
@@ -0,0 +1,575 @@
"""Classical machine learning utilities for quantum algorithms.

This module provides classical ML utilities that complement quantum algorithms,
including cross-validation, hyperparameter search, and model selection.
"""

import itertools
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split


@dataclass
class CrossValidationResult:
    """Results from cross-validation."""

    scores: List[float]
    mean_score: float
    std_score: float
    fold_times: List[float]
    mean_time: float
    best_params: Optional[Dict[str, Any]] = None


def cross_validation(
    algorithm: Any,
    X: np.ndarray,
    y: np.ndarray,
    cv_folds: int = 5,
    scoring: str = 'accuracy',
    stratify: bool = True,
    random_state: Optional[int] = 42,
    verbose: bool = False
) -> CrossValidationResult:
    """Perform k-fold cross-validation on a quantum algorithm.

    Args:
        algorithm: Quantum algorithm instance
        X: Feature matrix
        y: Target vector
        cv_folds: Number of CV folds
        scoring: Scoring metric ('accuracy', 'mse', 'mae')
        stratify: Whether to use stratified CV for classification
        random_state: Random seed
        verbose: Whether to print progress

    Returns:
        CrossValidationResult with scores and timing info

    """
    # Choose cross-validation strategy
    if stratify and _is_classification_task(y):
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    else:
        cv = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)

    scores = []
    fold_times = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        if verbose:
            print(f"Fold {fold_idx + 1}/{cv_folds}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        start_time = time.time()

        # Train algorithm
        algorithm.fit(X_train, y_train)

        # Make predictions
        y_pred = algorithm.predict(X_val)

        fold_time = time.time() - start_time
        fold_times.append(fold_time)

        # Calculate score
        if scoring == 'accuracy':
            score = accuracy_score(y_val, y_pred)
        elif scoring == 'mse':
            score = -mean_squared_error(y_val, y_pred)  # Negative for "higher is better"
        elif scoring == 'mae':
            score = -np.mean(np.abs(y_val - y_pred))
        else:
            raise ValueError(f"Unknown scoring metric: {scoring}")

        scores.append(score)

        if verbose:
            print(f"  Score: {score:.4f}, Time: {fold_time:.2f}s")

    return CrossValidationResult(
        scores=scores,
        mean_score=np.mean(scores),
        std_score=np.std(scores),
        fold_times=fold_times,
        mean_time=np.mean(fold_times)
    )


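# A minimal usage sketch for cross_validation. LogisticRegression is a
# classical stand-in: any object with fit/predict in the scikit-learn style
# works here, including this package's quantum algorithm instances.
#
#     from sklearn.datasets import make_classification
#     from sklearn.linear_model import LogisticRegression
#
#     X, y = make_classification(n_samples=80, n_features=4, random_state=0)
#     result = cross_validation(LogisticRegression(), X, y, cv_folds=5)
#     print(f"{result.mean_score:.3f} +/- {result.std_score:.3f}")

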
def hyperparameter_search(
    algorithm_class: type,
    param_grid: Dict[str, List[Any]],
    X: np.ndarray,
    y: np.ndarray,
    cv_folds: int = 3,
    scoring: str = 'accuracy',
    n_jobs: int = 1,
    random_state: Optional[int] = 42,
    verbose: bool = False
) -> Dict[str, Any]:
    """Perform grid search for hyperparameter optimization.

    Args:
        algorithm_class: Quantum algorithm class
        param_grid: Dictionary of parameter names and values to try
        X: Feature matrix
        y: Target vector
        cv_folds: Number of CV folds
        scoring: Scoring metric
        n_jobs: Number of parallel jobs (not implemented)
        random_state: Random seed
        verbose: Whether to print progress

    Returns:
        Dictionary with best parameters and results

    """
    # Generate all parameter combinations
    param_names = list(param_grid.keys())
    param_values = list(param_grid.values())
    param_combinations = list(itertools.product(*param_values))

    best_score = float('-inf')
    best_params = None
    all_results = []

    if verbose:
        print(f"Testing {len(param_combinations)} parameter combinations...")

    for i, combination in enumerate(param_combinations):
        # Create parameter dictionary
        params = dict(zip(param_names, combination))

        if verbose:
            print(f"Combination {i+1}/{len(param_combinations)}: {params}")

        try:
            # Create algorithm instance with these parameters
            algorithm = algorithm_class(**params)

            # Perform cross-validation
            cv_result = cross_validation(
                algorithm, X, y, cv_folds=cv_folds,
                scoring=scoring, random_state=random_state,
                verbose=False
            )

            mean_score = cv_result.mean_score

            # Track results
            result = {
                'params': params,
                'mean_score': mean_score,
                'std_score': cv_result.std_score,
                'mean_time': cv_result.mean_time
            }
            all_results.append(result)

            # Update best score
            if mean_score > best_score:
                best_score = mean_score
                best_params = params

            if verbose:
                print(f"  Score: {mean_score:.4f} ± {cv_result.std_score:.4f}")

        except Exception as e:
            if verbose:
                print(f"  Failed: {str(e)}")

            result = {
                'params': params,
                'mean_score': None,
                'std_score': None,
                'mean_time': None,
                'error': str(e)
            }
            all_results.append(result)

    return {
        'best_params': best_params,
        'best_score': best_score,
        'all_results': all_results,
        'n_combinations': len(param_combinations)
    }


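# A minimal grid-search sketch. SVC stands in for any algorithm class whose
# constructor accepts the grid's keyword arguments; the grid values are
# illustrative only.
#
#     from sklearn.datasets import make_classification
#     from sklearn.svm import SVC
#
#     X, y = make_classification(n_samples=80, n_features=4, random_state=0)
#     search = hyperparameter_search(
#         SVC, {'C': [0.1, 1.0], 'kernel': ['linear', 'rbf']}, X, y, cv_folds=3
#     )
#     print(search['best_params'], search['best_score'])

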
def model_selection(
    algorithms: List[Tuple[str, type, Dict[str, Any]]],
    X: np.ndarray,
    y: np.ndarray,
    cv_folds: int = 5,
    scoring: str = 'accuracy',
    test_size: float = 0.2,
    random_state: Optional[int] = 42,
    verbose: bool = False
) -> Dict[str, Any]:
    """Compare multiple algorithms and select the best one.

    Args:
        algorithms: List of (name, class, params) tuples
        X: Feature matrix
        y: Target vector
        cv_folds: Number of CV folds
        scoring: Scoring metric
        test_size: Proportion for test set
        random_state: Random seed
        verbose: Whether to print progress

    Returns:
        Dictionary with model selection results

    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        stratify=y if _is_classification_task(y) else None
    )

    results = {}
    best_algorithm = None
    best_score = float('-inf')

    for name, algorithm_class, params in algorithms:
        if verbose:
            print(f"Evaluating {name}...")

        try:
            # Create algorithm instance
            algorithm = algorithm_class(**params)

            # Cross-validation on training set
            cv_result = cross_validation(
                algorithm, X_train, y_train, cv_folds=cv_folds,
                scoring=scoring, random_state=random_state,
                verbose=False
            )

            # Final evaluation on test set
            algorithm.fit(X_train, y_train)
            y_pred = algorithm.predict(X_test)

            if scoring == 'accuracy':
                test_score = accuracy_score(y_test, y_pred)
            elif scoring == 'mse':
                test_score = -mean_squared_error(y_test, y_pred)
            elif scoring == 'mae':
                test_score = -np.mean(np.abs(y_test - y_pred))
            else:
                test_score = 0  # Fallback; cross_validation above already rejects unknown metrics

            results[name] = {
                'cv_mean': cv_result.mean_score,
                'cv_std': cv_result.std_score,
                'test_score': test_score,
                'mean_time': cv_result.mean_time,
                'params': params
            }

            # Track best algorithm
            if cv_result.mean_score > best_score:
                best_score = cv_result.mean_score
                best_algorithm = name

            if verbose:
                print(f"  CV: {cv_result.mean_score:.4f} ± {cv_result.std_score:.4f}")
                print(f"  Test: {test_score:.4f}")

        except Exception as e:
            if verbose:
                print(f"  Failed: {str(e)}")

            results[name] = {
                'cv_mean': None,
                'cv_std': None,
                'test_score': None,
                'mean_time': None,
                'params': params,
                'error': str(e)
            }

    return {
        'results': results,
        'best_algorithm': best_algorithm,
        'best_score': best_score
    }


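# A model-comparison sketch: each entry is a (name, class, constructor-kwargs)
# tuple. The classical estimators below are stand-ins for quantum algorithm
# classes.
#
#     from sklearn.neighbors import KNeighborsClassifier
#     from sklearn.tree import DecisionTreeClassifier
#
#     selection = model_selection(
#         [('knn', KNeighborsClassifier, {'n_neighbors': 3}),
#          ('tree', DecisionTreeClassifier, {'max_depth': 3})],
#         X, y, cv_folds=5
#     )
#     print(selection['best_algorithm'])

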
def data_splitting(
    X: np.ndarray,
    y: np.ndarray,
    train_size: float = 0.7,
    val_size: float = 0.15,
    test_size: float = 0.15,
    stratify: bool = True,
    random_state: Optional[int] = 42
) -> Tuple[np.ndarray, ...]:
    """Split data into train, validation, and test sets.

    Args:
        X: Feature matrix
        y: Target vector
        train_size: Proportion for training
        val_size: Proportion for validation
        test_size: Proportion for testing
        stratify: Whether to stratify splits for classification
        random_state: Random seed

    Returns:
        Tuple of (X_train, X_val, X_test, y_train, y_val, y_test)

    """
    if not np.isclose(train_size + val_size + test_size, 1.0):
        raise ValueError("Split sizes must sum to 1.0")

    stratify_target = y if (stratify and _is_classification_task(y)) else None

    # First split: separate test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        stratify=stratify_target
    )

    # Second split: separate train and validation
    relative_val_size = val_size / (train_size + val_size)
    stratify_temp = y_temp if (stratify and _is_classification_task(y)) else None

    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=relative_val_size,
        random_state=random_state, stratify=stratify_temp
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


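# Sketch: a three-way split returned as six arrays. The sizes below are the
# defaults and must sum to 1.0.
#
#     X_train, X_val, X_test, y_train, y_val, y_test = data_splitting(
#         X, y, train_size=0.7, val_size=0.15, test_size=0.15
#     )

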
def learning_curve_analysis(
    algorithm: Any,
    X: np.ndarray,
    y: np.ndarray,
    train_sizes: Optional[List[float]] = None,
    cv_folds: int = 5,
    scoring: str = 'accuracy',
    random_state: Optional[int] = 42,
    verbose: bool = False
) -> Dict[str, Any]:
    """Analyze learning curve by varying training set size.

    Args:
        algorithm: Quantum algorithm instance
        X: Feature matrix
        y: Target vector
        train_sizes: Fractions of training data to use
        cv_folds: Number of CV folds
        scoring: Scoring metric
        random_state: Random seed
        verbose: Whether to print progress

    Returns:
        Learning curve analysis results

    """
    if train_sizes is None:
        train_sizes = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]

    results = {
        'train_sizes': [],
        'train_scores': [],
        'val_scores': [],
        'training_times': []
    }

    for size in train_sizes:
        if verbose:
            print(f"Training size: {size:.1%}")

        # Subsample training data
        n_samples = int(len(X) * size)
        indices = np.random.RandomState(random_state).choice(
            len(X), n_samples, replace=False
        )
        X_subset = X[indices]
        y_subset = y[indices]

        # Cross-validation
        cv_result = cross_validation(
            algorithm, X_subset, y_subset, cv_folds=cv_folds,
            scoring=scoring, random_state=random_state,
            verbose=False
        )

        results['train_sizes'].append(n_samples)
        results['val_scores'].append(cv_result.mean_score)
        results['training_times'].append(cv_result.mean_time)

        # Training score (fit on full subset, score on same data)
        algorithm.fit(X_subset, y_subset)
        y_pred = algorithm.predict(X_subset)

        if scoring == 'accuracy':
            train_score = accuracy_score(y_subset, y_pred)
        elif scoring == 'mse':
            train_score = -mean_squared_error(y_subset, y_pred)
        elif scoring == 'mae':
            train_score = -np.mean(np.abs(y_subset - y_pred))
        else:
            train_score = 0

        results['train_scores'].append(train_score)

        if verbose:
            print(f"  Train: {train_score:.4f}, Val: {cv_result.mean_score:.4f}")

    return results


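# Sketch: train/validation scores at increasing training-set fractions, useful
# for spotting over- or under-fitting. The estimator is a classical stand-in
# and is refit on each subset.
#
#     from sklearn.linear_model import LogisticRegression
#
#     curve = learning_curve_analysis(
#         LogisticRegression(), X, y, train_sizes=[0.25, 0.5, 1.0]
#     )
#     for n, tr, val in zip(curve['train_sizes'],
#                           curve['train_scores'], curve['val_scores']):
#         print(n, tr, val)

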
def feature_importance_analysis(
    algorithm: Any,
    X: np.ndarray,
    y: np.ndarray,
    method: str = 'permutation',
    n_repeats: int = 10,
    scoring: str = 'accuracy',
    random_state: Optional[int] = 42
) -> np.ndarray:
    """Analyze feature importance using permutation or ablation methods.

    Args:
        algorithm: Fitted quantum algorithm
        X: Feature matrix
        y: Target vector
        method: Method for importance ('permutation', 'ablation')
        n_repeats: Number of permutation repeats
        scoring: Scoring metric
        random_state: Random seed

    Returns:
        Feature importance scores

    """
    if method == 'permutation':
        return _permutation_importance(
            algorithm, X, y, n_repeats, scoring, random_state
        )
    elif method == 'ablation':
        return _ablation_importance(algorithm, X, y, scoring)
    else:
        raise ValueError(f"Unknown importance method: {method}")


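# Sketch: permutation importance on an already-fitted model (the function does
# not refit); larger scores mean permuting that feature hurt the score more.
# LogisticRegression is again an illustrative stand-in.
#
#     from sklearn.linear_model import LogisticRegression
#
#     model = LogisticRegression().fit(X, y)
#     importances = feature_importance_analysis(
#         model, X, y, method='permutation', n_repeats=10
#     )
#     print(importances)

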
def _permutation_importance(
    algorithm: Any,
    X: np.ndarray,
    y: np.ndarray,
    n_repeats: int,
    scoring: str,
    random_state: Optional[int]
) -> np.ndarray:
    """Calculate permutation feature importance."""
    # Baseline score
    y_pred = algorithm.predict(X)
    if scoring == 'accuracy':
        baseline_score = accuracy_score(y, y_pred)
    elif scoring == 'mse':
        baseline_score = mean_squared_error(y, y_pred)
    else:
        baseline_score = 0

    n_features = X.shape[1]
    importance_scores = np.zeros(n_features)

    rng = np.random.RandomState(random_state)

    for feature_idx in range(n_features):
        feature_scores = []

        for _ in range(n_repeats):
            # Permute feature
            X_permuted = X.copy()
            X_permuted[:, feature_idx] = rng.permutation(X_permuted[:, feature_idx])

            # Score with permuted feature
            y_pred_perm = algorithm.predict(X_permuted)
            if scoring == 'accuracy':
                perm_score = accuracy_score(y, y_pred_perm)
                # For accuracy, importance = decrease in accuracy
                feature_importance = baseline_score - perm_score
            elif scoring == 'mse':
                perm_score = mean_squared_error(y, y_pred_perm)
                # For MSE, importance = increase in MSE
                feature_importance = perm_score - baseline_score
            else:
                feature_importance = 0

            feature_scores.append(feature_importance)

        importance_scores[feature_idx] = np.mean(feature_scores)

    return importance_scores


def _ablation_importance(
    algorithm: Any,
    X: np.ndarray,
    y: np.ndarray,
    scoring: str
) -> np.ndarray:
    """Calculate ablation feature importance."""
    n_features = X.shape[1]
    importance_scores = np.zeros(n_features)

    # Baseline score with all features
    y_pred = algorithm.predict(X)
    if scoring == 'accuracy':
        baseline_score = accuracy_score(y, y_pred)
    elif scoring == 'mse':
        baseline_score = mean_squared_error(y, y_pred)
    else:
        baseline_score = 0

    for feature_idx in range(n_features):
        # Remove feature (set to zero)
        X_ablated = X.copy()
        X_ablated[:, feature_idx] = 0

        # Score without feature
        y_pred_ablated = algorithm.predict(X_ablated)
        if scoring == 'accuracy':
            ablated_score = accuracy_score(y, y_pred_ablated)
            importance = baseline_score - ablated_score
        elif scoring == 'mse':
            ablated_score = mean_squared_error(y, y_pred_ablated)
            importance = ablated_score - baseline_score
        else:
            importance = 0

        importance_scores[feature_idx] = importance

    return importance_scores


def _is_classification_task(y: np.ndarray) -> bool:
    """Check if task is classification based on target values."""
    unique_vals = np.unique(y)

    # Classification if:
    # 1. Integer values
    # 2. Small number of unique values relative to sample size
    # 3. Values look like class labels (0, 1, 2, ...)

    is_integer = np.all(y == y.astype(int))
    n_unique = len(unique_vals)
    n_samples = len(y)

    if is_integer and n_unique <= min(20, n_samples * 0.1):
        return True

    return False
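
# Worked example of the heuristic above: integer targets count as
# classification only while the number of classes stays at or below
# min(20, 0.1 * n_samples), so very small datasets fall back to plain KFold:
#
#     _is_classification_task(np.array([0, 1] * 100))  # 2 <= min(20, 20.0) -> True
#     _is_classification_task(np.array([0, 1] * 5))    # 2 >  min(20, 1.0)  -> False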