moose-fs 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- LICENSE +21 -0
- README.md +190 -0
- moose_fs-0.1.0.dist-info/METADATA +232 -0
- moose_fs-0.1.0.dist-info/RECORD +40 -0
- moose_fs-0.1.0.dist-info/WHEEL +4 -0
- moose_fs-0.1.0.dist-info/entry_points.txt +2 -0
- moose_fs-0.1.0.dist-info/licenses/LICENSE +21 -0
- moosefs/__init__.py +6 -0
- moosefs/core/__init__.py +6 -0
- moosefs/core/data_processor.py +319 -0
- moosefs/core/feature.py +44 -0
- moosefs/core/novovicova.py +60 -0
- moosefs/core/pareto.py +90 -0
- moosefs/feature_selection_pipeline.py +548 -0
- moosefs/feature_selectors/__init__.py +26 -0
- moosefs/feature_selectors/base_selector.py +38 -0
- moosefs/feature_selectors/default_variance.py +21 -0
- moosefs/feature_selectors/elastic_net_selector.py +75 -0
- moosefs/feature_selectors/f_statistic_selector.py +42 -0
- moosefs/feature_selectors/lasso_selector.py +46 -0
- moosefs/feature_selectors/mrmr_selector.py +57 -0
- moosefs/feature_selectors/mutual_info_selector.py +45 -0
- moosefs/feature_selectors/random_forest_selector.py +48 -0
- moosefs/feature_selectors/svm_selector.py +50 -0
- moosefs/feature_selectors/variance_selectors.py +16 -0
- moosefs/feature_selectors/xgboost_selector.py +44 -0
- moosefs/merging_strategies/__init__.py +17 -0
- moosefs/merging_strategies/arithmetic_mean_merger.py +46 -0
- moosefs/merging_strategies/base_merger.py +64 -0
- moosefs/merging_strategies/borda_merger.py +46 -0
- moosefs/merging_strategies/consensus_merger.py +80 -0
- moosefs/merging_strategies/l2_norm_merger.py +42 -0
- moosefs/merging_strategies/union_of_intersections_merger.py +89 -0
- moosefs/metrics/__init__.py +23 -0
- moosefs/metrics/performance_metrics.py +239 -0
- moosefs/metrics/stability_metrics.py +49 -0
- moosefs/utils.py +161 -0
- scripts/config.yml +92 -0
- scripts/main.py +163 -0
- scripts/utils.py +186 -0
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
from itertools import combinations
|
|
2
|
+
import os
|
|
3
|
+
import random
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from joblib import Parallel, delayed, parallel_backend
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from sklearn.model_selection import train_test_split
|
|
10
|
+
|
|
11
|
+
# tqdm is not used; keep imports minimal
|
|
12
|
+
from .core import Feature, ParetoAnalysis
|
|
13
|
+
from .metrics.stability_metrics import compute_stability_metrics, diversity_agreement
|
|
14
|
+
from .utils import extract_params, get_class_info
|
|
15
|
+
|
|
16
|
+
# Module-level switch exercised by tests: when True, _compute_metrics also
# tracks a diversity_agreement() value and _num_metrics_total() reserves an
# extra slot for it (2 extra metrics instead of 1).
agreement_flag = False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FeatureSelectionPipeline:
    """End-to-end pipeline for ensemble feature selection.

    Orchestrates feature scoring, merging, metric evaluation, and Pareto-based
    selection across repeated runs and method subgroups.
    """

    def __init__(
        self,
        data: Optional[pd.DataFrame] = None,
        *,
        X: Optional[pd.DataFrame] = None,
        y: Optional[pd.Series] = None,
        fs_methods: list,
        merging_strategy: Any,
        num_repeats: int,
        num_features_to_select: Optional[int],
        metrics: Optional[list] = None,
        task: str = "classification",
        min_group_size: int = 2,
        fill: bool = False,
        random_state: Optional[int] = None,
        n_jobs: Optional[int] = None,
    ) -> None:
        """Initialize the pipeline.

        Args:
            data: Combined DataFrame where the last column is treated as the target.
            X: Feature DataFrame (use together with ``y`` instead of ``data``).
            y: Target Series aligned with ``X``.
            fs_methods: Feature selectors (identifiers or instances).
            merging_strategy: Merging strategy (identifier or instance).
            num_repeats: Number of repeats for the pipeline.
            num_features_to_select: Desired number of features to select.
            metrics: Metric functions (identifiers or instances). ``None``
                defaults to ``["logloss", "f1_score", "accuracy"]``.
            task: 'classification' or 'regression'.
            min_group_size: Minimum number of methods in each subgroup.
            fill: If True, enforce exact size after merging.
            random_state: Seed for reproducibility.
            n_jobs: Parallel jobs (use num_repeats when -1 or None).

        Raises:
            ValueError: If task is invalid or required parameters are missing.

        Note:
            Exactly one of ``data`` or the pair ``(X, y)`` must be provided.
        """
        # FIX: the previous signature used a mutable default argument
        # (metrics=["logloss", ...]); a None sentinel gives identical behavior
        # without sharing one list object across all calls.
        if metrics is None:
            metrics = ["logloss", "f1_score", "accuracy"]

        # parameters validation
        self._validate_task(task)
        self.X, self.y = self._validate_X_y(data=data, X=X, y=y)
        self.target_name = self.y.name
        self.data = pd.concat([self.X, self.y], axis=1)
        self.task = task
        self.num_repeats = num_repeats
        self.num_features_to_select = num_features_to_select
        # NOTE(review): when no seed is given one is drawn from the *unseeded*
        # global RNG, so such runs are not reproducible across processes.
        self.random_state = random_state if random_state is not None else random.randint(0, 1000)
        self.n_jobs = n_jobs
        self.min_group_size = min_group_size
        self.fill = fill

        # set seed for reproducibility
        self._set_seed(self.random_state)

        # Keep the raw specs so run() can rebuild fresh objects every time;
        # also instantiate now for introspection.
        self._fs_method_specs = list(fs_methods)
        self._metric_specs = list(metrics)
        self._merging_spec = merging_strategy

        # dynamically load classes or instantiate them (initial instances)
        self.fs_methods = [self._load_class(m, instantiate=True) for m in self._fs_method_specs]
        self.metrics = [self._load_class(m, instantiate=True) for m in self._metric_specs]
        self.merging_strategy = self._load_class(self._merging_spec, instantiate=True)

        # validate and preparation
        if self.num_features_to_select is None:
            raise ValueError("num_features_to_select must be provided")
        # subgroup names are generated in run() after instantiation
        self.subgroup_names: list = []

    @staticmethod
    def _validate_task(task: str) -> None:
        """Validate the task string.

        Args:
            task: Expected 'classification' or 'regression'.

        Raises:
            ValueError: If ``task`` is neither accepted value.
        """
        if task not in ["classification", "regression"]:
            raise ValueError("Task must be either 'classification' or 'regression'.")

    @staticmethod
    def _set_seed(seed: int, idx: Optional[int] = None) -> None:
        """Seed numpy/python RNGs (and PYTHONHASHSEED) for reproducibility.

        Args:
            seed: Seed value.
            idx: Unused; kept for call-site/interface compatibility.
        """
        np.random.seed(seed)
        random.seed(seed)
        os.environ["PYTHONHASHSEED"] = str(seed)

    @staticmethod
    def _validate_X_y(*, data=None, X=None, y=None):
        """Normalize user inputs into a feature DataFrame and target Series.

        Accepts either ``data`` (last column = target) or both ``X`` and ``y``,
        but not both forms at once.

        Returns:
            Tuple ``(X_df, y_ser)`` of defensive copies; an unnamed target
            Series is renamed to ``"target"``.

        Raises:
            ValueError: On ambiguous/missing inputs, row-count mismatch, or a
                target name clashing with a feature column.
            TypeError: If the inputs are not pandas objects.
        """
        if data is not None:
            if X is not None or y is not None:
                raise ValueError("Provide either `data` or (`X`, `y`), not both.")
            if not isinstance(data, pd.DataFrame):
                raise TypeError("`data` must be a pandas DataFrame.")
            if data.shape[1] < 1:
                raise ValueError("`data` must contain at least one column.")
            X_df = data.iloc[:, :-1]
            y_ser = data.iloc[:, -1]
        else:
            if X is None or y is None:
                raise ValueError("Provide either `data` or both `X` and `y`.")
            if not isinstance(X, pd.DataFrame):
                raise TypeError("`X` must be a pandas DataFrame.")
            if not isinstance(y, pd.Series):
                raise TypeError("`y` must be a pandas Series.")
            if len(X) != len(y):
                raise ValueError("`X` and `y` must have the same number of rows.")
            X_df = X
            y_ser = y

        target_name = y_ser.name if y_ser.name is not None else "target"
        if target_name in X_df.columns:
            raise ValueError(f"Target column name '{target_name}' conflicts with feature columns.")
        return X_df.copy(), y_ser.rename(target_name).copy()

    def _per_repeat_seed(self, idx: int) -> int:
        """Derive a deterministic per-repeat seed from the top-level seed."""
        return int(self.random_state) + int(idx)

    def _effective_n_jobs(self) -> int:
        """Return the parallel job count, capped by the number of repeats.

        ``None`` or ``-1`` both mean "one job per repeat".
        """
        n = self.n_jobs if self.n_jobs is not None and self.n_jobs != -1 else self.num_repeats
        return min(int(n), int(self.num_repeats))

    def _generate_subgroup_names(self, min_group_size: int) -> list:
        """Generate all selector-name combinations with minimum size.

        Args:
            min_group_size: Minimum subgroup size.

        Returns:
            List of tuples of selector names, smallest groups first.

        Raises:
            ValueError: If fewer selectors are available than ``min_group_size``.
        """
        fs_method_names = [fs_method.name for fs_method in self.fs_methods]
        if min_group_size > len(fs_method_names):
            raise ValueError(
                f"Minimum group size of {min_group_size} exceeds available methods ({len(fs_method_names)})."
            )
        return [
            combo for r in range(min_group_size, len(fs_method_names) + 1) for combo in combinations(fs_method_names, r)
        ]

    # Public method to run the feature selection pipeline
    def run(self, verbose: bool = True) -> tuple:
        """Execute the pipeline and return the best merged features.

        Repeats are executed in parallel; per-repeat results are then merged in
        repeat order, averaged per subgroup, and ranked with two Pareto passes
        (first over subgroups, then over repeats within the winning subgroup).

        Returns:
            (merged_features, best_repeat_idx, best_group_names).
        """
        self._set_seed(self.random_state)

        # Fresh objects for each run to avoid hidden state
        self.fs_methods = [self._load_class(m, instantiate=True) for m in self._fs_method_specs]
        self.metrics = [self._load_class(m, instantiate=True) for m in self._metric_specs]
        self.merging_strategy = self._load_class(self._merging_spec, instantiate=True)

        # Regenerate subgroup names from fresh fs_methods
        self.subgroup_names = self._generate_subgroup_names(self.min_group_size)

        # Reset internal state so that run() always starts fresh
        self.fs_subsets: dict = {}
        self.merged_features: dict = {}

        num_metrics = self._num_metrics_total()
        result_dicts: list = [{} for _ in range(num_metrics)]

        # Ensure we don't allocate more jobs than repeats
        n_jobs = self._effective_n_jobs()

        with parallel_backend("loky", inner_max_num_threads=1):  # Prevents oversubscription
            parallel_results = Parallel(n_jobs=n_jobs)(
                delayed(self._pipeline_run_for_repeat)(i, verbose) for i in range(self.num_repeats)
            )

        # Sort results by repeat index so merging is deterministic
        parallel_results.sort(key=lambda x: x[0])

        # Merge results in a fixed order
        self.fs_subsets = {}
        self.merged_features = {}

        for (
            _,
            partial_fs_subsets,
            partial_merged_features,
            partial_result_dicts,
        ) in parallel_results:
            self.fs_subsets.update(partial_fs_subsets)
            self.merged_features.update(partial_merged_features)
            for dict_idx in range(num_metrics):
                result_dicts[dict_idx].update(partial_result_dicts[dict_idx])

        # Pareto pass 1: pick the best subgroup by mean metrics across repeats
        means_list = self._calculate_means(result_dicts, self.subgroup_names)
        means_list = self._replace_none(means_list)
        best_group = self._compute_pareto(means_list, self.subgroup_names)
        # Pareto pass 2: within the best subgroup, pick the best repeat
        best_group_metrics = self._extract_repeat_metrics(best_group, *result_dicts)
        best_group_metrics = self._replace_none(best_group_metrics)
        best_repeat = self._compute_pareto(best_group_metrics, [str(i) for i in range(self.num_repeats)])

        return (self.merged_features[(int(best_repeat), best_group)], int(best_repeat), best_group)

    def _pipeline_run_for_repeat(self, i: int, verbose: bool) -> Any:
        """Execute one repeat (split, select, merge, evaluate).

        Returns:
            ``(repeat_idx, fs_subsets, merged_features, result_dicts)`` so the
            caller can re-order and merge results deterministically.
        """
        repeat_seed = self._per_repeat_seed(i)
        self._set_seed(repeat_seed)

        train_data, test_data = self._split_data(test_size=0.20, random_state=repeat_seed)

        fs_subsets_local = self._compute_subset(train_data, i)
        merged_features_local = self._compute_merging(fs_subsets_local, i, verbose)
        local_result_dicts = self._compute_metrics(fs_subsets_local, merged_features_local, train_data, test_data, i)

        # Return repeat index as the first element
        return i, fs_subsets_local, merged_features_local, local_result_dicts

    def _replace_none(self, metrics: list) -> list:
        """Replace any group containing None with a list of -inf.

        Groups with missing metrics are thereby Pareto-dominated instead of
        crashing the comparison.

        Args:
            metrics: Per-group metric lists.

        Returns:
            Same shape with None-containing rows replaced by -inf rows.
        """
        return [
            (
                group_metrics
                if all(metric is not None for metric in group_metrics)
                else [-float("inf")] * len(group_metrics)
            )
            for group_metrics in metrics
        ]

    def _split_data(self, test_size: float, random_state: int) -> tuple:
        """Split data into train/test, stratifying on y for classification.

        Returns:
            ``(train_df, test_df)`` with the target re-attached as last column.
        """
        stratify = self.y if self.task == "classification" else None
        X_train, X_test, y_train, y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify,
        )
        train_df = pd.concat([X_train, y_train], axis=1)
        test_df = pd.concat([X_test, y_test], axis=1)
        return train_df, test_df

    def _compute_subset(self, train_data: pd.DataFrame, idx: int) -> dict:
        """Compute selected Feature objects per method for this repeat.

        Returns:
            Dict keyed by ``(repeat_idx, method_name)`` mapping to one Feature
            per column (with its score and selected flag).
        """
        self._set_seed(self._per_repeat_seed(idx))
        X_train = train_data.drop(columns=[self.target_name])
        y_train = train_data[self.target_name]
        feature_names = X_train.columns.tolist()

        fs_subsets_local = {}
        for fs_method in self.fs_methods:
            method_name = fs_method.name
            scores, indices = fs_method.select_features(X_train, y_train)
            # FIX: membership tests against `indices` (an array/list) were
            # O(k) per feature, i.e. O(n*k) per method; a set makes each
            # lookup O(1) with identical results.
            selected_positions = {int(j) for j in indices}
            fs_subsets_local[(idx, method_name)] = [
                Feature(
                    name,
                    score=scores[i] if scores is not None else None,
                    selected=(i in selected_positions),
                )
                for i, name in enumerate(feature_names)
            ]

        return fs_subsets_local

    def _compute_merging(
        self,
        fs_subsets_local: dict,
        idx: int,
        verbose: bool = True,
    ) -> dict:
        """Merge per-group features and return the mapping for this repeat.

        Groups that merge to an empty result are skipped (optionally warned).
        """
        self._set_seed(self._per_repeat_seed(idx))
        merged_features_local = {}
        for group in self.subgroup_names:
            merged = self._merge_group_features(fs_subsets_local, idx, group)
            if merged:
                merged_features_local[(idx, group)] = merged
            elif verbose:
                print(f"Warning: {group} produced no merged features.")
        return merged_features_local

    def _merge_group_features(
        self,
        fs_subsets_local: dict,
        idx: int,
        group: tuple,
    ) -> list:
        """Merge features for a specific group of methods.

        Args:
            fs_subsets_local: Selected Feature lists per (repeat, method).
            idx: Repeat index.
            group: Tuple of selector names.

        Returns:
            Merged features (type depends on the merging strategy).
        """
        group_features = [[f for f in fs_subsets_local[(idx, method)] if f.selected] for method in group]

        # Determine set-based vs rank-based via method call when available;
        # `is_set_based` may be a method, a plain bool, or absent entirely.
        is_set_based_attr = getattr(self.merging_strategy, "is_set_based", None)
        if callable(is_set_based_attr):
            is_set_based = bool(is_set_based_attr())
        elif isinstance(is_set_based_attr, bool):
            is_set_based = is_set_based_attr
        else:
            is_set_based = True  # default behavior as before
        if is_set_based:
            return self.merging_strategy.merge(group_features, self.num_features_to_select, fill=self.fill)
        else:
            return self.merging_strategy.merge(group_features, self.num_features_to_select)

    def _compute_performance_metrics(
        self,
        X_train: pd.DataFrame,
        y_train: pd.Series,
        X_test: pd.DataFrame,
        y_test: pd.Series,
    ) -> list:
        """Compute performance metrics using the configured metric objects.

        Metrics exposing ``aggregate_from_results`` share one model fit per
        ``model_signature()`` cache key; plain metrics fall back to
        ``compute``.

        Returns:
            Metric values in the same order as ``self.metrics``.
        """
        self._set_seed(self.random_state)
        if not self.metrics:
            return []

        shared_results: dict = {}
        metric_values: list = []

        for metric in self.metrics:
            aggregator = getattr(metric, "aggregate_from_results", None)
            if not callable(aggregator):
                # Legacy path: metric trains and scores on its own.
                metric_values.append(metric.compute(X_train, y_train, X_test, y_test))
                continue

            signature_fn = getattr(metric, "model_signature", None)
            cache_key = signature_fn() if callable(signature_fn) else None
            results = shared_results.get(cache_key) if cache_key is not None else None

            if results is None:
                results = metric.train_and_predict(X_train, y_train, X_test, y_test)
                if cache_key is not None:
                    shared_results[cache_key] = results

            metric_values.append(aggregator(y_test, results))

        return metric_values

    def _compute_metrics(
        self,
        fs_subsets_local: dict,
        merged_features_local: dict,
        train_data: pd.DataFrame,
        test_data: pd.DataFrame,
        idx: int,
    ) -> list:
        """Compute performance and stability metrics for every subgroup.

        Performance metrics occupy the first ``len(self.metrics)`` slots;
        the trailing slot(s) hold stability and, when ``agreement_flag`` is
        set, the agreement score before it.

        Args:
            fs_subsets_local: Selected Feature lists per (repeat, method).
            merged_features_local: Merged features per (repeat, group).
            train_data: Training dataframe.
            test_data: Test dataframe.
            idx: Repeat index.

        Returns:
            List of per-metric dicts keyed by (repeat, group).
        """
        self._set_seed(self._per_repeat_seed(idx))
        num_metrics = self._num_metrics_total()
        local_result_dicts = [{} for _ in range(num_metrics)]
        feature_train = train_data.drop(columns=self.target_name)
        feature_test = test_data.drop(columns=self.target_name)
        y_train_full = train_data[self.target_name]
        y_test_full = test_data[self.target_name]
        column_positions = {name: position for position, name in enumerate(feature_train.columns)}

        for group in self.subgroup_names:
            key = (idx, group)
            if key not in merged_features_local:
                continue

            merged_feature_names = merged_features_local[key]
            # Keep only known columns, in the original column order, so the
            # model sees a deterministic feature layout.
            ordered_features = [feature for feature in merged_feature_names if feature in column_positions]
            ordered_features.sort(key=column_positions.__getitem__)
            if not ordered_features:
                continue

            X_train_subset = feature_train[ordered_features]
            X_test_subset = feature_test[ordered_features]

            metric_vals = self._compute_performance_metrics(
                X_train_subset,
                y_train_full,
                X_test_subset,
                y_test_full,
            )
            for m_idx, val in enumerate(metric_vals):
                local_result_dicts[m_idx][key] = val

            fs_lists = [[f.name for f in fs_subsets_local[(idx, method)] if f.selected] for method in group]
            stability = compute_stability_metrics(fs_lists) if fs_lists else 0

            if agreement_flag:
                agreement = diversity_agreement(fs_lists, ordered_features, alpha=0.5) if fs_lists else 0
                local_result_dicts[len(metric_vals)][key] = agreement
                local_result_dicts[len(metric_vals) + 1][key] = stability
            else:
                local_result_dicts[len(metric_vals)][key] = stability

        return local_result_dicts

    @staticmethod
    def _calculate_means(
        result_dicts: list,
        group_names: list,
    ) -> list:
        """Calculate mean metrics per subgroup across repeats.

        Args:
            result_dicts: Per-metric dicts keyed by (repeat, group).
            group_names: Subgroup names to summarize.

        Returns:
            List of [means per metric] for each subgroup; a metric with no
            values (or a NaN mean) yields None.
        """
        means_list = []
        for group in group_names:
            group_means = []
            for d in result_dicts:
                vals = [value for (idx, name), value in d.items() if name == group]
                m = np.mean(vals) if len(vals) else np.nan
                group_means.append(None if np.isnan(m) else float(m))
            means_list.append(group_means)
        return means_list

    @staticmethod
    def _compute_pareto(groups: list, names: list) -> Any:
        """Return the name of the Pareto winner among ``groups``."""
        pareto = ParetoAnalysis(groups, names)
        pareto_results = pareto.get_results()
        # First row of the results is the top-ranked entry; column 0 is its name.
        return pareto_results[0][0]

    def _extract_repeat_metrics(
        self,
        group: Any,
        *result_dicts: dict,
    ) -> list:
        """Return one metric row per repeat for the given group.

        Missing values remain as None and are later replaced by -inf.
        """
        result_array: list = []
        for idx in range(self.num_repeats):  # <- full range
            row = [d.get((idx, group)) for d in result_dicts]
            result_array.append(row)
        return result_array

    def _load_class(self, input: Any, instantiate: bool = False) -> Any:
        """Resolve identifiers to classes/instances and optionally instantiate.

        Args:
            input: Identifier string or instance of a selector/merger/metric.
                (The name shadows the builtin but is kept for interface
                stability.)
            instantiate: If True, instantiate using parameters extracted from
                this pipeline.

        Returns:
            Class or instance.

        Raises:
            ValueError: If ``input`` is neither a string nor a recognized
                instance.
        """
        if isinstance(input, str):
            cls, params = get_class_info(input)
            if instantiate:
                init_params = extract_params(cls, self, params)
                return cls(**init_params)
            return cls
        elif hasattr(input, "select_features") or hasattr(input, "merge"):
            # Assumes valid instance if it has a 'select_features' or 'merge' method.
            if instantiate:
                # Best-effort: re-instantiate using the class and pipeline params
                cls = input.__class__
                init_params = extract_params(cls, self, [])
                try:
                    return cls(**init_params)
                except Exception:
                    # Fallback to returning the same instance if re-instantiation fails
                    return input
            return input
        else:
            raise ValueError(
                "Input must be a string identifier or a valid instance of a feature selector or merging strategy."
            )

    def _num_metrics_total(self) -> int:
        """Return total number of metrics tracked per group.

        Performance metrics plus stability, plus agreement when the
        module-level ``agreement_flag`` is set.
        """
        return len(self.metrics) + (2 if agreement_flag else 1)

    def __str__(self) -> str:
        return (
            f"Feature selection pipeline with: merging strategy: {self.merging_strategy}, "
            f"feature selection methods: {self.fs_methods}, "
            f"number of repeats: {self.num_repeats}"
        )

    def __call__(self) -> Any:
        return self.run()
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from .base_selector import FeatureSelector
|
|
2
|
+
from .default_variance import variance_selector_default
|
|
3
|
+
from .elastic_net_selector import ElasticNetSelector
|
|
4
|
+
from .f_statistic_selector import FStatisticSelector
|
|
5
|
+
from .lasso_selector import LassoSelector
|
|
6
|
+
|
|
7
|
+
# from .mrmr_selector import MRMRSelector
|
|
8
|
+
from .mutual_info_selector import MutualInfoSelector
|
|
9
|
+
from .random_forest_selector import RandomForestSelector
|
|
10
|
+
from .svm_selector import SVMSelector
|
|
11
|
+
from .variance_selectors import VarianceSelector
|
|
12
|
+
from .xgboost_selector import XGBoostSelector
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"RandomForestSelector",
|
|
16
|
+
"FStatisticSelector",
|
|
17
|
+
"MutualInfoSelector",
|
|
18
|
+
"SVMSelector",
|
|
19
|
+
"XGBoostSelector",
|
|
20
|
+
"FeatureSelector",
|
|
21
|
+
# "MRMRSelector",
|
|
22
|
+
"LassoSelector",
|
|
23
|
+
"ElasticNetSelector",
|
|
24
|
+
"VarianceSelector",
|
|
25
|
+
"variance_selector_default",
|
|
26
|
+
]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FeatureSelector:
    """Abstract base for score-based feature selectors.

    A concrete selector only has to provide ``compute_scores``; ranking and
    top-k selection are handled here.
    """

    def __init__(self, task: str, num_features_to_select: int) -> None:
        """Store the selector configuration.

        Args:
            task: Either "classification" or "regression".
            num_features_to_select: Number of top-scoring features to keep.
        """
        self.task = task
        self.num_features_to_select = num_features_to_select

    def select_features(self, X: Any, y: Any) -> tuple:
        """Score every feature and pick the top-k by descending score.

        Args:
            X: Training samples, shape (n_samples, n_features).
            y: Targets, shape (n_samples,) or (n_samples, n_outputs).

        Returns:
            Tuple ``(scores, indices)`` — the raw per-feature scores and the
            positions of the ``num_features_to_select`` highest ones.
        """
        per_feature_scores = self.compute_scores(X, y)
        ranked_desc = np.argsort(per_feature_scores)[::-1]
        top_k = ranked_desc[: self.num_features_to_select]
        return per_feature_scores, top_k

    def compute_scores(self, X: Any, y: Any) -> np.ndarray:
        """Return one score per feature; must be overridden by subclasses."""
        raise NotImplementedError("Subclasses must implement compute_scores")
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def variance_selector_default(X, y=None, alpha=0.01):
    """Keep features whose population variance reaches ``alpha`` times the
    median feature variance.

    Args:
        X: Feature matrix as a pandas DataFrame or a 2-D numpy array.
        y: Ignored; accepted only for selector-interface compatibility.
        alpha: Fraction of the median variance used as the keep threshold.

    Returns:
        Tuple ``(scores, indices)`` where ``scores`` is a list of per-feature
        variances as plain floats and ``indices`` lists the kept column
        positions as plain ints.
    """
    # ensure DataFrame for variance call
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])

    # 1-D array of float64 (population variance, ddof=0)
    variances = X.var(ddof=0).to_numpy(dtype=float)

    # FIX: with zero columns np.median warns and returns NaN; the comparison
    # then silently selected nothing. Return the empty result explicitly.
    if variances.size == 0:
        return [], []

    # convert to plain Python floats to avoid dtype-object surprises
    scores = [float(v) for v in variances]

    threshold = alpha * float(np.median(variances))

    # plain Python ints for indices
    indices = [int(i) for i in np.where(variances >= threshold)[0]]

    return scores, indices
|