sdufseval 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sdufseval/fseval.py
CHANGED
@@ -4,9 +4,8 @@ import time
 import warnings
 import numpy as np
 import pandas as pd
-from
-from
-
+from eval import unsupervised_eval, supervised_eval
+from loader import load_dataset
 
 class FSEVAL:
     def __init__(self,
@@ -17,22 +16,19 @@ class FSEVAL:
                  unsupervised_iter=10,
                  eval_type="both",
                  metrics=None,
-                 experiments=None
-
-        Feature Selection Evaluation Suite.
-        """
+                 experiments=None,
+                 save_all=False):
         self.output_dir = output_dir
         self.cv = cv
         self.avg_steps = avg_steps
         self.supervised_iter = supervised_iter
         self.unsupervised_iter = unsupervised_iter
         self.eval_type = eval_type
+        self.save_all = save_all
 
-        # Metric configuration
         all_metrics = ["CLSACC", "NMI", "ACC", "AUC"]
         self.selected_metrics = metrics if metrics else all_metrics
 
-        # Experiment/Scale configuration
         self.scales = {}
         target_exps = experiments if experiments else ["10Percent", "100Percent"]
         if "10Percent" in target_exps:
@@ -44,24 +40,30 @@ class FSEVAL:
             os.makedirs(self.output_dir)
 
     def random_baseline(self, X, **kwargs):
-        """
-        Randomly assigns importance scores to features.
-        Internal method for lower-bound baseline.
-        """
        return np.random.rand(X.shape[1])
 
+    def _should_skip(self, ds_name, methods):
+        for m_info in methods:
+            for scale_name in self.scales.keys():
+                last_met = self.selected_metrics[-1]
+                fname = os.path.join(self.output_dir, f"{m_info['name']}_{last_met}_{scale_name}.csv")
+
+                if not os.path.exists(fname):
+                    return False
+
+                df = pd.read_csv(fname)
+                if 'Dataset' not in df.columns or ds_name not in df['Dataset'].values:
+                    return False
+        return True
+
     def run(self, datasets, methods, classifier=None):
-        """
-        Executes the benchmark for given datasets and FS methods.
-
-        Args:
-            datasets: List of dataset names.
-            methods: List of dicts {'name': str, 'func': callable, 'stochastic': bool}.
-            classifier: Optional sklearn classifier instance to pass to supervised_eval.
-        """
         warnings.filterwarnings("ignore")
+
         for ds_name in datasets:
-
+            if self._should_skip(ds_name, methods):
+                print(f">>> Skipping {ds_name}")
+                continue
+
             X, y_raw = load_dataset(ds_name)
             if X is None: continue
 
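
The new `_should_skip` helper makes `run()` resumable: a dataset is skipped only when every method already has a row for it in the result CSV of the last selected metric for each scale. Below is a minimal usage sketch, not part of the package: it assumes `FSEVAL` is importable from the installed package, that "madelon" is a name `load_dataset` can resolve, and it uses a hypothetical variance-based scorer; the method-dict keys follow the diff above.

import numpy as np
from sdufseval import FSEVAL  # assumption: the package __init__ re-exports FSEVAL

def variance_score(X):
    # Hypothetical selector: score each feature by its variance.
    return np.var(X, axis=0)

bench = FSEVAL(output_dir="results", eval_type="supervised", metrics=["ACC", "AUC"])
methods = [{"name": "Variance", "func": variance_score, "stochastic": False}]

# The first call evaluates and writes <method>_<metric>_<scale>.csv files under results/;
# an identical second call finds those rows via _should_skip and prints ">>> Skipping madelon".
bench.run(datasets=["madelon"], methods=methods, classifier=None)
bench.run(datasets=["madelon"], methods=methods, classifier=None)
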
@@ -71,89 +73,74 @@ class FSEVAL:
             for m_info in methods:
                 name = m_info['name']
                 fs_func = m_info['func']
-                # Stochastic methods run 10 times and average
                 repeats = self.avg_steps if m_info.get('stochastic', False) else 1
 
-                # Internal storage for current dataset results
                 ds_results = {s: {met: [] for met in self.selected_metrics} for s in self.scales}
 
                 for r in range(repeats):
-                    print(f" [{name}]
-
-                    # Get feature ranking
+                    print(f" [{name}] {ds_name} - Run {r+1}/{repeats}")
                     scores = fs_func(X)
                     indices = np.argsort(scores)[::-1]
 
                     for scale_name, percentages in self.scales.items():
                         row = {met: {'Dataset': ds_name} for met in self.selected_metrics}
-
                         for p in percentages:
                             k = max(1, min(math.ceil(p * n_features), n_features))
                             X_subset = X[:, indices[:k]]
 
-
-                            c_acc, nmi, acc, auc = np.nan, np.nan, np.nan, np.nan
-
+                            res = {"CLSACC": np.nan, "NMI": np.nan, "ACC": np.nan, "AUC": np.nan}
                             if self.eval_type in ["unsupervised", "both"]:
-
-
+                                res["CLSACC"], res["NMI"] = unsupervised_eval(X_subset, y, avg_steps=self.unsupervised_iter)
                             if self.eval_type in ["supervised", "both"]:
-
-                                acc, auc = supervised_eval(X_subset, y, classifier=classifier, cv=self.cv, avg_steps=self.supervised_iter)
+                                res["ACC"], res["AUC"] = supervised_eval(X_subset, y, classifier=classifier, cv=self.cv, avg_steps=self.supervised_iter)
 
-                            # Map metrics to columns
-                            mapping = {"CLSACC": c_acc, "NMI": nmi, "ACC": acc, "AUC": auc}
                             for met in self.selected_metrics:
-                                row[met][p] =
+                                row[met][p] = res[met]
 
                         for met in self.selected_metrics:
                             ds_results[scale_name][met].append(row[met])
 
-                # Save/Update results for this method/dataset
                 self._save_results(name, ds_results)
 
-
+    def _save_results(self, method_name, ds_results):
+        for scale, metrics in ds_results.items():
+            for met_name, rows in metrics.items():
+                df_new = pd.DataFrame(rows)
+
+                if not self.save_all:
+                    df_new = df_new.groupby('Dataset').mean().reset_index()
+
+                df_new.columns = df_new.columns.astype(str)
+                fname = os.path.join(self.output_dir, f"{method_name}_{met_name}_{scale}.csv")
+
+                if os.path.exists(fname):
+                    df_old = pd.read_csv(fname)
+                    df_old.columns = df_old.columns.astype(str)
+
+                    if self.save_all:
+                        df_final = pd.concat([df_old, df_new], ignore_index=True)
+                    else:
+                        df_final = pd.concat([df_old, df_new]).drop_duplicates(subset=['Dataset'], keep='last')
+                else:
+                    df_final = df_new
+
+                df_final.to_csv(fname, index=False)
+
     def timer(self, methods, vary_param='both', time_limit=3600):
-        """
-        Runs a standalone runtime analysis experiment with a time cap.
-
-        Args:
-            methods: List of dicts {'name': str, 'func': callable}.
-            vary_param: 'features', 'instances', or 'both'.
-            time_limit: Max seconds per method before it is skipped.
-        """
-
-        # Determine which experiments to run
         experiments = []
         if vary_param in ['features', 'both']:
-            experiments.append({
-                'name': 'features',
-                'fixed_val': 100,
-                'range': range(1000, 20001, 500),
-                'file': 'time_analysis_features.csv'
-            })
+            experiments.append({'name': 'features', 'fixed_val': 100, 'range': range(1000, 20001, 500), 'file': 'time_analysis_features.csv'})
         if vary_param in ['instances', 'both']:
-            experiments.append({
-                'name': 'instances',
-                'fixed_val': 100,
-                'range': range(1000, 20001, 500),
-                'file': 'time_analysis_instances.csv'
-            })
+            experiments.append({'name': 'instances', 'fixed_val': 100, 'range': range(1000, 20001, 500), 'file': 'time_analysis_instances.csv'})
 
         for exp in experiments:
             vary_type = exp['name']
             val_range = exp['range']
             filename = os.path.join(self.output_dir, exp['file'])
-
-            # Tracking for this specific experiment
             timed_out_methods = set()
             results = {m['name']: [] for m in methods}
 
-            print(f"\n--- Starting Experiment: Varying {vary_type} ---")
-            print(f"Time limit: {time_limit}s | Output: {filename}")
-
             for val in val_range:
-                # 1. Generate synthetic data based on vary_param
                 if vary_type == 'features':
                     n_samples, n_features = exp['fixed_val'], val
                 else:
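
The rewritten `_save_results` branches on `save_all`: with save_all=False (the default) repeats are averaged and only the latest row per dataset is kept, while save_all=True appends every run. The following standalone pandas sketch (made-up numbers, not package code) illustrates the two merge strategies used above:

import pandas as pd

# Two repeats of a stochastic method on one dataset; columns are feature percentages.
df_new = pd.DataFrame([
    {"Dataset": "toy", "0.1": 0.81, "0.2": 0.84},
    {"Dataset": "toy", "0.1": 0.79, "0.2": 0.86},
])
df_old = pd.DataFrame([{"Dataset": "toy", "0.1": 0.70, "0.2": 0.75}])

# save_all=False: average the repeats, then keep only the newest row per dataset.
mean_new = df_new.groupby("Dataset").mean().reset_index()
dedup = pd.concat([df_old, mean_new]).drop_duplicates(subset=["Dataset"], keep="last")

# save_all=True: append every repeat, preserving earlier runs as separate rows.
appended = pd.concat([df_old, df_new], ignore_index=True)

print(dedup)     # one row for "toy" holding the newly averaged scores
print(appended)  # three rows for "toy": the old one plus both new repeats
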
@@ -162,62 +149,27 @@ class FSEVAL:
                 try:
                     X = np.random.rand(n_samples, n_features)
                 except MemoryError:
-                    print(f" FATAL: MemoryError: Failed to allocate {n_samples}x{n_features} data.")
                     for m in methods:
                         results[m['name']].append(-1 if m['name'] in timed_out_methods else np.nan)
                     continue
 
-                # 2. Run each method
                 for m_info in methods:
                     name = m_info['name']
                     func = m_info['func']
-
-                    # Check if method has already timed out in this experiment
                     if name in timed_out_methods:
                         results[name].append(-1)
                         continue
 
                     try:
                         start_time = time.time()
-
-                        # Execute the method (assuming benchmark format)
                         func(X)
-
                         duration = time.time() - start_time
-
                         if duration > time_limit:
-                            print(f" - {name:<18}: {duration:.4f}s (TIMEOUT - skipping future runs)")
                             timed_out_methods.add(name)
-                        else:
-                            print(f" - {name:<18}: {duration:.4f}s")
-
                         results[name].append(duration)
-
-                    except Exception as e:
-                        print(f" - {name:<18}: FAILED ({type(e).__name__})")
+                    except Exception:
                         results[name].append(np.nan)
 
-
-
-
-                df_results.index.name = 'Method'
-                df_results.to_csv(filename)
-                print(f"\n--- Results saved to {filename} ---")
-            except Exception as e:
-                print(f"\n--- FAILED to save results: {e} ---")
-
-
-    def _save_results(self, method_name, ds_results):
-        """Aggregates repeats and saves to disk after each dataset."""
-        for scale, metrics in ds_results.items():
-            for met_name, rows in metrics.items():
-                df_new = pd.DataFrame(rows).groupby('Dataset').mean().reset_index()
-                fname = os.path.join(self.output_dir, f"{method_name}_{met_name}_{scale}.csv")
-
-                if os.path.exists(fname):
-                    df_old = pd.read_csv(fname)
-                    df_final = pd.concat([df_old, df_new]).drop_duplicates(subset=['Dataset'], keep='last')
-                else:
-                    df_final = df_new
-
-                df_final.to_csv(fname, index=False)
+            df_results = pd.DataFrame.from_dict(results, orient='index', columns=list(val_range))
+            df_results.index.name = 'Method'
+            df_results.to_csv(filename)
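
The refactored `timer` now builds one DataFrame per experiment straight from the `results` dict, with methods as the index and grid values as the columns, where -1 marks a method that previously timed out and NaN marks a failed run or skipped allocation. A small standalone illustration with hypothetical method names and a shortened grid:

import pandas as pd

val_range = range(1000, 2001, 500)  # the real grid is range(1000, 20001, 500)
results = {"Variance": [0.02, 0.05, 0.11], "Lasso": [1.4, -1, -1]}

df_results = pd.DataFrame.from_dict(results, orient="index", columns=list(val_range))
df_results.index.name = "Method"
# Rows: Variance, Lasso; columns: 1000, 1500, 2000 (one runtime per grid value).
df_results.to_csv("time_analysis_features.csv")
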
sdufseval-1.0.4.dist-info/METADATA → sdufseval-1.0.6.dist-info/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdufseval
-Version: 1.0.4
+Version: 1.0.6
 Summary: Evaluation and Benchmark Tool for Feature Selection
 Project-URL: Homepage, https://github.com/mrajabinasab/FSEVAL
 Project-URL: Bug Tracker, https://github.com/mrajabinasab/FSEVAL/issues
@@ -124,6 +124,7 @@ Initializes the evalutation and benchmark object.
 | **`eval_type`** | both | "supervised", "unsupervised", or "both". |
 | **`metrics`** | ["CLSACC", "NMI", "ACC", "AUC"] | Evaluation metrics to calculate. |
 | **`experiments`** | ["10Percent", "100Percent"] | Which feature ratio grids to evaluate. |
+| **`save_all`** | False | Save the results of all runs of the stochastic methods separately. |
 
 ### ⚙️ `run(datasets, methods, classifier=None)`
 
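
As a hedged sketch of the documented `save_all` option (again assuming `FSEVAL` is importable from the installed package and that the dataset name resolves via the loader): with save_all=True each repeat of a stochastic method is written to the output CSVs as its own row instead of a single averaged row per dataset.

from sdufseval import FSEVAL  # assumption: re-exported by the package __init__

bench = FSEVAL(output_dir="results_all_runs", save_all=True)
bench.run(datasets=["madelon"],
          methods=[{"name": "Random", "func": bench.random_baseline, "stochastic": True}])
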
sdufseval-1.0.6.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+sdufseval/__init__.py,sha256=KIZIPxldHV2TLZVTW1jP6L8-tDw7-iMgmW-e-DeTdvo,271
+sdufseval/eval.py,sha256=445pNvgOo_fcdOKNElDsflDZ6iptxPRKGz5wuxLYPUE,2573
+sdufseval/fseval.py,sha256=VNFFYB72GCesInKeFJftGDg9pd2eI75gkArIPQrf5sY,7594
+sdufseval/loader.py,sha256=YUMSAdi2zcg2MizcGlnCxhsV5Y5cikL1hfk5ofwaI6s,2286
+sdufseval-1.0.6.dist-info/METADATA,sha256=Ja5jQbLYP9Fkb5BnZxQ6S1YiM4BkiD4SEYVIIGtgDwI,5892
+sdufseval-1.0.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sdufseval-1.0.6.dist-info/RECORD,,
sdufseval-1.0.4.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-sdufseval/__init__.py,sha256=KIZIPxldHV2TLZVTW1jP6L8-tDw7-iMgmW-e-DeTdvo,271
-sdufseval/eval.py,sha256=445pNvgOo_fcdOKNElDsflDZ6iptxPRKGz5wuxLYPUE,2573
-sdufseval/fseval.py,sha256=LPBKfXEj6lHSKPSEVv1b9U7V_7jXNBWpd5snD355_Rc,9513
-sdufseval/loader.py,sha256=YUMSAdi2zcg2MizcGlnCxhsV5Y5cikL1hfk5ofwaI6s,2286
-sdufseval-1.0.4.dist-info/METADATA,sha256=1qm0ZQSe4UhcAazHwOdLdoIISx3QpxhKI4qPbtPswOU,5796
-sdufseval-1.0.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-sdufseval-1.0.4.dist-info/RECORD,,
File without changes