sublimex 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sublimex/__init__.py +22 -0
- sublimex/core.py +221 -0
- sublimex/models.py +102 -0
- sublimex/objectives.py +152 -0
- sublimex/py.typed +2 -0
- sublimex/transforms.py +38 -0
- sublimex/visualization.py +131 -0
- sublimex-0.1.0.dist-info/METADATA +360 -0
- sublimex-0.1.0.dist-info/RECORD +12 -0
- sublimex-0.1.0.dist-info/WHEEL +5 -0
- sublimex-0.1.0.dist-info/licenses/LICENSE +21 -0
- sublimex-0.1.0.dist-info/top_level.txt +1 -0
sublimex/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""SublimeX: Supervised Bottom-Up Localized Multi-Representative Feature Extraction."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
__author__ = "J.C. Wolber"
|
|
5
|
+
|
|
6
|
+
from sublimex.core import SublimeX
|
|
7
|
+
from sublimex.transforms import TRANSFORMS, register_transform, get_transform, list_transforms
|
|
8
|
+
from sublimex.objectives import (AGGREGATIONS, mean_objective, aggregate_objective,
|
|
9
|
+
pattern_objective, create_custom_objective)
|
|
10
|
+
from sublimex.models import LightGBMModel, SklearnModelWrapper
|
|
11
|
+
from sublimex.visualization import (plot_feature_importance, plot_segment_on_signal,
|
|
12
|
+
plot_feature_distributions, plot_transform_comparison,
|
|
13
|
+
plot_optimization_history)
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"__version__", "__author__", "SublimeX",
|
|
17
|
+
"TRANSFORMS", "register_transform", "get_transform", "list_transforms",
|
|
18
|
+
"AGGREGATIONS", "mean_objective", "aggregate_objective", "pattern_objective", "create_custom_objective",
|
|
19
|
+
"LightGBMModel", "SklearnModelWrapper",
|
|
20
|
+
"plot_feature_importance", "plot_segment_on_signal", "plot_feature_distributions",
|
|
21
|
+
"plot_transform_comparison", "plot_optimization_history",
|
|
22
|
+
]
|
sublimex/core.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""SublimeX core module - interpretable feature extraction."""
|
|
2
|
+
import numpy as np
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import warnings
|
|
6
|
+
from typing import List, Dict, Any, Optional
|
|
7
|
+
|
|
8
|
+
import optuna
|
|
9
|
+
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
|
|
10
|
+
|
|
11
|
+
from sublimex.transforms import TRANSFORMS
|
|
12
|
+
from sublimex.objectives import default_objective
|
|
13
|
+
from sublimex.models import LightGBMModel
|
|
14
|
+
|
|
15
|
+
warnings.filterwarnings('ignore', category=optuna.exceptions.ExperimentalWarning)
|
|
16
|
+
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class MockTrial:
    """Mock Optuna trial that replays previously saved parameters.

    Used by ``SublimeX._extract_feature`` to re-run an objective function in
    extraction mode with a fixed parameter set instead of live sampling.
    The bound arguments (``low``/``high``/``choices``) are accepted only for
    signature compatibility and are ignored.  Extra Optuna keyword arguments
    (``step=``, ``log=``) are tolerated so objective code written against the
    real ``optuna.Trial`` API keeps working unchanged.
    """

    __slots__ = ('params',)

    def __init__(self, params):
        # params: mapping of parameter name -> previously optimized value
        # (typically a ``study.best_params`` dict, possibly round-tripped
        # through JSON, hence the explicit int()/float() coercions below).
        self.params = params

    def suggest_int(self, name, low, high, *args, **kwargs):
        """Return the stored value for *name*, coerced to int."""
        return int(self.params[name])

    def suggest_float(self, name, low, high, *args, **kwargs):
        """Return the stored value for *name*, coerced to float."""
        return float(self.params[name])

    def suggest_categorical(self, name, choices):
        """Return the stored categorical value for *name* unchanged."""
        return self.params[name]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class SublimeX:
    """SublimeX: Supervised Bottom-Up Localized Multi-Representative Feature Extraction.

    Greedily discovers interpretable features one at a time: each round runs
    an Optuna study over the objective's search space (transform, channel,
    segment position, ...), keeps the best candidate only if it improves the
    internal CV score over the previous round, and stops at the first
    non-improving round.
    """

    def __init__(self, metric='auc', n_trials=300, inner_cv=1, val_size=0.5,
                 verbose=False, show_progress_bar=False, transforms=None,
                 objective_fn=None, model=None, sampler='tpe'):
        # metric: 'auc' | 'accuracy' | 'rmse' — 'rmse' switches everything
        # below into regression/minimize mode.
        self.metric = metric
        # Optuna trials spent per discovered feature (per round).
        self.n_trials = n_trials
        # inner_cv == 1 means a single holdout split of size val_size;
        # otherwise (Stratified)KFold with that many folds.
        self.inner_cv = inner_cv
        self.val_size = val_size
        self.verbose = verbose
        self.show_progress_bar = show_progress_bar
        # Fall back to the package-level transform registry when none given.
        self.transforms = transforms or TRANSFORMS
        self.objective_fn = objective_fn or default_objective
        # None defers model construction to fit(), which picks a LightGBM
        # model matching the metric's task.
        self.model = model
        # 'nsga2' selects NSGAIISampler, anything else a multivariate TPE.
        self.sampler = sampler

        self.extracted_features: List[Dict[str, Any]] = []
        self.transform_names: List[str] = list(self.transforms.keys())
        self.n_channels: Optional[int] = None
        self.n_time: Optional[int] = None
        self._is_fitted: bool = False

    def _apply_transforms(self, data):
        """Apply all transforms to input data.

        data is (n_samples, n_channels, n_time); the result adds a leading
        transform axis: (n_transforms, n_samples, n_channels, n_time).
        Each transform receives the data flattened to rows of length n_time.
        """
        n_samples, n_channels, n_time = data.shape
        out = np.empty((len(self.transform_names), n_samples, n_channels, n_time), dtype=np.float32)
        for ti, tname in enumerate(self.transform_names):
            flat = data.reshape(-1, n_time)
            out[ti] = self.transforms[tname](flat).reshape(n_samples, n_channels, n_time)
        return out

    def _extract_feature(self, params, ctx):
        """Extract a single feature using saved parameters.

        Replays the objective with a MockTrial; in extract-only mode the
        objective short-circuits after stashing its feature in
        ctx['last_feature'] (see _evaluate in objectives).
        """
        ctx['extract_only'] = True
        self.objective_fn(MockTrial(params), ctx)
        ctx['extract_only'] = False
        return ctx['last_feature']

    def _create_cv_splits(self, n_samples, y):
        """Create CV splits for internal evaluation.

        Returns a list of (train_idx, val_idx) pairs. Stratification is used
        for classification metrics only. random_state is fixed so every
        candidate feature is scored on identical splits.
        """
        if self.inner_cv == 1:
            stratify = y if self.metric != 'rmse' else None
            train_idx, val_idx = train_test_split(
                np.arange(n_samples), test_size=self.val_size,
                random_state=42, stratify=stratify)
            return [(train_idx, val_idx)]
        cv_cls = StratifiedKFold if self.metric != 'rmse' else KFold
        return list(cv_cls(self.inner_cv, shuffle=True, random_state=42).split(np.zeros(n_samples), y))

    def _create_sampler(self):
        """Create Optuna sampler."""
        if self.sampler == 'nsga2':
            return optuna.samplers.NSGAIISampler()
        # constant_liar helps with the parallel (n_jobs=-1) optimization below.
        return optuna.samplers.TPESampler(multivariate=True, group=True, constant_liar=True)

    def fit(self, input_series, y):
        """Fit the feature extractor to training data.

        input_series: iterable of per-channel arrays — assumed each of shape
        (n_samples, n_time) so stacking on axis=1 yields
        (n_samples, n_channels, n_time); TODO confirm against callers.
        """
        data = np.stack(list(input_series), axis=1).astype(np.float32)
        n_samples, self.n_channels, self.n_time = data.shape

        if self.verbose:
            print(f"\nSublimeX Feature Extraction")
            print(f" Samples: {n_samples}")
            print(f" Channels: {self.n_channels}")
            print(f" Time points: {self.n_time}")
            print(f" Transforms: {self.transform_names}")
            print(f" Metric: {self.metric}\n")

        # Default model matches the metric's task.
        if self.model is None:
            task = 'regression' if self.metric == 'rmse' else 'classification'
            self.model = LightGBMModel(task=task)

        cv_splits = self._create_cv_splits(n_samples, y)
        transformed = self._apply_transforms(data)

        direction = 'minimize' if self.metric == 'rmse' else 'maximize'
        is_maximize = direction == 'maximize'

        # Shared mutable context handed to the objective on every trial.
        ctx = {
            'transformed': transformed, 'y': y, 'model': self.model,
            'metric': self.metric, 'n_channels': self.n_channels,
            'n_time': self.n_time, 'transform_names': self.transform_names,
            'cv_splits': cv_splits,
        }

        self.extracted_features = []
        current_X = np.empty((n_samples, 0), dtype=np.float32)
        best_score = float('inf') if not is_maximize else -float('inf')
        # Greedy forward selection: one new feature per round, stop when a
        # full study fails to beat the previous round's score.
        while True:
            ctx['current_X'] = current_X

            study = optuna.create_study(direction=direction, sampler=self._create_sampler())
            # NOTE(review): n_jobs=-1 runs trials in threads that all mutate
            # ctx['last_feature']; harmless during optimization since it is
            # only read in extract-only mode — confirm if objectives change.
            study.optimize(lambda t: self.objective_fn(t, ctx), n_trials=self.n_trials,
                           show_progress_bar=self.show_progress_bar, n_jobs=-1)

            improved = (is_maximize and study.best_value > best_score) or \
                       (not is_maximize and study.best_value < best_score)

            if not improved:
                break

            best_score = study.best_value
            params = study.best_params
            self.extracted_features.append(params)

            # Re-extract the winning feature and append it as a new column.
            feat = self._extract_feature(params, ctx)
            current_X = np.hstack([current_X, feat]) if current_X.size else feat

            if self.verbose:
                print(f" Feature {len(self.extracted_features)}: {self.metric}={best_score:.5f}, params={params}")

        self._is_fitted = True
        if self.verbose:
            print(f"\nDiscovered {len(self.extracted_features)} features")
        return self

    def transform(self, input_series):
        """Transform data using extracted features.

        Replays each saved parameter set against freshly transformed data and
        returns an (n_samples, n_features) float32 matrix.
        """

        data = np.stack(list(input_series), axis=1).astype(np.float32)
        n_samples, n_channels, n_time = data.shape
        transformed = self._apply_transforms(data)

        # Minimal context: extraction mode never touches model/y/cv keys.
        ctx = {'transformed': transformed, 'n_time': n_time,
               'n_channels': n_channels, 'transform_names': self.transform_names}

        features = [self._extract_feature(p, ctx) for p in self.extracted_features]
        return np.hstack(features).astype(np.float32)

    def fit_transform(self, input_series, y):
        """Fit and transform in one step."""
        return self.fit(input_series, y).transform(input_series)

    def save_features(self, path):
        """Save extracted feature parameters to JSON file."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        features_list = []
        for i, params in enumerate(self.extracted_features):
            feature_dict = {'feature_id': i + 1}
            # numpy scalars are not JSON-serializable: coerce to builtins.
            for k, v in params.items():
                if isinstance(v, (np.integer, int)):
                    feature_dict[k] = int(v)
                elif isinstance(v, (np.floating, float)):
                    feature_dict[k] = float(v)
                else:
                    feature_dict[k] = v
            features_list.append(feature_dict)
        with open(path, 'w') as f:
            json.dump(features_list, f, indent=2)

    def load_features(self, path):
        """Load feature parameters from JSON file."""
        with open(path, 'r') as f:
            features_list = json.load(f)
        # Drop the synthetic 'feature_id' added by save_features.
        # NOTE(review): the comprehension variable shadows the (closed) file
        # handle name `f` — works, but worth renaming.
        self.extracted_features = [{k: v for k, v in f.items() if k != 'feature_id'}
                                   for f in features_list]
        self._is_fitted = True
        return self

    def get_feature_descriptions(self):
        """Get human-readable descriptions of extracted features.

        NOTE(review): saved params come from study.best_params, which holds
        the optuna parameter names ('ch', 't', 'c', 'r', ...) and no
        'feature_type' key — so feat_type below likely always falls back to
        'mean'; confirm against the objective in use.
        """
        if not self.extracted_features:
            return []

        descriptions = []
        for i, params in enumerate(self.extracted_features):
            t_idx = params.get('t', 0)
            transform = self.transform_names[t_idx] if t_idx < len(self.transform_names) else f"transform_{t_idx}"
            ch = params.get('ch', 0)
            c, r = params.get('c', 0.5), params.get('r', 0.5)

            if self.n_time:
                # Approximate display bounds; may differ by one index from
                # the exact slice computed in objectives.get_segment_indices
                # (which scales by n_time - 1).
                start = int(c * self.n_time - r * self.n_time / 2)
                end = int(c * self.n_time + r * self.n_time / 2)
                pos_str = f"positions {max(0, start)}-{min(self.n_time, end)}"
            else:
                pos_str = f"center={c:.2f}, range={r:.2f}"

            feat_type = params.get('feature_type', 'mean')
            descriptions.append(f"Feature {i+1}: {feat_type} of {transform} in channel {ch}, {pos_str}")
        return descriptions

    def __repr__(self):
        status = "fitted" if self._is_fitted else "not fitted"
        return f"SublimeX(metric='{self.metric}', n_trials={self.n_trials}, status={status}, n_features={len(self.extracted_features)})"
|
sublimex/models.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""ML model wrappers for SublimeX."""
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.metrics import roc_auc_score, accuracy_score, mean_squared_error
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _compute_score(y_true, pred, metric):
    """Compute evaluation metric.

    ``pred`` may be a 2-D probability matrix or a 1-D prediction vector.
    AUC uses the positive-class column for binary problems and one-vs-rest
    for multiclass; accuracy collapses probabilities via argmax first.
    """
    if metric == 'rmse':
        return np.sqrt(mean_squared_error(y_true, pred))
    if metric == 'accuracy':
        labels = pred.argmax(axis=1) if pred.ndim == 2 else pred
        return accuracy_score(y_true, labels)
    if metric == 'auc':
        if pred.ndim != 2:
            return roc_auc_score(y_true, pred)
        if pred.shape[1] == 2:
            # Binary: score on the positive-class probability column.
            return roc_auc_score(y_true, pred[:, 1])
        return roc_auc_score(y_true, pred, multi_class='ovr')
    raise ValueError(f"Unknown metric: {metric}")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LightGBMModel:
    """LightGBM wrapper for SublimeX evaluation loop.

    Parameters
    ----------
    task : 'classification' or 'regression' — selects LGBMClassifier or
        LGBMRegressor.
    max_depth, n_estimators : forwarded to the underlying booster.
        (Bug fix: these constructor arguments were previously accepted but
        silently ignored — `_BASE_PARAMS` hard-coded max_depth=3 and never
        passed n_estimators.)
    early_stopping_rounds : patience for the early-stopping callback used in
        `evaluate`.
    """

    # Shared defaults; per-instance max_depth/n_estimators are merged on top
    # in _params().
    _BASE_PARAMS = {
        'data_sample_strategy': 'goss',
        'verbosity': -1,
        'force_row_wise': True,
    }

    def __init__(self, task='classification', max_depth=3, n_estimators=100, early_stopping_rounds=10):
        self.task = task
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.early_stopping_rounds = early_stopping_rounds
        self.model = None

        # Imported lazily so lightgbm remains an optional dependency.
        from lightgbm import LGBMClassifier, LGBMRegressor, early_stopping
        self._clf_cls = LGBMClassifier
        self._reg_cls = LGBMRegressor
        self._early_stopping = early_stopping
        self._model_cls = self._clf_cls if task == 'classification' else self._reg_cls

    def _params(self, num_threads):
        """Booster params: base defaults plus the instance's tuning knobs."""
        return {**self._BASE_PARAMS, 'max_depth': self.max_depth,
                'n_estimators': self.n_estimators, 'num_threads': num_threads}

    def evaluate(self, X_train, y_train, X_val, y_val, metric):
        """Train with early stopping and score on the validation set.

        Single-threaded (num_threads=1) because the surrounding Optuna loop
        already parallelizes across trials.
        """
        model = self._model_cls(**self._params(num_threads=1))
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                  callbacks=[self._early_stopping(self.early_stopping_rounds, verbose=False)])
        pred = model.predict_proba(X_val) if metric == 'auc' and self.task == 'classification' else model.predict(X_val)
        return _compute_score(y_val, pred, metric)

    def test(self, X_train, y_train, X_test, y_test, metric):
        """Train on full training set and evaluate on test set.

        Stores the fitted model on self.model for later predict calls.
        """
        self.model = self._model_cls(**self._params(num_threads=-1))
        self.model.fit(X_train, y_train)
        pred = self.model.predict_proba(X_test) if metric == 'auc' and self.task == 'classification' else self.model.predict(X_test)
        return _compute_score(y_test, pred, metric)

    def predict(self, X):
        """Point predictions from the model fitted by `test`."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Class probabilities from the model fitted by `test`."""
        return self.model.predict_proba(X)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class SklearnModelWrapper:
    """Wrapper for any sklearn estimator."""

    def __init__(self, estimator, clone_for_each_eval=True):
        self.estimator = estimator
        self.clone_for_each_eval = clone_for_each_eval
        self.model = None
        # Duck-typing: anything exposing predict_proba is treated as a classifier.
        self._is_classifier = hasattr(estimator, 'predict_proba')
        from sklearn.base import clone
        self._clone = clone

    def _get_estimator(self):
        # Clone per evaluation (default) so repeated fits stay independent.
        if not self.clone_for_each_eval:
            return self.estimator
        return self._clone(self.estimator)

    def _predict_for(self, fitted, X, metric):
        # AUC needs probabilities; every other metric uses point predictions.
        if metric == 'auc' and self._is_classifier:
            return fitted.predict_proba(X)
        return fitted.predict(X)

    def evaluate(self, X_train, y_train, X_val, y_val, metric):
        """Train a (possibly cloned) estimator and score it on the validation set."""
        fitted = self._get_estimator()
        fitted.fit(X_train, y_train)
        return _compute_score(y_val, self._predict_for(fitted, X_val, metric), metric)

    def test(self, X_train, y_train, X_test, y_test, metric):
        """Train on full training set and evaluate on test set."""
        self.model = self._get_estimator()
        self.model.fit(X_train, y_train)
        return _compute_score(y_test, self._predict_for(self.model, X_test, metric), metric)

    def predict(self, X):
        """Point predictions from the model fitted by `test`."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Class probabilities from the model fitted by `test`."""
        return self.model.predict_proba(X)
|
sublimex/objectives.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Objective functions for SublimeX optimization."""
|
|
2
|
+
import numpy as np
|
|
3
|
+
from numpy.lib.stride_tricks import sliding_window_view
|
|
4
|
+
from typing import Callable, Dict, Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Aggregation functions: each maps a (n_samples, segment_len) slice to a
# (n_samples, 1) feature column. 'argmin'/'argmax' are divided by the segment
# length so the position is expressed in [0, 1] regardless of segment width
# (the max(..., 1) guards against division by zero for length-1 segments).
AGGREGATIONS = {
    'mean': lambda x: x.mean(axis=1, keepdims=True),
    'min': lambda x: x.min(axis=1, keepdims=True),
    'max': lambda x: x.max(axis=1, keepdims=True),
    'range': lambda x: np.ptp(x, axis=1, keepdims=True),
    'std': lambda x: x.std(axis=1, keepdims=True),
    'median': lambda x: np.median(x, axis=1, keepdims=True),
    'argmin': lambda x: x.argmin(axis=1, keepdims=True).astype(np.float32) / max(x.shape[1] - 1, 1),
    'argmax': lambda x: x.argmax(axis=1, keepdims=True).astype(np.float32) / max(x.shape[1] - 1, 1),
}
# Stable ordering of aggregation names, used as the categorical choice set
# for trials in aggregate_objective.
AGG_KEYS = list(AGGREGATIONS.keys())
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_segment_indices(center: float, range_val: float, n_time: int) -> tuple:
    """Map a normalized (center, range) pair onto inclusive segment bounds.

    Both inputs are in [0, 1]; the result is a (start, end) pair of indices
    clamped to [0, n_time - 1], with end inclusive.
    """
    span = n_time - 1
    midpoint = center * span
    half = 0.5 * (range_val * span)
    lo = int(midpoint - half)
    hi = int(midpoint + half)
    return max(0, lo), min(span, hi)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _evaluate(feature: np.ndarray, ctx: Dict[str, Any]) -> float:
|
|
30
|
+
"""Evaluate a candidate feature using cross-validation."""
|
|
31
|
+
ctx['last_feature'] = feature
|
|
32
|
+
if ctx.get('extract_only'):
|
|
33
|
+
return 0.0
|
|
34
|
+
|
|
35
|
+
current_X = ctx['current_X']
|
|
36
|
+
X = np.hstack([current_X, feature]) if current_X.size else feature
|
|
37
|
+
|
|
38
|
+
scores = []
|
|
39
|
+
for train_idx, val_idx in ctx['cv_splits']:
|
|
40
|
+
score = ctx['model'].evaluate(X[train_idx], ctx['y'][train_idx],
|
|
41
|
+
X[val_idx], ctx['y'][val_idx], ctx['metric'])
|
|
42
|
+
scores.append(score)
|
|
43
|
+
return np.mean(scores)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_segment(trial, ctx: Dict[str, Any], feature_type: str) -> tuple:
    """Sample segment parameters from the trial and slice the data.

    Suggests channel, transform index, normalized center and range (in that
    order, matching MockTrial replay), then returns the sliced
    (n_samples, segment_len) view together with a descriptive params dict.
    """
    channel = trial.suggest_int('ch', 0, ctx['n_channels'] - 1)
    t_idx = trial.suggest_int('t', 0, len(ctx['transform_names']) - 1)
    center = trial.suggest_float('c', 0, 1)
    rng = trial.suggest_float('r', 0, 1)

    start, end = get_segment_indices(center, rng, ctx['n_time'])
    # end is inclusive, hence the +1 in the slice.
    segment = ctx['transformed'][t_idx, :, channel, start:end + 1]

    params = {
        'feature_type': feature_type,
        'channel': channel,
        'transform_idx': t_idx,
        'transform_name': ctx['transform_names'][t_idx],
        'center': center,
        'range': rng,
        'start_idx': start,
        'end_idx': end,
    }
    return segment, params
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def mean_objective(trial, ctx: Dict[str, Any]) -> float:
    """Mean objective - extract mean value over optimized segment."""
    segment, _ = _get_segment(trial, ctx, 'mean')
    column = np.mean(segment, axis=1, keepdims=True).astype(np.float32)
    return _evaluate(column, ctx)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def aggregate_objective(trial, ctx: Dict[str, Any]) -> float:
    """Aggregate objective - choose from multiple aggregation functions."""
    segment, _ = _get_segment(trial, ctx, 'aggregate')
    # Let the trial also pick which aggregation summarizes the segment.
    chosen = trial.suggest_categorical('agg', AGG_KEYS)
    column = AGGREGATIONS[chosen](segment).astype(np.float32)
    return _evaluate(column, ctx)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def pattern_objective(trial, ctx: Dict[str, Any]) -> float:
    """Pattern objective - quadratic B-spline matching.

    Builds a quadratic Bernstein curve from three control points and scores
    each sample by the minimum RMS distance between that template and any
    sliding window inside the chosen segment.
    """
    segment, params = _get_segment(trial, ctx, 'pattern')

    # Template width as a fraction of the full series length, plus the three
    # control points (all in [0, 1]) defining the curve.
    w = trial.suggest_float('w', 0.05, 0.5)
    cp0 = trial.suggest_float('cp0', 0, 1)
    cp1 = trial.suggest_float('cp1', 0, 1)
    cp2 = trial.suggest_float('cp2', 0, 1)

    n_time = ctx['n_time']
    n_samples, segment_len = segment.shape
    # Clamp template width to at least 2 samples but no wider than the segment.
    width = min(max(2, int(w * n_time)), segment_len)

    # Degenerate case: emit an infinite-distance feature so this trial cannot
    # win. NOTE(review): with the clamp above, width <= segment_len always
    # holds, so this branch looks unreachable — confirm before removing.
    if width > segment_len or segment_len - width + 1 <= 0:
        return _evaluate(np.full((n_samples, 1), np.inf, dtype=np.float32), ctx)

    # Quadratic Bernstein basis over [0, 1].
    t = np.linspace(0, 1, width, dtype=np.float32)
    pattern = (1 - t) ** 2 * cp0 + 2 * (1 - t) * t * cp1 + t ** 2 * cp2

    # Distance of every sliding window to the template, normalized by
    # sqrt(width) so different template widths are comparable.
    windows = sliding_window_view(segment, window_shape=width, axis=1)
    distances = np.linalg.norm(windows - pattern, axis=2) / np.sqrt(width)
    # Best (smallest) match per sample is the feature value.
    feature = distances.min(axis=1, keepdims=True).astype(np.float32)

    return _evaluate(feature, ctx)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# Objective used by SublimeX when the caller supplies no objective_fn.
default_objective = mean_objective
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def create_custom_objective(aggregation_fn: Callable, name: str = 'custom') -> Callable:
    """Create a custom objective with user-defined aggregation.

    aggregation_fn receives the (n_samples, segment_len) segment and must
    return an (n_samples, k) feature array.
    """
    def custom_objective(trial, ctx: Dict[str, Any]) -> float:
        segment, _ = _get_segment(trial, ctx, name)
        column = aggregation_fn(segment).astype(np.float32)
        return _evaluate(column, ctx)

    # Give the closure a recognizable name for logging/debugging.
    custom_objective.__name__ = f'{name}_objective'
    return custom_objective
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def parallel_objective(trial, ctx):
    """Parallel objective: optimize multiple features simultaneously.

    Suggests parameters for N features in one trial (ch_0, t_0, c_0, r_0,
    ..., ch_{N-1}, ...) and evaluates the combined feature matrix, instead
    of discovering one feature per study.
    """
    # Number of features to optimize jointly; 10 if the caller set no budget.
    target = ctx.get('n_target_features', 10)

    columns = []
    for idx in range(target):
        # Per-feature parameters, suggested in the same order as _get_segment.
        channel = trial.suggest_int(f'ch_{idx}', 0, ctx['n_channels'] - 1)
        transform = trial.suggest_int(f't_{idx}', 0, len(ctx['transform_names']) - 1)
        center = trial.suggest_float(f'c_{idx}', 0, 1)
        span = trial.suggest_float(f'r_{idx}', 0, 1)
        lo, hi = get_segment_indices(center, span, ctx['n_time'])
        window = ctx['transformed'][transform, :, channel, lo:hi + 1]
        columns.append(window.mean(axis=1, keepdims=True).astype(np.float32))

    # Combined (n_samples, target) feature matrix.
    stacked = np.hstack(columns) if len(columns) > 1 else columns[0]

    # Stash for extraction mode before any model work.
    ctx['last_feature'] = stacked
    if ctx.get('extract_only'):
        return 0.0

    prior = ctx['current_X']
    X = np.hstack([prior, stacked]) if prior.size else stacked
    model, y, metric, splits = ctx['model'], ctx['y'], ctx['metric'], ctx['cv_splits']
    scores = [model.evaluate(X[tr], y[tr], X[va], y[va], metric) for tr, va in splits]
    return sum(scores) / len(splits)
|
sublimex/py.typed
ADDED
sublimex/transforms.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Signal transforms for SublimeX feature extraction."""
|
|
2
|
+
import numpy as np
|
|
3
|
+
from typing import Callable, Dict, List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _fft_power(d):
|
|
7
|
+
"""FFT power spectrum interpolated to original length."""
|
|
8
|
+
p = np.abs(np.fft.rfft(d, axis=-1)) ** 2
|
|
9
|
+
x = np.linspace(0, p.shape[-1]-1, d.shape[-1])
|
|
10
|
+
idx = np.minimum(x.astype(int), p.shape[-1]-2)
|
|
11
|
+
return p[..., idx] * (1 - (x - idx)) + p[..., idx + 1] * (x - idx)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
TRANSFORMS: Dict[str, Callable] = {
|
|
15
|
+
'raw': lambda d: d,
|
|
16
|
+
'zscore': lambda d: (d - d.mean(-1, keepdims=True)) / (d.std(-1, keepdims=True) + 1e-8),
|
|
17
|
+
'derivative': lambda d: np.gradient(d, axis=-1),
|
|
18
|
+
'fft_power': _fft_power,
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def register_transform(name: str, func: Callable, overwrite: bool = False):
|
|
23
|
+
"""Register a custom transform function."""
|
|
24
|
+
if name in TRANSFORMS and not overwrite:
|
|
25
|
+
raise ValueError(f"Transform '{name}' already exists. Use overwrite=True.")
|
|
26
|
+
TRANSFORMS[name] = func
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_transform(name: str) -> Callable:
|
|
30
|
+
"""Get transform by name."""
|
|
31
|
+
if name not in TRANSFORMS:
|
|
32
|
+
raise KeyError(f"Transform '{name}' not found. Available: {list(TRANSFORMS.keys())}")
|
|
33
|
+
return TRANSFORMS[name]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def list_transforms() -> List[str]:
|
|
37
|
+
"""List available transform names."""
|
|
38
|
+
return list(TRANSFORMS.keys())
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Visualization utilities for SublimeX."""
|
|
2
|
+
import numpy as np
|
|
3
|
+
from typing import List, Dict, Any, Optional, Tuple, Union
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _import_plt():
    """Import matplotlib.pyplot lazily so matplotlib stays an optional dependency."""
    import matplotlib.pyplot as plt
    return plt
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def plot_feature_importance(importances, descriptions=None, top_k=None,
                            figsize=(10, 6), color='#2ecc71'):
    """Plot feature importance as horizontal bar chart.

    Parameters
    ----------
    importances : array-like of importance scores (plain lists accepted).
    descriptions : optional bar labels; defaults to 'Feature i'.
    top_k : if set, show only the k most important features.

    Returns the matplotlib Figure.
    """
    plt = _import_plt()
    # Accept plain Python lists: the fancy indexing `importances[indices]`
    # below requires an ndarray and previously crashed on lists.
    importances = np.asarray(importances)
    n = len(importances)
    if descriptions is None:
        descriptions = [f'Feature {i+1}' for i in range(n)]

    # Most important first.
    indices = np.argsort(importances)[::-1]
    if top_k:
        indices = indices[:top_k]

    fig, ax = plt.subplots(figsize=figsize)
    y_pos = np.arange(len(indices))
    ax.barh(y_pos, importances[indices], color=color)
    ax.set_yticks(y_pos)
    ax.set_yticklabels([descriptions[i] for i in indices])
    # Highest importance at the top.
    ax.invert_yaxis()
    ax.set_xlabel('Importance')
    plt.tight_layout()
    return fig
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def plot_segment_on_signal(signal, feature_params, n_time, transform_names=None,
                           sample_idx=0, figsize=(12, 4)):
    """Visualize where a feature's segment falls on the signal."""
    plt = _import_plt()
    # Accept either a single 1-D trace or a batch indexed by sample_idx.
    trace = signal if signal.ndim == 1 else signal[sample_idx]

    center = feature_params.get('c', 0.5)
    span = feature_params.get('r', 0.5)
    t_idx = feature_params.get('t', 0)
    # Same (center, range) -> index mapping as objectives.get_segment_indices.
    mid = center * (n_time - 1)
    half = (span * (n_time - 1)) * 0.5
    start = max(0, int(mid - half))
    end = min(n_time - 1, int(mid + half))

    if transform_names and t_idx < len(transform_names):
        transform_name = transform_names[t_idx]
    else:
        transform_name = f'transform_{t_idx}'

    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(trace, color='#3498db', linewidth=1.5, label='Signal')
    ax.axvspan(start, end, alpha=0.3, color='#e74c3c', label=f'Segment [{start}:{end}]')
    for edge in (start, end):
        ax.axvline(edge, color='#e74c3c', linestyle='--', alpha=0.7)
    ax.set_xlabel('Time')
    ax.set_ylabel('Value')
    ax.set_title(f'Feature segment (transform: {transform_name})')
    ax.legend()
    plt.tight_layout()
    return fig
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def plot_feature_distributions(features, y, feature_idx=0, feature_description=None,
                               figsize=(10, 5), bins=30):
    """Plot distribution of a feature's values across classes.

    Left panel: per-class histograms; right panel: per-class boxplots.
    Returns the matplotlib Figure.
    """
    plt = _import_plt()
    feat_values = features[:, feature_idx]
    classes = np.unique(y)
    # plt.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9;
    # plt.get_cmap works across old and new versions.
    colors = plt.get_cmap('tab10')(range(len(classes)))

    fig, axes = plt.subplots(1, 2, figsize=figsize)

    for i, cls in enumerate(classes):
        mask = y == cls
        axes[0].hist(feat_values[mask], bins=bins, alpha=0.6, color=colors[i], label=f'Class {cls}')
    axes[0].set_xlabel('Feature Value')
    axes[0].set_ylabel('Count')
    axes[0].legend()

    # Avoid the boxplot `labels=` kwarg (renamed to `tick_labels` in
    # matplotlib 3.9); setting tick labels explicitly is version-agnostic.
    bp = axes[1].boxplot([feat_values[y == c] for c in classes], patch_artist=True)
    axes[1].set_xticklabels([f'Class {c}' for c in classes])
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.6)
    axes[1].set_ylabel('Feature Value')

    fig.suptitle(feature_description or f'Feature {feature_idx + 1}', fontweight='bold')
    plt.tight_layout()
    return fig
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def plot_transform_comparison(signal, transforms, figsize=(12, 8), sample_idx=0):
    """Show a signal under all available transforms."""
    plt = _import_plt()
    # Transforms expect a 2-D (batch, time) input; keep a single-row batch.
    if signal.ndim == 1:
        sample = signal.reshape(1, -1)
    else:
        sample = signal[sample_idx:sample_idx+1]

    count = len(transforms)
    n_cols = 2
    n_rows = (count + 1) // 2
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
    axes = axes.flatten()
    palette = plt.cm.viridis(np.linspace(0, 0.8, count))

    for pos, (name, fn) in enumerate(transforms.items()):
        ax = axes[pos]
        try:
            ax.plot(fn(sample)[0], color=palette[pos], linewidth=1.5)
            ax.set_title(name, fontweight='bold')
        except Exception as e:
            # A failing transform gets an inline error label instead of
            # aborting the whole figure.
            ax.text(0.5, 0.5, f'Error: {str(e)[:30]}', ha='center', va='center', transform=ax.transAxes)
        ax.set_xlabel('Time')
        ax.set_ylabel('Value')

    # Hide unused grid cells when the count is odd.
    for pos in range(count, len(axes)):
        axes[pos].set_visible(False)

    fig.suptitle('Signal Transforms Comparison', fontsize=14, fontweight='bold')
    plt.tight_layout()
    return fig
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def plot_optimization_history(study_or_scores, figsize=(10, 5), color='#9b59b6', maximize=True):
    """Plot optimization progress over trials.

    Parameters
    ----------
    study_or_scores : an Optuna study (its completed trial values are used)
        or a plain sequence of scores.
    maximize : when False (e.g. for rmse), 'Best so far' tracks the running
        minimum instead of the running maximum. Defaults to True, matching
        the previous behavior.

    Returns the matplotlib Figure.
    """
    plt = _import_plt()
    if hasattr(study_or_scores, 'trials'):
        # Skip failed/pruned trials that carry no value.
        scores = [t.value for t in study_or_scores.trials if t.value is not None]
    else:
        scores = list(study_or_scores)

    fig, ax = plt.subplots(figsize=figsize)
    trials = np.arange(1, len(scores) + 1)
    # Direction-aware running best; previously always maximum, which was
    # wrong for minimized metrics.
    best = np.maximum.accumulate(scores) if maximize else np.minimum.accumulate(scores)
    ax.plot(trials, scores, alpha=0.3, color=color, label='Trial scores')
    ax.plot(trials, best, color=color, linewidth=2, label='Best so far')
    ax.set_xlabel('Trial')
    ax.set_ylabel('Score')
    ax.set_title('Optimization Progress')
    ax.legend()
    plt.tight_layout()
    return fig
|
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sublimex
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: SublimeX: Supervised Bottom-Up Localized Multi-Representative Feature eXtraction for time series and spatial data
|
|
5
|
+
Author-email: "J.C. Wolber" <jwolber@ukaachen.de>
|
|
6
|
+
Maintainer-email: "J.C. Wolber" <jwolber@ukaachen.de>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/Prgrmmrjns/SublimeX
|
|
9
|
+
Project-URL: Documentation, https://github.com/Prgrmmrjns/SublimeX#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/Prgrmmrjns/SublimeX.git
|
|
11
|
+
Project-URL: Issues, https://github.com/Prgrmmrjns/SublimeX/issues
|
|
12
|
+
Keywords: time series,feature extraction,machine learning,interpretable ml,signal processing,classification,regression,bayesian optimization
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
27
|
+
Requires-Python: >=3.9
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Dist: numpy>=1.21.0
|
|
31
|
+
Requires-Dist: scipy>=1.7.0
|
|
32
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
33
|
+
Requires-Dist: optuna>=3.0.0
|
|
34
|
+
Requires-Dist: lightgbm>=3.3.0
|
|
35
|
+
Provides-Extra: visualization
|
|
36
|
+
Requires-Dist: matplotlib>=3.5.0; extra == "visualization"
|
|
37
|
+
Provides-Extra: full
|
|
38
|
+
Requires-Dist: matplotlib>=3.5.0; extra == "full"
|
|
39
|
+
Requires-Dist: pandas>=1.3.0; extra == "full"
|
|
40
|
+
Provides-Extra: dev
|
|
41
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
42
|
+
Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
|
|
43
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
44
|
+
Requires-Dist: flake8>=4.0.0; extra == "dev"
|
|
45
|
+
Requires-Dist: mypy>=0.950; extra == "dev"
|
|
46
|
+
Requires-Dist: matplotlib>=3.5.0; extra == "dev"
|
|
47
|
+
Requires-Dist: pandas>=1.3.0; extra == "dev"
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
|
|
50
|
+
# SublimeX
|
|
51
|
+
|
|
52
|
+
**Supervised Bottom-Up Localized Multi-Representative Feature eXtraction**
|
|
53
|
+
|
|
54
|
+
[](https://badge.fury.io/py/sublimex)
|
|
55
|
+
[](https://www.python.org/downloads/)
|
|
56
|
+
[](https://opensource.org/licenses/MIT)
|
|
57
|
+
|
|
58
|
+
SublimeX is an interpretable feature extraction framework for time series and spatial data. It discovers a minimal set of task-specific features through Bayesian optimization, where each feature has explicit, human-readable semantics.
|
|
59
|
+
|
|
60
|
+
## Key Features
|
|
61
|
+
|
|
62
|
+
- **Minimal feature sets**: Typically 5-15 features vs. thousands from other methods
|
|
63
|
+
- **Full interpretability**: Each feature = statistic over optimized segment of transformed signal
|
|
64
|
+
- **Competitive performance**: Matches deep learning on many tasks
|
|
65
|
+
- **Modular design**: Custom transforms, objectives, and ML models
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install sublimex
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or install from source:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
git clone https://github.com/Prgrmmrjns/SublimeX.git
|
|
77
|
+
cd SublimeX
|
|
78
|
+
pip install -e .
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Quick Start
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
import sublimex
|
|
85
|
+
import numpy as np
|
|
86
|
+
from sklearn.model_selection import train_test_split
|
|
87
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
88
|
+
|
|
89
|
+
# Your data: list of arrays/DataFrames, one per channel
|
|
90
|
+
# Each array has shape (n_samples, n_time_points)
|
|
91
|
+
X = [channel1_data, channel2_data, channel3_data] # 3 channels
|
|
92
|
+
y = labels # Binary or continuous targets
|
|
93
|
+
|
|
94
|
+
# Split data
|
|
95
|
+
idx_train, idx_test = train_test_split(
    range(len(y)), test_size=0.2, stratify=y, random_state=42
)
X_train = [x.iloc[idx_train] for x in X]
X_test = [x.iloc[idx_test] for x in X]
y_train, y_test = y[idx_train], y[idx_test]
|
|
101
|
+
|
|
102
|
+
# Fit SublimeX
|
|
103
|
+
model = sublimex.SublimeX(metric='auc', n_trials=100, verbose=True)
|
|
104
|
+
train_features = model.fit_transform(X_train, y_train)
|
|
105
|
+
test_features = model.transform(X_test)
|
|
106
|
+
|
|
107
|
+
# Use with any classifier
|
|
108
|
+
clf = RandomForestClassifier()
|
|
109
|
+
clf.fit(train_features, y_train)
|
|
110
|
+
predictions = clf.predict(test_features)
|
|
111
|
+
|
|
112
|
+
# Interpret discovered features
|
|
113
|
+
for desc in model.get_feature_descriptions():
|
|
114
|
+
print(desc)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## How It Works
|
|
118
|
+
|
|
119
|
+
SublimeX discovers discriminative features through a simple but effective process:
|
|
120
|
+
|
|
121
|
+
1. **Signal Transformation**: Apply multiple transforms to create different "views" of the data (raw, z-score normalized, derivative, FFT power spectrum)
|
|
122
|
+
|
|
123
|
+
2. **Segment Optimization**: Use Bayesian optimization (Optuna) to find segments that maximize downstream model performance
|
|
124
|
+
|
|
125
|
+
3. **Feature Extraction**: Compute statistics (e.g., mean) over discovered segments
|
|
126
|
+
|
|
127
|
+
4. **Iterative Discovery**: Repeat until adding new features no longer improves performance
|
|
128
|
+
|
|
129
|
+
Each discovered feature is fully interpretable:
|
|
130
|
+
> "Mean of z-score normalized signal in channel 2, positions 40-60"
|
|
131
|
+
|
|
132
|
+
## Configuration
|
|
133
|
+
|
|
134
|
+
### Basic Parameters
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
import sublimex
|
|
138
|
+
|
|
139
|
+
model = sublimex.SublimeX(
|
|
140
|
+
metric='auc', # 'auc', 'accuracy', or 'rmse'
|
|
141
|
+
n_trials=300, # Optimization trials per feature
|
|
142
|
+
inner_cv=1, # Internal CV folds (1 = single split)
|
|
143
|
+
val_size=0.5, # Validation size when inner_cv=1
|
|
144
|
+
verbose=True, # Print progress
|
|
145
|
+
)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Custom Transforms
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
import sublimex
|
|
152
|
+
|
|
153
|
+
# Register a custom transform
|
|
154
|
+
def hilbert_envelope(data):
|
|
155
|
+
from scipy.signal import hilbert
|
|
156
|
+
return np.abs(hilbert(data, axis=-1))
|
|
157
|
+
|
|
158
|
+
sublimex.register_transform('hilbert', hilbert_envelope)
|
|
159
|
+
|
|
160
|
+
# Use specific transforms
|
|
161
|
+
model = sublimex.SublimeX(
|
|
162
|
+
transforms={
|
|
163
|
+
'raw': lambda x: x,
|
|
164
|
+
'hilbert': hilbert_envelope,
|
|
165
|
+
}
|
|
166
|
+
)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Custom Objective Functions
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
import sublimex
|
|
173
|
+
import numpy as np
|
|
174
|
+
|
|
175
|
+
# Create objective with custom aggregation
|
|
176
|
+
def rms(segment):
|
|
177
|
+
"""Root mean square."""
|
|
178
|
+
return np.sqrt((segment ** 2).mean(axis=1, keepdims=True))
|
|
179
|
+
|
|
180
|
+
rms_objective = sublimex.create_custom_objective(rms, 'rms')
|
|
181
|
+
model = sublimex.SublimeX(objective_fn=rms_objective)
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### Custom ML Models
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
import sublimex
|
|
188
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
189
|
+
|
|
190
|
+
# Wrap any sklearn estimator
|
|
191
|
+
rf = RandomForestClassifier(n_estimators=100)
|
|
192
|
+
model = sublimex.SublimeX(model=sublimex.SklearnModelWrapper(rf))
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Built-in Options
|
|
196
|
+
|
|
197
|
+
### Transforms
|
|
198
|
+
|
|
199
|
+
| Transform | Description | Use Case |
|
|
200
|
+
|-----------|-------------|----------|
|
|
201
|
+
| `raw` | Identity (original signal) | Amplitude differences |
|
|
202
|
+
| `zscore` | Z-score normalization | Shape differences |
|
|
203
|
+
| `derivative` | First-order gradient | Rate of change, transitions |
|
|
204
|
+
| `fft_power` | FFT power spectrum | Frequency content, periodicity |
|
|
205
|
+
|
|
206
|
+
### Aggregations (for `aggregate_objective`)
|
|
207
|
+
|
|
208
|
+
| Aggregation | Description |
|
|
209
|
+
|-------------|-------------|
|
|
210
|
+
| `mean` | Average value (default) |
|
|
211
|
+
| `min`, `max` | Extreme values |
|
|
212
|
+
| `range` | max - min |
|
|
213
|
+
| `std` | Standard deviation |
|
|
214
|
+
| `median` | Robust central tendency |
|
|
215
|
+
| `argmin`, `argmax` | Position of extrema |
|
|
216
|
+
|
|
217
|
+
### Objectives
|
|
218
|
+
|
|
219
|
+
| Objective | Description |
|
|
220
|
+
|-----------|-------------|
|
|
221
|
+
| `mean_objective` | Mean over segment (default, most interpretable) |
|
|
222
|
+
| `aggregate_objective` | Choose from 8 aggregations |
|
|
223
|
+
| `pattern_objective` | B-spline pattern matching |
|
|
224
|
+
|
|
225
|
+
## Visualization
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
import sublimex
|
|
229
|
+
import matplotlib.pyplot as plt
|
|
230
|
+
|
|
231
|
+
# Compare feature values across classes
|
|
232
|
+
fig = sublimex.plot_feature_distributions(features, y, feature_idx=0)
|
|
233
|
+
plt.show()
|
|
234
|
+
|
|
235
|
+
# Show where a feature's segment falls on the signal
|
|
236
|
+
fig = sublimex.plot_segment_on_signal(
|
|
237
|
+
signal, model.extracted_features[0], model.n_time
|
|
238
|
+
)
|
|
239
|
+
plt.show()
|
|
240
|
+
|
|
241
|
+
# Compare all transforms for a sample
|
|
242
|
+
fig = sublimex.plot_transform_comparison(signal, sublimex.TRANSFORMS)
|
|
243
|
+
plt.show()
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Saving and Loading
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
import sublimex
|
|
250
|
+
|
|
251
|
+
# Save discovered features
|
|
252
|
+
model.save_features('features.json')
|
|
253
|
+
|
|
254
|
+
# Load and reuse
|
|
255
|
+
new_model = sublimex.SublimeX()
|
|
256
|
+
new_model.load_features('features.json')
|
|
257
|
+
features = new_model.transform(X_new)
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## Complete Example: Synthetic Multi-Channel Time Series
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
"""Complete example with synthetic data."""
|
|
264
|
+
import sublimex
|
|
265
|
+
import numpy as np
|
|
266
|
+
import pandas as pd
|
|
267
|
+
from sklearn.model_selection import train_test_split
|
|
268
|
+
from sklearn.metrics import roc_auc_score
|
|
269
|
+
import lightgbm as lgb
|
|
270
|
+
|
|
271
|
+
# Generate synthetic multi-channel time series
|
|
272
|
+
np.random.seed(42)
|
|
273
|
+
n_samples, n_channels, n_bins = 500, 5, 200
|
|
274
|
+
|
|
275
|
+
X = []
|
|
276
|
+
y = np.random.randint(0, 2, n_samples)
|
|
277
|
+
|
|
278
|
+
for channel in range(n_channels):
|
|
279
|
+
channel_data = []
|
|
280
|
+
for i in range(n_samples):
|
|
281
|
+
t = np.linspace(0, 4 * np.pi, n_bins)
|
|
282
|
+
if y[i] == 1:
|
|
283
|
+
# Class 1: Higher amplitude with distinctive peak
|
|
284
|
+
signal = 2.0 * np.sin(t) + 1.5 * np.sin(2 * t)
|
|
285
|
+
peak = n_bins // 2
|
|
286
|
+
signal[peak-20:peak+20] += 1.5 * np.exp(-((np.arange(40) - 20) ** 2) / 50)
|
|
287
|
+
else:
|
|
288
|
+
# Class 0: Lower amplitude, more noise
|
|
289
|
+
signal = 1.0 * np.sin(t) + 0.5 * np.sin(3 * t)
|
|
290
|
+
signal += np.random.normal(0, 0.3, n_bins)
|
|
291
|
+
channel_data.append(signal)
|
|
292
|
+
|
|
293
|
+
X.append(pd.DataFrame(channel_data, columns=[f'bin_{j}' for j in range(n_bins)]))
|
|
294
|
+
|
|
295
|
+
# Split data
|
|
296
|
+
idx_train, idx_test = train_test_split(
|
|
297
|
+
range(len(y)), test_size=0.2, stratify=y, random_state=42
|
|
298
|
+
)
|
|
299
|
+
X_train = [x.iloc[idx_train].astype(np.float32) for x in X]
|
|
300
|
+
X_test = [x.iloc[idx_test].astype(np.float32) for x in X]
|
|
301
|
+
y_train, y_test = y[idx_train], y[idx_test]
|
|
302
|
+
|
|
303
|
+
# Discover features with SublimeX
|
|
304
|
+
model = sublimex.SublimeX(metric='auc', n_trials=50, verbose=True)
|
|
305
|
+
train_features = model.fit_transform(X_train, y_train)
|
|
306
|
+
test_features = model.transform(X_test)
|
|
307
|
+
|
|
308
|
+
print(f"Discovered {train_features.shape[1]} features")
|
|
309
|
+
|
|
310
|
+
# Train classifier
|
|
311
|
+
clf = lgb.LGBMClassifier(n_estimators=100, verbose=-1)
|
|
312
|
+
clf.fit(train_features, y_train)
|
|
313
|
+
auc = roc_auc_score(y_test, clf.predict_proba(test_features)[:, 1])
|
|
314
|
+
print(f"Test AUC: {auc:.4f}")
|
|
315
|
+
|
|
316
|
+
# Interpret features
|
|
317
|
+
print("\nDiscovered Features:")
|
|
318
|
+
for desc in model.get_feature_descriptions():
|
|
319
|
+
print(f" {desc}")
|
|
320
|
+
|
|
321
|
+
# Visualize
|
|
322
|
+
import matplotlib.pyplot as plt
|
|
323
|
+
|
|
324
|
+
fig = sublimex.plot_feature_importance(
|
|
325
|
+
clf.feature_importances_,
|
|
326
|
+
model.get_feature_descriptions()
|
|
327
|
+
)
|
|
328
|
+
plt.show()
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
## Comparison with Other Methods
|
|
332
|
+
|
|
333
|
+
| Method | Features | Interpretability | Optimization |
|
|
334
|
+
|--------|----------|------------------|--------------|
|
|
335
|
+
| **SublimeX** | 5-15 | High (explicit segments) | Bayesian |
|
|
336
|
+
| tsfresh | 100-800 | Medium (statistical) | Filter |
|
|
337
|
+
| catch22 | 22 | Medium (fixed set) | None |
|
|
338
|
+
| MiniRocket | ~10,000 | Low | Deterministic |
|
|
339
|
+
| RDST | 2k-10k | Medium (shapelets) | Random |
|
|
340
|
+
|
|
341
|
+
## Citation
|
|
342
|
+
|
|
343
|
+
If you use SublimeX in your research, please cite:
|
|
344
|
+
|
|
345
|
+
```bibtex
|
|
346
|
+
@software{sublimex2025,
|
|
347
|
+
title={SublimeX: Supervised Bottom-Up Localized Multi-Representative Feature eXtraction},
|
|
348
|
+
author={Wolber, J.C.},
|
|
349
|
+
year={2025},
|
|
350
|
+
url={https://github.com/Prgrmmrjns/SublimeX}
|
|
351
|
+
}
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
## License
|
|
355
|
+
|
|
356
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
357
|
+
|
|
358
|
+
## Contributing
|
|
359
|
+
|
|
360
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
sublimex/__init__.py,sha256=_IwnQG13qAVNG3K54Z10iZ7IIoOhBsQsDKVmyOxJMXI,1172
|
|
2
|
+
sublimex/core.py,sha256=6AVhcEEhEpfZeRhgF8--4im0BSG_4lCICFilwUlijzs,9262
|
|
3
|
+
sublimex/models.py,sha256=nA8SUBDqM5ChXBYiND8RgGvR_eCitE3Tjch2Xh0Osmc,4155
|
|
4
|
+
sublimex/objectives.py,sha256=eqJJUTA57dIcXP22DeZNozapzb0xO7iiY8SOKoQx_mo,6159
|
|
5
|
+
sublimex/py.typed,sha256=IVB0agohk5jYgq7eRKHocR6LrSNCBZrlPuv5x-_BiLM,93
|
|
6
|
+
sublimex/transforms.py,sha256=tDH2xluUtC-LAlP1-KEZ3cO7eMGM2BTuRh_LtTpcS0Q,1284
|
|
7
|
+
sublimex/visualization.py,sha256=JKv7ij2agowG8e3nHuu6dKeRiMJ9_bG5oi8X4AxEQ2s,5094
|
|
8
|
+
sublimex-0.1.0.dist-info/licenses/LICENSE,sha256=rikCGiAwdvAZnZTcZD0QCqKOawQkigjw4yfcf0aiS5U,1068
|
|
9
|
+
sublimex-0.1.0.dist-info/METADATA,sha256=w0MncQMgLzknKgR1LngO5IITzvUpBN9DP45byGrp0OI,11164
|
|
10
|
+
sublimex-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
11
|
+
sublimex-0.1.0.dist-info/top_level.txt,sha256=paeo1dixjEC0yxT-C9QwOODcZlG9uFF7nWBEHh18NKU,9
|
|
12
|
+
sublimex-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 J.C. Wolber
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sublimex
|