wavetrainer 0.0.26__tar.gz → 0.0.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.26/wavetrainer.egg-info → wavetrainer-0.0.28}/PKG-INFO +1 -1
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/setup.py +1 -1
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/calibrator/calibrator_router.py +9 -4
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/calibrator/mapie_calibrator.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/calibrator/vennabers_calibrator.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/create.py +0 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/model/catboost_model.py +32 -12
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/model/model.py +5 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/model/model_router.py +15 -5
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/model/tabpfn_model.py +8 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/params.py +5 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/reducer/base_selector_reducer.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/reducer/combined_reducer.py +11 -10
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/reducer/correlation_reducer.py +12 -1
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/reducer/nonnumeric_reducer.py +4 -2
- wavetrainer-0.0.28/wavetrainer/reducer/smart_correlation_reducer.py +32 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/reducer/unseen_reducer.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/selector/selector.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/trainer.py +15 -19
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/weights/class_weights.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/weights/combined_weights.py +6 -4
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/weights/exponential_weights.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/weights/linear_weights.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/weights/noop_weights.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/weights/sigmoid_weights.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/weights/weights_router.py +5 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/windower/windower.py +4 -2
- {wavetrainer-0.0.26 → wavetrainer-0.0.28/wavetrainer.egg-info}/PKG-INFO +1 -1
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer.egg-info/SOURCES.txt +1 -1
- wavetrainer-0.0.26/wavetrainer/reducer/pca_reducer.py +0 -77
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/LICENSE +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/MANIFEST.in +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/README.md +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/requirements.txt +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/setup.cfg +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/tests/__init__.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/tests/trainer_test.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/model/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/model/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/model/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer.egg-info/requires.txt +0 -0
- {wavetrainer-0.0.26 → wavetrainer-0.0.28}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
|
|
23
23
|
|
24
24
|
setup(
|
25
25
|
name='wavetrainer',
|
26
|
-
version='0.0.26',
|
26
|
+
version='0.0.28',
|
27
27
|
description='A library for automatically finding the optimal model within feature and hyperparameter space.',
|
28
28
|
long_description=long_description,
|
29
29
|
long_description_content_type='text/markdown',
|
@@ -36,8 +36,13 @@ class CalibratorRouter(Calibrator):
|
|
36
36
|
def name(cls) -> str:
|
37
37
|
return "router"
|
38
38
|
|
39
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
40
|
-
|
39
|
+
def set_options(
|
40
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
41
|
+
) -> None:
|
42
|
+
calibrator = self._calibrator
|
43
|
+
if calibrator is None:
|
44
|
+
return
|
45
|
+
calibrator.set_options(trial, df)
|
41
46
|
|
42
47
|
def load(self, folder: str) -> None:
|
43
48
|
with open(
|
@@ -48,11 +53,11 @@ class CalibratorRouter(Calibrator):
|
|
48
53
|
calibrator.load(folder)
|
49
54
|
self._calibrator = calibrator
|
50
55
|
|
51
|
-
def save(self, folder: str) -> None:
|
56
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
52
57
|
calibrator = self._calibrator
|
53
58
|
if calibrator is None:
|
54
59
|
raise ValueError("calibrator is null.")
|
55
|
-
calibrator.save(folder)
|
60
|
+
calibrator.save(folder, trial)
|
56
61
|
with open(
|
57
62
|
os.path.join(folder, _CALIBRATOR_ROUTER_FILE), "w", encoding="utf8"
|
58
63
|
) as handle:
|
@@ -29,13 +29,15 @@ class MAPIECalibrator(Calibrator):
|
|
29
29
|
def name(cls) -> str:
|
30
30
|
return "mapie"
|
31
31
|
|
32
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
32
|
+
def set_options(
|
33
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
34
|
+
) -> None:
|
33
35
|
pass
|
34
36
|
|
35
37
|
def load(self, folder: str) -> None:
|
36
38
|
self._mapie = joblib.load(os.path.join(folder, _CALIBRATOR_FILENAME))
|
37
39
|
|
38
|
-
def save(self, folder: str) -> None:
|
40
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
39
41
|
joblib.dump(self._mapie, os.path.join(folder, _CALIBRATOR_FILENAME))
|
40
42
|
|
41
43
|
def fit(
|
@@ -27,13 +27,15 @@ class VennabersCalibrator(Calibrator):
|
|
27
27
|
def name(cls) -> str:
|
28
28
|
return "vennabers"
|
29
29
|
|
30
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
30
|
+
def set_options(
|
31
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
32
|
+
) -> None:
|
31
33
|
pass
|
32
34
|
|
33
35
|
def load(self, folder: str) -> None:
|
34
36
|
self._vennabers = joblib.load(os.path.join(folder, _CALIBRATOR_FILENAME))
|
35
37
|
|
36
|
-
def save(self, folder: str) -> None:
|
38
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
37
39
|
joblib.dump(self._vennabers, os.path.join(folder, _CALIBRATOR_FILENAME))
|
38
40
|
|
39
41
|
def fit(
|
@@ -15,7 +15,6 @@ def create(
|
|
15
15
|
dt_column: str | None = None,
|
16
16
|
max_train_timeout: datetime.timedelta | None = None,
|
17
17
|
cutoff_dt: datetime.datetime | None = None,
|
18
|
-
max_features: int | None = None,
|
19
18
|
) -> Trainer:
|
20
19
|
"""Create a trainer."""
|
21
20
|
return Trainer(
|
@@ -26,5 +25,4 @@ def create(
|
|
26
25
|
dt_column=dt_column,
|
27
26
|
max_train_timeout=max_train_timeout,
|
28
27
|
cutoff_dt=cutoff_dt,
|
29
|
-
max_features=max_features,
|
30
28
|
)
|
@@ -26,6 +26,7 @@ _L2_LEAF_REG_KEY = "l2_leaf_reg"
|
|
26
26
|
_BOOSTING_TYPE_KEY = "boosting_type"
|
27
27
|
_MODEL_TYPE_KEY = "model_type"
|
28
28
|
_EARLY_STOPPING_ROUNDS = "early_stopping_rounds"
|
29
|
+
_BEST_ITERATION_KEY = "best_iteration"
|
29
30
|
|
30
31
|
|
31
32
|
class CatboostModel(Model):
|
@@ -41,11 +42,16 @@ class CatboostModel(Model):
|
|
41
42
|
_boosting_type: None | str
|
42
43
|
_model_type: None | ModelType
|
43
44
|
_early_stopping_rounds: None | int
|
45
|
+
_best_iteration: None | int
|
44
46
|
|
45
47
|
@classmethod
|
46
48
|
def name(cls) -> str:
|
47
49
|
return "catboost"
|
48
50
|
|
51
|
+
@classmethod
|
52
|
+
def supports_x(cls, df: pd.DataFrame) -> bool:
|
53
|
+
return True
|
54
|
+
|
49
55
|
def __init__(self) -> None:
|
50
56
|
super().__init__()
|
51
57
|
self._catboost = None
|
@@ -56,6 +62,7 @@ class CatboostModel(Model):
|
|
56
62
|
self._boosting_type = None
|
57
63
|
self._model_type = None
|
58
64
|
self._early_stopping_rounds = None
|
65
|
+
self._best_iteration = None
|
59
66
|
|
60
67
|
@property
|
61
68
|
def estimator(self) -> Any:
|
@@ -83,7 +90,9 @@ class CatboostModel(Model):
|
|
83
90
|
"sample_weight": w,
|
84
91
|
}
|
85
92
|
|
86
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
93
|
+
def set_options(
|
94
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
95
|
+
) -> None:
|
87
96
|
self._iterations = trial.suggest_int(_ITERATIONS_KEY, 100, 10000)
|
88
97
|
self._learning_rate = trial.suggest_float(_LEARNING_RATE_KEY, 0.001, 0.3)
|
89
98
|
self._depth = trial.suggest_int(_DEPTH_KEY, 1, 10)
|
@@ -92,6 +101,7 @@ class CatboostModel(Model):
|
|
92
101
|
_BOOSTING_TYPE_KEY, ["Ordered", "Plain"]
|
93
102
|
)
|
94
103
|
self._early_stopping_rounds = trial.suggest_int(_EARLY_STOPPING_ROUNDS, 10, 500)
|
104
|
+
self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
|
95
105
|
|
96
106
|
def load(self, folder: str) -> None:
|
97
107
|
with open(
|
@@ -105,10 +115,11 @@ class CatboostModel(Model):
|
|
105
115
|
self._boosting_type = params[_BOOSTING_TYPE_KEY]
|
106
116
|
self._model_type = ModelType(params[_MODEL_TYPE_KEY])
|
107
117
|
self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS]
|
118
|
+
self._best_iteration = params.get(_BEST_ITERATION_KEY)
|
108
119
|
catboost = self._provide_catboost()
|
109
120
|
catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
|
110
121
|
|
111
|
-
def save(self, folder: str) -> None:
|
122
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
112
123
|
with open(
|
113
124
|
os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
|
114
125
|
) as handle:
|
@@ -121,11 +132,13 @@ class CatboostModel(Model):
|
|
121
132
|
_BOOSTING_TYPE_KEY: self._boosting_type,
|
122
133
|
_MODEL_TYPE_KEY: str(self._model_type),
|
123
134
|
_EARLY_STOPPING_ROUNDS: self._early_stopping_rounds,
|
135
|
+
_BEST_ITERATION_KEY: self._best_iteration,
|
124
136
|
},
|
125
137
|
handle,
|
126
138
|
)
|
127
139
|
catboost = self._provide_catboost()
|
128
140
|
catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
|
141
|
+
trial.user_attrs[_BEST_ITERATION_KEY] = self._best_iteration
|
129
142
|
|
130
143
|
def fit(
|
131
144
|
self,
|
@@ -137,8 +150,6 @@ class CatboostModel(Model):
|
|
137
150
|
) -> Self:
|
138
151
|
if y is None:
|
139
152
|
raise ValueError("y is null.")
|
140
|
-
if eval_x is None:
|
141
|
-
raise ValueError("eval_x is null.")
|
142
153
|
self._model_type = determine_model_type(y)
|
143
154
|
catboost = self._provide_catboost()
|
144
155
|
|
@@ -148,10 +159,14 @@ class CatboostModel(Model):
|
|
148
159
|
weight=w,
|
149
160
|
cat_features=df.select_dtypes(include="category").columns.tolist(),
|
150
161
|
)
|
151
|
-
eval_pool =
|
152
|
-
|
153
|
-
|
154
|
-
|
162
|
+
eval_pool = (
|
163
|
+
Pool(
|
164
|
+
eval_x,
|
165
|
+
label=eval_y,
|
166
|
+
cat_features=eval_x.select_dtypes(include="category").columns.tolist(),
|
167
|
+
)
|
168
|
+
if eval_x is not None
|
169
|
+
else None
|
155
170
|
)
|
156
171
|
catboost.fit(
|
157
172
|
train_pool,
|
@@ -162,6 +177,7 @@ class CatboostModel(Model):
|
|
162
177
|
)
|
163
178
|
importances = catboost.get_feature_importance(prettified=True)
|
164
179
|
logging.info("Importances:\n%s", importances)
|
180
|
+
self._best_iteration = catboost.get_best_iteration()
|
165
181
|
return self
|
166
182
|
|
167
183
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
@@ -186,10 +202,14 @@ class CatboostModel(Model):
|
|
186
202
|
def _provide_catboost(self) -> CatBoost:
|
187
203
|
catboost = self._catboost
|
188
204
|
if catboost is None:
|
205
|
+
best_iteration = self._best_iteration
|
206
|
+
iterations = (
|
207
|
+
best_iteration if best_iteration is not None else self._iterations
|
208
|
+
)
|
189
209
|
match self._model_type:
|
190
210
|
case ModelType.BINARY:
|
191
211
|
catboost = CatBoostClassifierWrapper(
|
192
|
-
iterations=self._iterations,
|
212
|
+
iterations=iterations,
|
193
213
|
learning_rate=self._learning_rate,
|
194
214
|
depth=self._depth,
|
195
215
|
l2_leaf_reg=self._l2_leaf_reg,
|
@@ -201,7 +221,7 @@ class CatboostModel(Model):
|
|
201
221
|
)
|
202
222
|
case ModelType.REGRESSION:
|
203
223
|
catboost = CatBoostRegressorWrapper(
|
204
|
-
iterations=self._iterations,
|
224
|
+
iterations=iterations,
|
205
225
|
learning_rate=self._learning_rate,
|
206
226
|
depth=self._depth,
|
207
227
|
l2_leaf_reg=self._l2_leaf_reg,
|
@@ -213,7 +233,7 @@ class CatboostModel(Model):
|
|
213
233
|
)
|
214
234
|
case ModelType.BINNED_BINARY:
|
215
235
|
catboost = CatBoostClassifierWrapper(
|
216
|
-
iterations=self._iterations,
|
236
|
+
iterations=iterations,
|
217
237
|
learning_rate=self._learning_rate,
|
218
238
|
depth=self._depth,
|
219
239
|
l2_leaf_reg=self._l2_leaf_reg,
|
@@ -225,7 +245,7 @@ class CatboostModel(Model):
|
|
225
245
|
)
|
226
246
|
case ModelType.MULTI_CLASSIFICATION:
|
227
247
|
catboost = CatBoostClassifierWrapper(
|
228
|
-
iterations=self._iterations,
|
248
|
+
iterations=iterations,
|
229
249
|
learning_rate=self._learning_rate,
|
230
250
|
depth=self._depth,
|
231
251
|
l2_leaf_reg=self._l2_leaf_reg,
|
@@ -20,6 +20,11 @@ class Model(Params, Fit):
|
|
20
20
|
"""The name of the model."""
|
21
21
|
raise NotImplementedError("name not implemented in parent class.")
|
22
22
|
|
23
|
+
@classmethod
|
24
|
+
def supports_x(cls, df: pd.DataFrame) -> bool:
|
25
|
+
"""Whether the model supports the X values."""
|
26
|
+
raise NotImplementedError("supports_x not implemented in parent class.")
|
27
|
+
|
23
28
|
@property
|
24
29
|
def estimator(self) -> Any:
|
25
30
|
"""The estimator backing the model."""
|
@@ -34,6 +34,10 @@ class ModelRouter(Model):
|
|
34
34
|
def name(cls) -> str:
|
35
35
|
return "router"
|
36
36
|
|
37
|
+
@classmethod
|
38
|
+
def supports_x(cls, df: pd.DataFrame) -> bool:
|
39
|
+
return True
|
40
|
+
|
37
41
|
@property
|
38
42
|
def estimator(self) -> Any:
|
39
43
|
model = self._model
|
@@ -61,9 +65,15 @@ class ModelRouter(Model):
|
|
61
65
|
raise ValueError("model is null")
|
62
66
|
return model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y, w=w)
|
63
67
|
|
64
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
65
|
-
|
66
|
-
|
68
|
+
def set_options(
|
69
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
70
|
+
) -> None:
|
71
|
+
model = _MODELS[
|
72
|
+
trial.suggest_categorical(
|
73
|
+
"model", [k for k, v in _MODELS.items() if v.supports_x(df)]
|
74
|
+
)
|
75
|
+
]()
|
76
|
+
model.set_options(trial, df)
|
67
77
|
self._model = model
|
68
78
|
|
69
79
|
def load(self, folder: str) -> None:
|
@@ -73,11 +83,11 @@ class ModelRouter(Model):
|
|
73
83
|
model.load(folder)
|
74
84
|
self._model = model
|
75
85
|
|
76
|
-
def save(self, folder: str) -> None:
|
86
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
77
87
|
model = self._model
|
78
88
|
if model is None:
|
79
89
|
raise ValueError("model is null")
|
80
|
-
model.save(folder)
|
90
|
+
model.save(folder, trial)
|
81
91
|
with open(
|
82
92
|
os.path.join(folder, _MODEL_ROUTER_FILE), "w", encoding="utf8"
|
83
93
|
) as handle:
|
@@ -31,6 +31,10 @@ class TabPFNModel(Model):
|
|
31
31
|
def name(cls) -> str:
|
32
32
|
return "tabpfn"
|
33
33
|
|
34
|
+
@classmethod
|
35
|
+
def supports_x(cls, df: pd.DataFrame) -> bool:
|
36
|
+
return len(df.columns.values) < 500
|
37
|
+
|
34
38
|
def __init__(self) -> None:
|
35
39
|
super().__init__()
|
36
40
|
self._tabpfn = None
|
@@ -57,7 +61,9 @@ class TabPFNModel(Model):
|
|
57
61
|
self._model_type = determine_model_type(y)
|
58
62
|
return {}
|
59
63
|
|
60
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
64
|
+
def set_options(
|
65
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
66
|
+
) -> None:
|
61
67
|
pass
|
62
68
|
|
63
69
|
def load(self, folder: str) -> None:
|
@@ -69,7 +75,7 @@ class TabPFNModel(Model):
|
|
69
75
|
params = json.load(handle)
|
70
76
|
self._model_type = ModelType(params[_MODEL_TYPE_KEY])
|
71
77
|
|
72
|
-
def save(self, folder: str) -> None:
|
78
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
73
79
|
with open(os.path.join(folder, _MODEL_FILENAME), "wb") as f:
|
74
80
|
pickle.dump(self._tabpfn, f)
|
75
81
|
with open(
|
@@ -1,12 +1,15 @@
|
|
1
1
|
"""A class for loading/saving parameters."""
|
2
2
|
|
3
3
|
import optuna
|
4
|
+
import pandas as pd
|
4
5
|
|
5
6
|
|
6
7
|
class Params:
|
7
8
|
"""The params prototype class."""
|
8
9
|
|
9
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
10
|
+
def set_options(
|
11
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
12
|
+
) -> None:
|
10
13
|
"""Set the options used in the object."""
|
11
14
|
raise NotImplementedError("set_options not implemented in parent class.")
|
12
15
|
|
@@ -14,6 +17,6 @@ class Params:
|
|
14
17
|
"""Loads the objects from a folder."""
|
15
18
|
raise NotImplementedError("load not implemented in parent class.")
|
16
19
|
|
17
|
-
def save(self, folder: str) -> None:
|
20
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
18
21
|
"""Saves the objects into a folder."""
|
19
22
|
raise NotImplementedError("save not implemented in parent class.")
|
@@ -32,14 +32,16 @@ class BaseSelectorReducer(Reducer):
|
|
32
32
|
"""Whether the class should raise its exception if it encounters it."""
|
33
33
|
return True
|
34
34
|
|
35
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
35
|
+
def set_options(
|
36
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
37
|
+
) -> None:
|
36
38
|
pass
|
37
39
|
|
38
40
|
def load(self, folder: str) -> None:
|
39
41
|
file_path = os.path.join(folder, self._file_name)
|
40
42
|
self._base_selector = joblib.load(file_path)
|
41
43
|
|
42
|
-
def save(self, folder: str) -> None:
|
44
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
43
45
|
file_path = os.path.join(folder, self._file_name)
|
44
46
|
joblib.dump(self._base_selector, file_path)
|
45
47
|
|
@@ -12,8 +12,8 @@ from .constant_reducer import ConstantReducer
|
|
12
12
|
from .correlation_reducer import CorrelationReducer
|
13
13
|
from .duplicate_reducer import DuplicateReducer
|
14
14
|
from .nonnumeric_reducer import NonNumericReducer
|
15
|
-
from .pca_reducer import PCAReducer
|
16
15
|
from .reducer import Reducer
|
16
|
+
from .smart_correlation_reducer import SmartCorrelationReducer
|
17
17
|
from .unseen_reducer import UnseenReducer
|
18
18
|
|
19
19
|
_COMBINED_REDUCER_FILE = "combined_reducer.json"
|
@@ -25,25 +25,26 @@ class CombinedReducer(Reducer):
|
|
25
25
|
|
26
26
|
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
27
27
|
|
28
|
-
def __init__(self, max_features: int | None):
|
28
|
+
def __init__(self):
|
29
29
|
super().__init__()
|
30
|
-
self._max_features = max_features
|
31
30
|
self._reducers = [
|
32
31
|
UnseenReducer(),
|
33
32
|
NonNumericReducer(),
|
34
33
|
ConstantReducer(),
|
35
34
|
DuplicateReducer(),
|
36
35
|
CorrelationReducer(),
|
37
|
-
|
36
|
+
SmartCorrelationReducer(),
|
38
37
|
]
|
39
38
|
|
40
39
|
@classmethod
|
41
40
|
def name(cls) -> str:
|
42
41
|
return "combined"
|
43
42
|
|
44
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
43
|
+
def set_options(
|
44
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
45
|
+
) -> None:
|
45
46
|
for reducer in self._reducers:
|
46
|
-
reducer.set_options(trial)
|
47
|
+
reducer.set_options(trial, df)
|
47
48
|
|
48
49
|
def load(self, folder: str) -> None:
|
49
50
|
self._reducers = []
|
@@ -62,12 +63,12 @@ class CombinedReducer(Reducer):
|
|
62
63
|
self._reducers.append(NonNumericReducer())
|
63
64
|
elif reducer_name == UnseenReducer.name():
|
64
65
|
self._reducers.append(UnseenReducer())
|
65
|
-
elif reducer_name == PCAReducer.name():
|
66
|
-
self._reducers.append(PCAReducer(self._max_features))
|
66
|
+
elif reducer_name == SmartCorrelationReducer.name():
|
67
|
+
self._reducers.append(SmartCorrelationReducer())
|
67
68
|
for reducer in self._reducers:
|
68
69
|
reducer.load(folder)
|
69
70
|
|
70
|
-
def save(self, folder: str) -> None:
|
71
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
71
72
|
with open(
|
72
73
|
os.path.join(folder, _COMBINED_REDUCER_FILE), "w", encoding="utf8"
|
73
74
|
) as handle:
|
@@ -78,7 +79,7 @@ class CombinedReducer(Reducer):
|
|
78
79
|
handle,
|
79
80
|
)
|
80
81
|
for reducer in self._reducers:
|
81
|
-
reducer.save(folder)
|
82
|
+
reducer.save(folder, trial)
|
82
83
|
|
83
84
|
def fit(
|
84
85
|
self,
|
@@ -1,18 +1,22 @@
|
|
1
1
|
"""A reducer that removes correlation features."""
|
2
2
|
|
3
|
+
import optuna
|
4
|
+
import pandas as pd
|
3
5
|
from feature_engine.selection import DropCorrelatedFeatures
|
4
6
|
|
5
7
|
from .base_selector_reducer import BaseSelectorReducer
|
6
8
|
|
7
9
|
_CORRELATION_REDUCER_FILENAME = "correlation_reducer.joblib"
|
10
|
+
_CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
|
8
11
|
|
9
12
|
|
10
13
|
class CorrelationReducer(BaseSelectorReducer):
|
11
14
|
"""A class that removes correlated values from a dataset."""
|
12
15
|
|
13
16
|
def __init__(self) -> None:
|
17
|
+
self._correlation_selector = DropCorrelatedFeatures(missing_values="ignore")
|
14
18
|
super().__init__(
|
15
|
-
|
19
|
+
self._correlation_selector,
|
16
20
|
_CORRELATION_REDUCER_FILENAME,
|
17
21
|
)
|
18
22
|
|
@@ -23,3 +27,10 @@ class CorrelationReducer(BaseSelectorReducer):
|
|
23
27
|
@classmethod
|
24
28
|
def should_raise(cls) -> bool:
|
25
29
|
return False
|
30
|
+
|
31
|
+
def set_options(
|
32
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
33
|
+
) -> None:
|
34
|
+
self._correlation_selector.threshold = trial.suggest_float(
|
35
|
+
_CORRELATION_REDUCER_THRESHOLD, 0.1, 0.9
|
36
|
+
)
|
@@ -17,13 +17,15 @@ class NonNumericReducer(Reducer):
|
|
17
17
|
def name(cls) -> str:
|
18
18
|
return "nonnumeric"
|
19
19
|
|
20
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
20
|
+
def set_options(
|
21
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
22
|
+
) -> None:
|
21
23
|
pass
|
22
24
|
|
23
25
|
def load(self, folder: str) -> None:
|
24
26
|
pass
|
25
27
|
|
26
|
-
def save(self, folder: str) -> None:
|
28
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
27
29
|
pass
|
28
30
|
|
29
31
|
def fit(
|
@@ -0,0 +1,32 @@
|
|
1
|
+
"""A reducer that removes correlation features via further heuristics."""
|
2
|
+
|
3
|
+
import optuna
|
4
|
+
import pandas as pd
|
5
|
+
from feature_engine.selection import SmartCorrelatedSelection
|
6
|
+
|
7
|
+
from .base_selector_reducer import BaseSelectorReducer
|
8
|
+
|
9
|
+
_SMART_CORRELATION_REDUCER_FILENAME = "smart_correlation_reducer.joblib"
|
10
|
+
_SMART_CORRELATION_REDUCER_THRESHOLD = "smart_correlation_reducer_threshold"
|
11
|
+
|
12
|
+
|
13
|
+
class SmartCorrelationReducer(BaseSelectorReducer):
|
14
|
+
"""A class that removes smart correlated values from a dataset."""
|
15
|
+
|
16
|
+
def __init__(self) -> None:
|
17
|
+
self._correlation_selector = SmartCorrelatedSelection(missing_values="ignore")
|
18
|
+
super().__init__(
|
19
|
+
self._correlation_selector,
|
20
|
+
_SMART_CORRELATION_REDUCER_FILENAME,
|
21
|
+
)
|
22
|
+
|
23
|
+
@classmethod
|
24
|
+
def name(cls) -> str:
|
25
|
+
return "smart_correlation"
|
26
|
+
|
27
|
+
def set_options(
|
28
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
29
|
+
) -> None:
|
30
|
+
self._correlation_selector.threshold = trial.suggest_float(
|
31
|
+
_SMART_CORRELATION_REDUCER_THRESHOLD, 0.1, 0.9
|
32
|
+
)
|
@@ -25,7 +25,9 @@ class UnseenReducer(Reducer):
|
|
25
25
|
def name(cls) -> str:
|
26
26
|
return "unseen"
|
27
27
|
|
28
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
28
|
+
def set_options(
|
29
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
30
|
+
) -> None:
|
29
31
|
pass
|
30
32
|
|
31
33
|
def load(self, folder: str) -> None:
|
@@ -34,7 +36,7 @@ class UnseenReducer(Reducer):
|
|
34
36
|
) as handle:
|
35
37
|
self._seen_features = json.load(handle)
|
36
38
|
|
37
|
-
def save(self, folder: str) -> None:
|
39
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
38
40
|
with open(
|
39
41
|
os.path.join(folder, _UNSEEN_REDUCER_FILE), "w", encoding="utf8"
|
40
42
|
) as handle:
|
@@ -31,14 +31,16 @@ class Selector(Params, Fit):
|
|
31
31
|
self._steps = 0
|
32
32
|
self._selector = None
|
33
33
|
|
34
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
34
|
+
def set_options(
|
35
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
36
|
+
) -> None:
|
35
37
|
self._feature_ratio = trial.suggest_float("feature_ratio", 0.0, 1.0)
|
36
38
|
self._steps = trial.suggest_int("steps", 1, 10)
|
37
39
|
|
38
40
|
def load(self, folder: str) -> None:
|
39
41
|
self._selector = joblib.load(os.path.join(folder, _SELECTOR_FILE))
|
40
42
|
|
41
|
-
def save(self, folder: str) -> None:
|
43
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
42
44
|
joblib.dump(self._selector, os.path.join(folder, _SELECTOR_FILE))
|
43
45
|
|
44
46
|
def fit(
|
@@ -36,7 +36,6 @@ _TEST_SIZE_KEY = "test_size"
|
|
36
36
|
_VALIDATION_SIZE_KEY = "validation_size"
|
37
37
|
_IDX_USR_ATTR_KEY = "idx"
|
38
38
|
_DT_COLUMN_KEY = "dt_column"
|
39
|
-
_MAX_FEATURES_KEY = "max_features"
|
40
39
|
|
41
40
|
|
42
41
|
class Trainer(Fit):
|
@@ -54,7 +53,6 @@ class Trainer(Fit):
|
|
54
53
|
dt_column: str | None = None,
|
55
54
|
max_train_timeout: datetime.timedelta | None = None,
|
56
55
|
cutoff_dt: datetime.datetime | None = None,
|
57
|
-
max_features: int | None = None,
|
58
56
|
):
|
59
57
|
tqdm.tqdm.pandas()
|
60
58
|
|
@@ -105,7 +103,6 @@ class Trainer(Fit):
|
|
105
103
|
)
|
106
104
|
if dt_column is None:
|
107
105
|
dt_column = params[_DT_COLUMN_KEY]
|
108
|
-
max_features = params.get(_MAX_FEATURES_KEY)
|
109
106
|
else:
|
110
107
|
with open(params_file, "w", encoding="utf8") as handle:
|
111
108
|
validation_size_value = None
|
@@ -136,7 +133,6 @@ class Trainer(Fit):
|
|
136
133
|
_TEST_SIZE_KEY: test_size_value,
|
137
134
|
_VALIDATION_SIZE_KEY: validation_size_value,
|
138
135
|
_DT_COLUMN_KEY: dt_column,
|
139
|
-
_MAX_FEATURES_KEY: max_features,
|
140
136
|
},
|
141
137
|
handle,
|
142
138
|
)
|
@@ -147,7 +143,6 @@ class Trainer(Fit):
|
|
147
143
|
self._dt_column = dt_column
|
148
144
|
self._max_train_timeout = max_train_timeout
|
149
145
|
self._cutoff_dt = cutoff_dt
|
150
|
-
self._max_features = max_features
|
151
146
|
|
152
147
|
def _provide_study(self, column: str) -> optuna.Study:
|
153
148
|
storage_name = f"sqlite:///{self._folder}/{column}/{_STUDYDB_FILENAME}"
|
@@ -213,7 +208,7 @@ class Trainer(Fit):
|
|
213
208
|
try:
|
214
209
|
# Window the data
|
215
210
|
windower = Windower(self._dt_column)
|
216
|
-
windower.set_options(trial)
|
211
|
+
windower.set_options(trial, x)
|
217
212
|
x_train = windower.fit_transform(x_train)
|
218
213
|
y_train = y_train[-len(x_train) :]
|
219
214
|
if len(y_train.unique()) <= 1:
|
@@ -221,25 +216,25 @@ class Trainer(Fit):
|
|
221
216
|
return -1.0
|
222
217
|
|
223
218
|
# Perform common reductions
|
224
|
-
reducer = CombinedReducer(self._max_features)
|
225
|
-
reducer.set_options(trial)
|
219
|
+
reducer = CombinedReducer()
|
220
|
+
reducer.set_options(trial, x)
|
226
221
|
x_train = reducer.fit_transform(x_train)
|
227
222
|
x_test = reducer.transform(x_test)
|
228
223
|
|
229
224
|
# Calculate the row weights
|
230
225
|
weights = CombinedWeights()
|
231
|
-
weights.set_options(trial)
|
226
|
+
weights.set_options(trial, x)
|
232
227
|
w = weights.fit(x_train, y=y_train).transform(y_train.to_frame())[
|
233
228
|
WEIGHTS_COLUMN
|
234
229
|
]
|
235
230
|
|
236
231
|
# Create model
|
237
232
|
model = ModelRouter()
|
238
|
-
model.set_options(trial)
|
233
|
+
model.set_options(trial, x)
|
239
234
|
|
240
235
|
# Train
|
241
236
|
selector = Selector(model)
|
242
|
-
selector.set_options(trial)
|
237
|
+
selector.set_options(trial, x)
|
243
238
|
selector.fit(x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test)
|
244
239
|
x_train = selector.transform(x_train)
|
245
240
|
x_test = selector.transform(x_test)
|
@@ -249,7 +244,7 @@ class Trainer(Fit):
|
|
249
244
|
|
250
245
|
# Calibrate
|
251
246
|
calibrator = CalibratorRouter(model)
|
252
|
-
calibrator.set_options(trial)
|
247
|
+
calibrator.set_options(trial, x)
|
253
248
|
calibrator.fit(x_pred, y=y_train)
|
254
249
|
|
255
250
|
if save:
|
@@ -258,12 +253,12 @@ class Trainer(Fit):
|
|
258
253
|
)
|
259
254
|
if not os.path.exists(folder):
|
260
255
|
os.mkdir(folder)
|
261
|
-
windower.save(folder)
|
262
|
-
reducer.save(folder)
|
263
|
-
weights.save(folder)
|
264
|
-
model.save(folder)
|
265
|
-
selector.save(folder)
|
266
|
-
calibrator.save(folder)
|
256
|
+
windower.save(folder, trial)
|
257
|
+
reducer.save(folder, trial)
|
258
|
+
weights.save(folder, trial)
|
259
|
+
model.save(folder, trial)
|
260
|
+
selector.save(folder, trial)
|
261
|
+
calibrator.save(folder, trial)
|
267
262
|
|
268
263
|
y_pred = model.transform(x_test)
|
269
264
|
y_pred = calibrator.transform(y_pred)
|
@@ -380,6 +375,7 @@ class Trainer(Fit):
|
|
380
375
|
|
381
376
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
382
377
|
"""Predict the expected values of the data."""
|
378
|
+
tqdm.tqdm.pandas(desc="Inferring...")
|
383
379
|
input_df = df.copy()
|
384
380
|
df = df.reindex(sorted(df.columns), axis=1)
|
385
381
|
feature_columns = df.columns.values
|
@@ -430,7 +426,7 @@ class Trainer(Fit):
|
|
430
426
|
date_str = dates[-1].isoformat()
|
431
427
|
folder = os.path.join(column_path, date_str)
|
432
428
|
|
433
|
-
reducer = CombinedReducer(self._max_features)
|
429
|
+
reducer = CombinedReducer()
|
434
430
|
reducer.load(folder)
|
435
431
|
|
436
432
|
model = ModelRouter()
|
@@ -27,13 +27,15 @@ class ClassWeights(Weights):
|
|
27
27
|
"""The name of the weight class."""
|
28
28
|
return "class"
|
29
29
|
|
30
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
30
|
+
def set_options(
|
31
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
32
|
+
) -> None:
|
31
33
|
pass
|
32
34
|
|
33
35
|
def load(self, folder: str) -> None:
|
34
36
|
pass
|
35
37
|
|
36
|
-
def save(self, folder: str) -> None:
|
38
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
37
39
|
pass
|
38
40
|
|
39
41
|
def fit(
|
@@ -23,17 +23,19 @@ class CombinedWeights(Weights):
|
|
23
23
|
def name(cls) -> str:
|
24
24
|
return "combined"
|
25
25
|
|
26
|
-
def set_options(
|
26
|
+
def set_options(
|
27
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
28
|
+
) -> None:
|
27
29
|
for weights in self._weights:
|
28
|
-
weights.set_options(trial)
|
30
|
+
weights.set_options(trial, df)
|
29
31
|
|
30
32
|
def load(self, folder: str) -> None:
|
31
33
|
for weights in self._weights:
|
32
34
|
weights.load(folder)
|
33
35
|
|
34
|
-
def save(self, folder: str) -> None:
|
36
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
35
37
|
for weights in self._weights:
|
36
|
-
weights.save(folder)
|
38
|
+
weights.save(folder, trial)
|
37
39
|
|
38
40
|
def fit(
|
39
41
|
self,
|
@@ -19,13 +19,15 @@ class ExponentialWeights(Weights):
|
|
19
19
|
"""The name of the weight class."""
|
20
20
|
return "exponential"
|
21
21
|
|
22
|
-
def set_options(
|
22
|
+
def set_options(
|
23
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
24
|
+
) -> None:
|
23
25
|
pass
|
24
26
|
|
25
27
|
def load(self, folder: str) -> None:
|
26
28
|
pass
|
27
29
|
|
28
|
-
def save(self, folder: str) -> None:
|
30
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
29
31
|
pass
|
30
32
|
|
31
33
|
def fit(
|
@@ -19,13 +19,15 @@ class LinearWeights(Weights):
|
|
19
19
|
"""The name of the weight class."""
|
20
20
|
return "linear"
|
21
21
|
|
22
|
-
def set_options(
|
22
|
+
def set_options(
|
23
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
24
|
+
) -> None:
|
23
25
|
pass
|
24
26
|
|
25
27
|
def load(self, folder: str) -> None:
|
26
28
|
pass
|
27
29
|
|
28
|
-
def save(self, folder: str) -> None:
|
30
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
29
31
|
pass
|
30
32
|
|
31
33
|
def fit(
|
@@ -19,13 +19,15 @@ class NoopWeights(Weights):
|
|
19
19
|
"""The name of the weight class."""
|
20
20
|
return "noop"
|
21
21
|
|
22
|
-
def set_options(
|
22
|
+
def set_options(
|
23
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
24
|
+
) -> None:
|
23
25
|
pass
|
24
26
|
|
25
27
|
def load(self, folder: str) -> None:
|
26
28
|
pass
|
27
29
|
|
28
|
-
def save(self, folder: str) -> None:
|
30
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
29
31
|
pass
|
30
32
|
|
31
33
|
def fit(
|
@@ -20,13 +20,15 @@ class SigmoidWeights(Weights):
|
|
20
20
|
"""The name of the weight class."""
|
21
21
|
return "sigmoid"
|
22
22
|
|
23
|
-
def set_options(
|
23
|
+
def set_options(
|
24
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
25
|
+
) -> None:
|
24
26
|
pass
|
25
27
|
|
26
28
|
def load(self, folder: str) -> None:
|
27
29
|
pass
|
28
30
|
|
29
|
-
def save(self, folder: str) -> None:
|
31
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
30
32
|
pass
|
31
33
|
|
32
34
|
def fit(
|
@@ -38,7 +38,9 @@ class WeightsRouter(Weights):
|
|
38
38
|
def name(cls) -> str:
|
39
39
|
return "router"
|
40
40
|
|
41
|
-
def set_options(
|
41
|
+
def set_options(
|
42
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
43
|
+
) -> None:
|
42
44
|
self._weights = _WEIGHTS[
|
43
45
|
trial.suggest_categorical("weights", list(_WEIGHTS.keys()))
|
44
46
|
]()
|
@@ -54,10 +56,11 @@ class WeightsRouter(Weights):
|
|
54
56
|
weights = _WEIGHTS[params[_WEIGHTS_KEY]]()
|
55
57
|
self._weights = weights
|
56
58
|
|
57
|
-
def save(self, folder: str) -> None:
|
59
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
58
60
|
weights = self._weights
|
59
61
|
if weights is None:
|
60
62
|
raise ValueError("weights is null")
|
63
|
+
weights.save(folder, trial)
|
61
64
|
with open(
|
62
65
|
os.path.join(folder, _WEIGHTS_ROUTER_FILE), "w", encoding="utf8"
|
63
66
|
) as handle:
|
@@ -28,7 +28,9 @@ class Windower(Params, Fit):
|
|
28
28
|
self._lookback_ratio = None
|
29
29
|
self._dt_column = dt_column
|
30
30
|
|
31
|
-
def set_options(
|
31
|
+
def set_options(
|
32
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
33
|
+
) -> None:
|
32
34
|
self._lookback_ratio = trial.suggest_float("lookback", 0.1, 1.0)
|
33
35
|
|
34
36
|
def load(self, folder: str) -> None:
|
@@ -36,7 +38,7 @@ class Windower(Params, Fit):
|
|
36
38
|
params = json.load(handle)
|
37
39
|
self._lookback = params[_LOOKBACK_KEY]
|
38
40
|
|
39
|
-
def save(self, folder: str) -> None:
|
41
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
40
42
|
with open(os.path.join(folder, _WINDOWER_FILE), "w", encoding="utf8") as handle:
|
41
43
|
json.dump(
|
42
44
|
{
|
@@ -40,8 +40,8 @@ wavetrainer/reducer/constant_reducer.py
|
|
40
40
|
wavetrainer/reducer/correlation_reducer.py
|
41
41
|
wavetrainer/reducer/duplicate_reducer.py
|
42
42
|
wavetrainer/reducer/nonnumeric_reducer.py
|
43
|
-
wavetrainer/reducer/pca_reducer.py
|
44
43
|
wavetrainer/reducer/reducer.py
|
44
|
+
wavetrainer/reducer/smart_correlation_reducer.py
|
45
45
|
wavetrainer/reducer/unseen_reducer.py
|
46
46
|
wavetrainer/selector/__init__.py
|
47
47
|
wavetrainer/selector/selector.py
|
@@ -1,77 +0,0 @@
|
|
1
|
-
"""A reducer that removes low variance columns."""
|
2
|
-
|
3
|
-
import os
|
4
|
-
from typing import Self
|
5
|
-
|
6
|
-
import joblib # type: ignore
|
7
|
-
import optuna
|
8
|
-
import pandas as pd
|
9
|
-
from sklearn.decomposition import PCA # type: ignore
|
10
|
-
from sklearn.preprocessing import StandardScaler # type: ignore
|
11
|
-
|
12
|
-
from .reducer import Reducer
|
13
|
-
|
14
|
-
_PCA_FILE = "pca.joblib"
|
15
|
-
_PCA_SCALER_FILE = "pca_scaler.joblib"
|
16
|
-
|
17
|
-
|
18
|
-
class PCAReducer(Reducer):
|
19
|
-
"""A class that removes low variance columns from a dataframe."""
|
20
|
-
|
21
|
-
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
22
|
-
|
23
|
-
def __init__(self, max_features: int | None):
|
24
|
-
super().__init__()
|
25
|
-
self._max_features = max_features
|
26
|
-
if max_features is not None:
|
27
|
-
self._scaler = StandardScaler()
|
28
|
-
self._pca = PCA(n_components=max_features)
|
29
|
-
else:
|
30
|
-
self._scaler = None
|
31
|
-
self._pca = None
|
32
|
-
|
33
|
-
@classmethod
|
34
|
-
def name(cls) -> str:
|
35
|
-
return "pca"
|
36
|
-
|
37
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
38
|
-
pass
|
39
|
-
|
40
|
-
def load(self, folder: str) -> None:
|
41
|
-
pca_scaler_file = os.path.join(folder, _PCA_SCALER_FILE)
|
42
|
-
pca_file = os.path.join(folder, _PCA_FILE)
|
43
|
-
if os.path.exists(pca_scaler_file):
|
44
|
-
self._scaler = joblib.load(pca_scaler_file)
|
45
|
-
if os.path.exists(pca_file):
|
46
|
-
self._pca = joblib.load(pca_file)
|
47
|
-
|
48
|
-
def save(self, folder: str) -> None:
|
49
|
-
if self._scaler is not None:
|
50
|
-
joblib.dump(self._scaler, os.path.join(folder, _PCA_SCALER_FILE))
|
51
|
-
if self._pca is not None:
|
52
|
-
joblib.dump(self._pca, os.path.join(folder, _PCA_FILE))
|
53
|
-
|
54
|
-
def fit(
|
55
|
-
self,
|
56
|
-
df: pd.DataFrame,
|
57
|
-
y: pd.Series | pd.DataFrame | None = None,
|
58
|
-
w: pd.Series | None = None,
|
59
|
-
eval_x: pd.DataFrame | None = None,
|
60
|
-
eval_y: pd.Series | pd.DataFrame | None = None,
|
61
|
-
) -> Self:
|
62
|
-
pca = self._pca
|
63
|
-
scaler = self._scaler
|
64
|
-
if pca is None or scaler is None:
|
65
|
-
return self
|
66
|
-
if len(df.columns.values) < pca.n_components: # type: ignore
|
67
|
-
return self
|
68
|
-
x_scaled = scaler.fit_transform(df)
|
69
|
-
pca.fit(x_scaled)
|
70
|
-
return self
|
71
|
-
|
72
|
-
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
73
|
-
if self._pca is None:
|
74
|
-
return df
|
75
|
-
if len(df.columns.values) < self._pca.n_components: # type: ignore
|
76
|
-
return df
|
77
|
-
return self._pca.transform(df)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|