wavetrainer 0.0.49__tar.gz → 0.0.51__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.49/wavetrainer.egg-info → wavetrainer-0.0.51}/PKG-INFO +3 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/README.md +1 -0
- wavetrainer-0.0.49/wavetrainer.egg-info/requires.txt → wavetrainer-0.0.51/requirements.txt +1 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/setup.py +1 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/catboost/catboost_model.py +59 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/model_router.py +52 -2
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/xgboost/xgboost_model.py +73 -1
- wavetrainer-0.0.51/wavetrainer/reducer/correlation_reducer.py +112 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51/wavetrainer.egg-info}/PKG-INFO +3 -1
- wavetrainer-0.0.49/requirements.txt → wavetrainer-0.0.51/wavetrainer.egg-info/requires.txt +2 -1
- wavetrainer-0.0.49/wavetrainer/reducer/correlation_reducer.py +0 -52
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/LICENSE +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/MANIFEST.in +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/setup.cfg +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/tests/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/tests/trainer_test.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/catboost/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/model.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/tabpfn/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/xgboost/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/xgboost/early_stopper.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/combined_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/pca_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/selector/selector.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/trainer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer.egg-info/SOURCES.txt +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.49
|
3
|
+
Version: 0.0.51
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -26,6 +26,7 @@ Requires-Dist: torch>=2.6.0
|
|
26
26
|
Requires-Dist: tabpfn>=2.0.6
|
27
27
|
Requires-Dist: pytest-is-running>=1.5.1
|
28
28
|
Requires-Dist: xgboost>=3.0.0
|
29
|
+
Requires-Dist: jax>=0.6.1
|
29
30
|
|
30
31
|
# wavetrainer
|
31
32
|
|
@@ -58,6 +59,7 @@ Python 3.11.6:
|
|
58
59
|
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
59
60
|
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
60
61
|
- [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
|
62
|
+
- [jax](https://github.com/jax-ml/jax)
|
61
63
|
|
62
64
|
## Raison D'être :thought_balloon:
|
63
65
|
|
@@ -29,6 +29,7 @@ Python 3.11.6:
|
|
29
29
|
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
30
30
|
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
31
31
|
- [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
|
32
|
+
- [jax](https://github.com/jax-ml/jax)
|
32
33
|
|
33
34
|
## Raison D'être :thought_balloon:
|
34
35
|
|
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
|
|
23
23
|
|
24
24
|
setup(
|
25
25
|
name='wavetrainer',
|
26
|
-
version='0.0.49',
|
26
|
+
version='0.0.51',
|
27
27
|
description='A library for automatically finding the optimal model within feature and hyperparameter space.',
|
28
28
|
long_description=long_description,
|
29
29
|
long_description_content_type='text/markdown',
|
@@ -18,6 +18,7 @@ from .catboost_regressor_wrap import CatBoostRegressorWrapper
|
|
18
18
|
|
19
19
|
_MODEL_FILENAME = "model.cbm"
|
20
20
|
_MODEL_PARAMS_FILENAME = "model_params.json"
|
21
|
+
_MODEL_CATEGORICAL_FEATURES_FILENAME = "catboost_categorical_features.json"
|
21
22
|
_ITERATIONS_KEY = "iterations"
|
22
23
|
_LEARNING_RATE_KEY = "learning_rate"
|
23
24
|
_DEPTH_KEY = "depth"
|
@@ -26,6 +27,11 @@ _BOOSTING_TYPE_KEY = "boosting_type"
|
|
26
27
|
_MODEL_TYPE_KEY = "model_type"
|
27
28
|
_EARLY_STOPPING_ROUNDS = "early_stopping_rounds"
|
28
29
|
_BEST_ITERATION_KEY = "best_iteration"
|
30
|
+
_LOSS_FUNCTION_KEY = "loss_function"
|
31
|
+
_DEFAULT_LOSS_FUNCTION = "default"
|
32
|
+
_FOCALLOSS_LOSS_FUNCTION = "focalloss"
|
33
|
+
_GAMMA_KEY = "focalloss_gamma"
|
34
|
+
_ALPHA_KEY = "focalloss_alpha"
|
29
35
|
|
30
36
|
|
31
37
|
class CatboostModel(Model):
|
@@ -42,6 +48,10 @@ class CatboostModel(Model):
|
|
42
48
|
_model_type: None | ModelType
|
43
49
|
_early_stopping_rounds: None | int
|
44
50
|
_best_iteration: None | int
|
51
|
+
_categorical_features: dict[str, bool]
|
52
|
+
_loss_function: None | str
|
53
|
+
_gamma: None | float
|
54
|
+
_alpha: None | float
|
45
55
|
|
46
56
|
@classmethod
|
47
57
|
def name(cls) -> str:
|
@@ -62,6 +72,10 @@ class CatboostModel(Model):
|
|
62
72
|
self._model_type = None
|
63
73
|
self._early_stopping_rounds = None
|
64
74
|
self._best_iteration = None
|
75
|
+
self._categorical_features = {}
|
76
|
+
self._loss_function = None
|
77
|
+
self._gamma = None
|
78
|
+
self._alpha = None
|
65
79
|
|
66
80
|
@property
|
67
81
|
def supports_importances(self) -> bool:
|
@@ -76,7 +90,10 @@ class CatboostModel(Model):
|
|
76
90
|
feature_ids = importances["Feature Id"].to_list() # type: ignore
|
77
91
|
importances = importances["Importances"].to_list() # type: ignore
|
78
92
|
total = sum(importances)
|
79
|
-
return {
|
93
|
+
return {
|
94
|
+
feature_ids[x]: importances[x] / total if total != 0.0 else 0.0
|
95
|
+
for x in range(len(feature_ids))
|
96
|
+
}
|
80
97
|
|
81
98
|
def provide_estimator(self):
|
82
99
|
return self._provide_catboost()
|
@@ -105,6 +122,13 @@ class CatboostModel(Model):
|
|
105
122
|
)
|
106
123
|
self._early_stopping_rounds = trial.suggest_int(_EARLY_STOPPING_ROUNDS, 10, 500)
|
107
124
|
self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
|
125
|
+
loss_function = trial.suggest_categorical(
|
126
|
+
_LOSS_FUNCTION_KEY, [_DEFAULT_LOSS_FUNCTION, _FOCALLOSS_LOSS_FUNCTION]
|
127
|
+
)
|
128
|
+
self._loss_function = loss_function
|
129
|
+
if loss_function == _FOCALLOSS_LOSS_FUNCTION:
|
130
|
+
self._gamma = trial.suggest_float(_GAMMA_KEY, 0.5, 5.0)
|
131
|
+
self._alpha = trial.suggest_float(_ALPHA_KEY, 0.05, 0.95)
|
108
132
|
|
109
133
|
def load(self, folder: str) -> None:
|
110
134
|
with open(
|
@@ -119,6 +143,13 @@ class CatboostModel(Model):
|
|
119
143
|
self._model_type = ModelType(params[_MODEL_TYPE_KEY])
|
120
144
|
self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS]
|
121
145
|
self._best_iteration = params.get(_BEST_ITERATION_KEY)
|
146
|
+
self._loss_function = params.get(_LOSS_FUNCTION_KEY, _DEFAULT_LOSS_FUNCTION)
|
147
|
+
self._gamma = params.get(_GAMMA_KEY)
|
148
|
+
self._alpha = params.get(_ALPHA_KEY)
|
149
|
+
with open(
|
150
|
+
os.path.join(folder, _MODEL_CATEGORICAL_FEATURES_FILENAME), encoding="utf8"
|
151
|
+
) as handle:
|
152
|
+
self._categorical_features = json.load(handle)
|
122
153
|
catboost = self._provide_catboost()
|
123
154
|
catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
|
124
155
|
|
@@ -136,9 +167,18 @@ class CatboostModel(Model):
|
|
136
167
|
_MODEL_TYPE_KEY: str(self._model_type),
|
137
168
|
_EARLY_STOPPING_ROUNDS: self._early_stopping_rounds,
|
138
169
|
_BEST_ITERATION_KEY: self._best_iteration,
|
170
|
+
_LOSS_FUNCTION_KEY: self._loss_function,
|
171
|
+
_GAMMA_KEY: self._gamma,
|
172
|
+
_ALPHA_KEY: self._alpha,
|
139
173
|
},
|
140
174
|
handle,
|
141
175
|
)
|
176
|
+
with open(
|
177
|
+
os.path.join(folder, _MODEL_CATEGORICAL_FEATURES_FILENAME),
|
178
|
+
"w",
|
179
|
+
encoding="utf8",
|
180
|
+
) as handle:
|
181
|
+
json.dump(self._categorical_features, handle)
|
142
182
|
catboost = self._provide_catboost()
|
143
183
|
catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
|
144
184
|
trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
|
@@ -155,6 +195,9 @@ class CatboostModel(Model):
|
|
155
195
|
raise ValueError("y is null.")
|
156
196
|
self._model_type = determine_model_type(y)
|
157
197
|
catboost = self._provide_catboost()
|
198
|
+
self._categorical_features = {
|
199
|
+
x: True for x in df.select_dtypes(include="category").columns.tolist()
|
200
|
+
}
|
158
201
|
|
159
202
|
train_pool = Pool(
|
160
203
|
df,
|
@@ -184,6 +227,10 @@ class CatboostModel(Model):
|
|
184
227
|
return self
|
185
228
|
|
186
229
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
230
|
+
for categorical_feature_column in self._categorical_features.keys():
|
231
|
+
df[categorical_feature_column] = df[categorical_feature_column].astype(
|
232
|
+
"category"
|
233
|
+
)
|
187
234
|
pred_pool = Pool(
|
188
235
|
df,
|
189
236
|
cat_features=df.select_dtypes(include="category").columns.tolist(),
|
@@ -217,6 +264,14 @@ class CatboostModel(Model):
|
|
217
264
|
print(
|
218
265
|
f"Creating catboost model with depth {self._depth}, boosting type {self._boosting_type}, best iteration {best_iteration}",
|
219
266
|
)
|
267
|
+
loss_function = None
|
268
|
+
if (
|
269
|
+
self._loss_function == _FOCALLOSS_LOSS_FUNCTION
|
270
|
+
and self._alpha is not None
|
271
|
+
and self._gamma is not None
|
272
|
+
and self._model_type != ModelType.REGRESSION
|
273
|
+
):
|
274
|
+
loss_function = f"Focal:focal_alpha={self._alpha};focal_gamma={self._gamma}"
|
220
275
|
match self._model_type:
|
221
276
|
case ModelType.BINARY:
|
222
277
|
return CatBoostClassifierWrapper(
|
@@ -229,6 +284,7 @@ class CatboostModel(Model):
|
|
229
284
|
metric_period=100,
|
230
285
|
task_type="GPU" if torch.cuda.is_available() else "CPU",
|
231
286
|
devices="0" if torch.cuda.is_available() else None,
|
287
|
+
loss_function=loss_function,
|
232
288
|
)
|
233
289
|
case ModelType.REGRESSION:
|
234
290
|
return CatBoostRegressorWrapper(
|
@@ -253,6 +309,7 @@ class CatboostModel(Model):
|
|
253
309
|
metric_period=100,
|
254
310
|
task_type="GPU" if torch.cuda.is_available() else "CPU",
|
255
311
|
devices="0" if torch.cuda.is_available() else None,
|
312
|
+
loss_function=loss_function,
|
256
313
|
)
|
257
314
|
case ModelType.MULTI_CLASSIFICATION:
|
258
315
|
return CatBoostClassifierWrapper(
|
@@ -265,6 +322,7 @@ class CatboostModel(Model):
|
|
265
322
|
metric_period=100,
|
266
323
|
task_type="GPU" if torch.cuda.is_available() else "CPU",
|
267
324
|
devices="0" if torch.cuda.is_available() else None,
|
325
|
+
loss_function=loss_function,
|
268
326
|
)
|
269
327
|
case _:
|
270
328
|
raise ValueError(f"Unrecognised model type: {self._model_type}")
|
@@ -1,19 +1,23 @@
|
|
1
1
|
"""A model class that routes to other models."""
|
2
2
|
|
3
|
+
import functools
|
3
4
|
import json
|
4
5
|
import os
|
5
6
|
from typing import Self
|
6
7
|
|
7
8
|
import optuna
|
8
9
|
import pandas as pd
|
10
|
+
from sklearn.metrics import accuracy_score # type: ignore
|
9
11
|
|
12
|
+
from ..model_type import ModelType, determine_model_type
|
10
13
|
from .catboost.catboost_model import CatboostModel
|
11
|
-
from .model import Model
|
14
|
+
from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
12
15
|
from .tabpfn.tabpfn_model import TabPFNModel
|
13
16
|
from .xgboost.xgboost_model import XGBoostModel
|
14
17
|
|
15
18
|
_MODEL_ROUTER_FILE = "model_router.json"
|
16
19
|
_MODEL_KEY = "model"
|
20
|
+
_FALSE_POSITIVE_REDUCTION_STEPS_KEY = "false_positive_reduction_steps"
|
17
21
|
_MODELS = {
|
18
22
|
CatboostModel.name(): CatboostModel,
|
19
23
|
TabPFNModel.name(): TabPFNModel,
|
@@ -27,10 +31,12 @@ class ModelRouter(Model):
|
|
27
31
|
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
28
32
|
|
29
33
|
_model: Model | None
|
34
|
+
_false_positive_reduction_steps: int | None
|
30
35
|
|
31
36
|
def __init__(self) -> None:
|
32
37
|
super().__init__()
|
33
38
|
self._model = None
|
39
|
+
self._false_positive_reduction_steps = None
|
34
40
|
|
35
41
|
@classmethod
|
36
42
|
def name(cls) -> str:
|
@@ -81,6 +87,9 @@ class ModelRouter(Model):
|
|
81
87
|
def set_options(
|
82
88
|
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
83
89
|
) -> None:
|
90
|
+
self._false_positive_reduction_steps = trial.suggest_int(
|
91
|
+
_FALSE_POSITIVE_REDUCTION_STEPS_KEY, 0, 5
|
92
|
+
)
|
84
93
|
model_name = trial.suggest_categorical(
|
85
94
|
"model", [k for k, v in _MODELS.items() if v.supports_x(df)]
|
86
95
|
)
|
@@ -122,7 +131,48 @@ class ModelRouter(Model):
|
|
122
131
|
model = self._model
|
123
132
|
if model is None:
|
124
133
|
raise ValueError("model is null")
|
125
|
-
|
134
|
+
false_positive_reduction_steps = self._false_positive_reduction_steps
|
135
|
+
if false_positive_reduction_steps is None:
|
136
|
+
false_positive_reduction_steps = 0
|
137
|
+
for i in range(max(false_positive_reduction_steps, 1)):
|
138
|
+
print(f"False Positive Reduction Step: {i + 1}")
|
139
|
+
pred = model.fit_transform(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
|
140
|
+
if (
|
141
|
+
w is None
|
142
|
+
or y is None
|
143
|
+
or determine_model_type(y) == ModelType.REGRESSION
|
144
|
+
):
|
145
|
+
break
|
146
|
+
print(f"Accuracy: {accuracy_score(y, pred[PREDICTION_COLUMN])}")
|
147
|
+
pred["__wavetrain_correct"] = pred[PREDICTION_COLUMN] != y
|
148
|
+
pred["__wavetrain_error_weight"] = pred["__wavetrain_correct"].astype(float)
|
149
|
+
prob_columns = sorted(
|
150
|
+
[
|
151
|
+
x
|
152
|
+
for x in pred.columns.values.tolist()
|
153
|
+
if x.startswith(PROBABILITY_COLUMN_PREFIX)
|
154
|
+
]
|
155
|
+
)
|
156
|
+
if prob_columns:
|
157
|
+
|
158
|
+
def determine_error_weight(
|
159
|
+
row: pd.Series, prob_columns: list[str]
|
160
|
+
) -> float:
|
161
|
+
nonlocal y
|
162
|
+
if not row["__wavetrain_correct"]:
|
163
|
+
return abs(row[prob_columns[1 - int(y.loc[row.name])]]) # type: ignore
|
164
|
+
return 0.0
|
165
|
+
|
166
|
+
pred["__wavetrain_error_weight"] = pred.apply(
|
167
|
+
functools.partial(
|
168
|
+
determine_error_weight,
|
169
|
+
prob_columns=prob_columns,
|
170
|
+
),
|
171
|
+
axis=1,
|
172
|
+
)
|
173
|
+
w += pred["__wavetrain_error_weight"]
|
174
|
+
w = w.clip(lower=0.0)
|
175
|
+
|
126
176
|
return self
|
127
177
|
|
128
178
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
@@ -5,10 +5,13 @@ import json
|
|
5
5
|
import os
|
6
6
|
from typing import Self
|
7
7
|
|
8
|
+
import jax.numpy as jnp
|
9
|
+
import numpy as np
|
8
10
|
import optuna
|
9
11
|
import pandas as pd
|
10
12
|
import pytest_is_running
|
11
13
|
import torch
|
14
|
+
from jax import grad, hessian, vmap
|
12
15
|
from xgboost import XGBClassifier, XGBRegressor
|
13
16
|
from xgboost.callback import TrainingCallback
|
14
17
|
from xgboost.core import XGBoostError
|
@@ -39,6 +42,11 @@ _RATE_DROP_KEY = "rate_drop"
|
|
39
42
|
_SKIP_DROP_KEY = "skip_drop"
|
40
43
|
_NUM_BOOST_ROUNDS_KEY = "num_boost_rounds"
|
41
44
|
_EARLY_STOPPING_ROUNDS_KEY = "early_stopping_rounds"
|
45
|
+
_LOSS_FUNCTION_KEY = "xgboost_loss_function"
|
46
|
+
_DEFAULT_LOSS_FUNCTION = "default"
|
47
|
+
_FOCALLOSS_LOSS_FUNCTION = "focalloss"
|
48
|
+
_FOCALLOSS_GAMMA_KEY = "focalloss_gamma"
|
49
|
+
_FOCALLOSS_ALPHA_KEY = "focalloss_alpha"
|
42
50
|
|
43
51
|
|
44
52
|
def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
|
@@ -70,6 +78,9 @@ class XGBoostModel(Model):
|
|
70
78
|
_num_boost_rounds: int | None
|
71
79
|
_early_stopping_rounds: int | None
|
72
80
|
_best_iteration: int | None
|
81
|
+
_focalloss_alpha: float | None
|
82
|
+
_focalloss_gamma: float | None
|
83
|
+
_loss_function: str | None
|
73
84
|
|
74
85
|
@classmethod
|
75
86
|
def name(cls) -> str:
|
@@ -100,6 +111,9 @@ class XGBoostModel(Model):
|
|
100
111
|
self._num_boost_rounds = None
|
101
112
|
self._early_stopping_rounds = None
|
102
113
|
self._best_iteration = None
|
114
|
+
self._loss_function = None
|
115
|
+
self._focalloss_gamma = None
|
116
|
+
self._focalloss_alpha = None
|
103
117
|
|
104
118
|
@property
|
105
119
|
def supports_importances(self) -> bool:
|
@@ -111,7 +125,10 @@ class XGBoostModel(Model):
|
|
111
125
|
try:
|
112
126
|
score_dict = bst.get_booster().get_score(importance_type="weight") # type: ignore
|
113
127
|
total = sum(score_dict.values()) # type: ignore
|
114
|
-
return {
|
128
|
+
return {
|
129
|
+
k: 0.0 if total == 0.0 else v / total # type: ignore
|
130
|
+
for k, v in score_dict.items() # type: ignore
|
131
|
+
} # type: ignore
|
115
132
|
except XGBoostError as exc:
|
116
133
|
print(str(exc))
|
117
134
|
return {}
|
@@ -167,6 +184,15 @@ class XGBoostModel(Model):
|
|
167
184
|
_EARLY_STOPPING_ROUNDS_KEY, 50, 500
|
168
185
|
)
|
169
186
|
self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
|
187
|
+
loss_function = trial.suggest_categorical(
|
188
|
+
_LOSS_FUNCTION_KEY, [_DEFAULT_LOSS_FUNCTION, _FOCALLOSS_LOSS_FUNCTION]
|
189
|
+
)
|
190
|
+
self._loss_function = loss_function
|
191
|
+
if loss_function == _FOCALLOSS_LOSS_FUNCTION:
|
192
|
+
self._focalloss_gamma = trial.suggest_float(_FOCALLOSS_GAMMA_KEY, 0.5, 5.0)
|
193
|
+
self._focalloss_alpha = trial.suggest_float(
|
194
|
+
_FOCALLOSS_ALPHA_KEY, 0.05, 0.95
|
195
|
+
)
|
170
196
|
|
171
197
|
def load(self, folder: str) -> None:
|
172
198
|
with open(
|
@@ -191,6 +217,9 @@ class XGBoostModel(Model):
|
|
191
217
|
self._num_boost_rounds = params[_NUM_BOOST_ROUNDS_KEY]
|
192
218
|
self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS_KEY]
|
193
219
|
self._best_iteration = params.get(_BEST_ITERATION_KEY)
|
220
|
+
self._loss_function = params.get(_LOSS_FUNCTION_KEY, _DEFAULT_LOSS_FUNCTION)
|
221
|
+
self._focalloss_gamma = params.get(_FOCALLOSS_GAMMA_KEY)
|
222
|
+
self._focalloss_alpha = params.get(_FOCALLOSS_ALPHA_KEY)
|
194
223
|
bst = self._provide_xgboost()
|
195
224
|
bst.load_model(os.path.join(folder, _MODEL_FILENAME))
|
196
225
|
|
@@ -220,6 +249,9 @@ class XGBoostModel(Model):
|
|
220
249
|
_SKIP_DROP_KEY: self._skip_drop,
|
221
250
|
_NUM_BOOST_ROUNDS_KEY: self._num_boost_rounds,
|
222
251
|
_EARLY_STOPPING_ROUNDS_KEY: self._early_stopping_rounds,
|
252
|
+
_LOSS_FUNCTION_KEY: self._loss_function,
|
253
|
+
_FOCALLOSS_GAMMA_KEY: self._gamma,
|
254
|
+
_FOCALLOSS_ALPHA_KEY: self._alpha,
|
223
255
|
},
|
224
256
|
handle,
|
225
257
|
)
|
@@ -328,6 +360,46 @@ class XGBoostModel(Model):
|
|
328
360
|
param["normalize_type"] = self._normalize_type
|
329
361
|
param["rate_drop"] = self._rate_drop
|
330
362
|
param["skip_drop"] = self._skip_drop
|
363
|
+
if (
|
364
|
+
self._loss_function == _FOCALLOSS_LOSS_FUNCTION
|
365
|
+
and self._focalloss_alpha is not None
|
366
|
+
and self._focalloss_gamma is not None
|
367
|
+
):
|
368
|
+
|
369
|
+
def focal_loss(alpha=0.25, gamma=2.0):
|
370
|
+
def fl(x, t):
|
371
|
+
p = 1 / (1 + jnp.exp(-x))
|
372
|
+
pt = t * p + (1 - t) * (1 - p)
|
373
|
+
alpha_t = alpha * t + (1 - alpha) * (1 - t)
|
374
|
+
return (
|
375
|
+
-alpha_t * (1 - pt) ** gamma * jnp.log(jnp.clip(pt, 1e-8, 1.0))
|
376
|
+
)
|
377
|
+
|
378
|
+
fl_grad = grad(fl)
|
379
|
+
fl_hess = hessian(fl)
|
380
|
+
grad_batch = vmap(fl_grad)
|
381
|
+
hess_batch = vmap(fl_hess)
|
382
|
+
|
383
|
+
def custom_loss(y_pred, y_true, sample_weight=None):
|
384
|
+
y_true = jnp.array(y_true)
|
385
|
+
y_pred = jnp.array(y_pred)
|
386
|
+
|
387
|
+
grad_vals = grad_batch(y_pred, y_true)
|
388
|
+
hess_vals = hess_batch(y_pred, y_true)
|
389
|
+
|
390
|
+
if sample_weight is not None:
|
391
|
+
sample_weight = jnp.array(sample_weight)
|
392
|
+
grad_vals *= sample_weight
|
393
|
+
hess_vals *= sample_weight
|
394
|
+
|
395
|
+
# Convert to NumPy arrays for XGBoost compatibility
|
396
|
+
return np.array(grad_vals), np.array(hess_vals)
|
397
|
+
|
398
|
+
return custom_loss
|
399
|
+
|
400
|
+
param["objective"] = focal_loss(
|
401
|
+
alpha=self._focalloss_alpha, gamma=self._focalloss_gamma
|
402
|
+
)
|
331
403
|
print(
|
332
404
|
f"Creating xgboost model with max_depth {self._max_depth}, best iteration {best_iteration}, booster: {self._booster}",
|
333
405
|
)
|
@@ -0,0 +1,112 @@
|
|
1
|
+
"""A reducer that removes correlation features."""
|
2
|
+
|
3
|
+
# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate
|
4
|
+
import json
|
5
|
+
import os
|
6
|
+
from typing import Self
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
import optuna
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
from .non_categorical_numeric_columns import \
|
13
|
+
find_non_categorical_numeric_columns
|
14
|
+
from .reducer import Reducer
|
15
|
+
|
16
|
+
_CORRELATION_REDUCER_FILENAME = "correlation_reducer.json"
|
17
|
+
_CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
|
18
|
+
|
19
|
+
|
20
|
+
def _get_correlated_features_to_drop(
|
21
|
+
df: pd.DataFrame, threshold: float = 0.85, random_seed: int = 42
|
22
|
+
) -> list[str]:
|
23
|
+
"""
|
24
|
+
Identify highly correlated features to drop, keeping one per group.
|
25
|
+
NaNs are replaced with a single fixed junk value to allow correlation computation.
|
26
|
+
Columns are processed in sorted order to ensure deterministic output.
|
27
|
+
|
28
|
+
Args:
|
29
|
+
df (pd.DataFrame): Input DataFrame.
|
30
|
+
threshold (float): Correlation threshold above which features are considered redundant.
|
31
|
+
random_seed (int): Seed used to generate the fixed junk value.
|
32
|
+
|
33
|
+
Returns:
|
34
|
+
List[str]: List of column names to drop.
|
35
|
+
"""
|
36
|
+
np.random.seed(random_seed)
|
37
|
+
|
38
|
+
# Select and sort numeric columns
|
39
|
+
sorted_cols = sorted(find_non_categorical_numeric_columns(df))
|
40
|
+
df_numeric = df[sorted_cols].copy()
|
41
|
+
|
42
|
+
# Generate and apply a fixed junk value for NaNs
|
43
|
+
junk_value = np.random.uniform(-1e9, 1e9)
|
44
|
+
df_numeric = df_numeric.fillna(junk_value)
|
45
|
+
|
46
|
+
if df_numeric.shape[1] < 2:
|
47
|
+
return []
|
48
|
+
|
49
|
+
# Compute absolute correlation matrix
|
50
|
+
corr_matrix = np.corrcoef(df_numeric.values, rowvar=False)
|
51
|
+
abs_corr = np.abs(corr_matrix)
|
52
|
+
|
53
|
+
# Greedy feature drop based on sorted order
|
54
|
+
to_drop = set()
|
55
|
+
for i in range(len(sorted_cols)):
|
56
|
+
if sorted_cols[i] in to_drop:
|
57
|
+
continue
|
58
|
+
for j in range(i + 1, len(sorted_cols)):
|
59
|
+
if sorted_cols[j] in to_drop:
|
60
|
+
continue
|
61
|
+
if abs_corr[i, j] > threshold:
|
62
|
+
to_drop.add(sorted_cols[j])
|
63
|
+
|
64
|
+
return sorted(to_drop)
|
65
|
+
|
66
|
+
|
67
|
+
class CorrelationReducer(Reducer):
|
68
|
+
"""A class that removes correlated values from a dataset."""
|
69
|
+
|
70
|
+
_correlation_drop_features: dict[str, bool]
|
71
|
+
|
72
|
+
def __init__(self) -> None:
|
73
|
+
self._threshold = 0.0
|
74
|
+
self._correlation_drop_features = {}
|
75
|
+
|
76
|
+
@classmethod
|
77
|
+
def name(cls) -> str:
|
78
|
+
return "correlation"
|
79
|
+
|
80
|
+
def set_options(
|
81
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
82
|
+
) -> None:
|
83
|
+
self._threshold = trial.suggest_float(_CORRELATION_REDUCER_THRESHOLD, 0.7, 0.99)
|
84
|
+
|
85
|
+
def load(self, folder: str) -> None:
|
86
|
+
with open(
|
87
|
+
os.path.join(folder, _CORRELATION_REDUCER_FILENAME), encoding="utf8"
|
88
|
+
) as handle:
|
89
|
+
self._correlation_drop_features = json.load(handle)
|
90
|
+
|
91
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
92
|
+
with open(
|
93
|
+
os.path.join(folder, _CORRELATION_REDUCER_FILENAME), "w", encoding="utf8"
|
94
|
+
) as handle:
|
95
|
+
json.dump(self._correlation_drop_features, handle)
|
96
|
+
|
97
|
+
def fit(
|
98
|
+
self,
|
99
|
+
df: pd.DataFrame,
|
100
|
+
y: pd.Series | pd.DataFrame | None = None,
|
101
|
+
w: pd.Series | None = None,
|
102
|
+
eval_x: pd.DataFrame | None = None,
|
103
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
104
|
+
) -> Self:
|
105
|
+
drop_features = _get_correlated_features_to_drop(df, threshold=self._threshold)
|
106
|
+
self._correlation_drop_features = {x: True for x in drop_features}
|
107
|
+
return self
|
108
|
+
|
109
|
+
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
110
|
+
return df.drop(
|
111
|
+
columns=list(self._correlation_drop_features.keys()), errors="ignore"
|
112
|
+
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.49
|
3
|
+
Version: 0.0.51
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -26,6 +26,7 @@ Requires-Dist: torch>=2.6.0
|
|
26
26
|
Requires-Dist: tabpfn>=2.0.6
|
27
27
|
Requires-Dist: pytest-is-running>=1.5.1
|
28
28
|
Requires-Dist: xgboost>=3.0.0
|
29
|
+
Requires-Dist: jax>=0.6.1
|
29
30
|
|
30
31
|
# wavetrainer
|
31
32
|
|
@@ -58,6 +59,7 @@ Python 3.11.6:
|
|
58
59
|
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
59
60
|
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
60
61
|
- [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
|
62
|
+
- [jax](https://github.com/jax-ml/jax)
|
61
63
|
|
62
64
|
## Raison D'être :thought_balloon:
|
63
65
|
|
@@ -1,52 +0,0 @@
|
|
1
|
-
"""A reducer that removes correlation features."""
|
2
|
-
|
3
|
-
# pylint: disable=too-many-arguments,too-many-positional-arguments
|
4
|
-
from typing import Self
|
5
|
-
|
6
|
-
import optuna
|
7
|
-
import pandas as pd
|
8
|
-
from feature_engine.selection import DropCorrelatedFeatures
|
9
|
-
|
10
|
-
from .base_selector_reducer import BaseSelectorReducer
|
11
|
-
from .non_categorical_numeric_columns import \
|
12
|
-
find_non_categorical_numeric_columns
|
13
|
-
|
14
|
-
_CORRELATION_REDUCER_FILENAME = "correlation_reducer.joblib"
|
15
|
-
_CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
|
16
|
-
|
17
|
-
|
18
|
-
class CorrelationReducer(BaseSelectorReducer):
|
19
|
-
"""A class that removes correlated values from a dataset."""
|
20
|
-
|
21
|
-
def __init__(self) -> None:
|
22
|
-
self._correlation_selector = DropCorrelatedFeatures(missing_values="ignore")
|
23
|
-
super().__init__(
|
24
|
-
self._correlation_selector,
|
25
|
-
_CORRELATION_REDUCER_FILENAME,
|
26
|
-
)
|
27
|
-
|
28
|
-
@classmethod
|
29
|
-
def name(cls) -> str:
|
30
|
-
return "correlation"
|
31
|
-
|
32
|
-
@classmethod
|
33
|
-
def should_raise(cls) -> bool:
|
34
|
-
return False
|
35
|
-
|
36
|
-
def set_options(
|
37
|
-
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
38
|
-
) -> None:
|
39
|
-
self._correlation_selector.threshold = trial.suggest_float(
|
40
|
-
_CORRELATION_REDUCER_THRESHOLD, 0.7, 0.99
|
41
|
-
)
|
42
|
-
|
43
|
-
def fit(
|
44
|
-
self,
|
45
|
-
df: pd.DataFrame,
|
46
|
-
y: pd.Series | pd.DataFrame | None = None,
|
47
|
-
w: pd.Series | None = None,
|
48
|
-
eval_x: pd.DataFrame | None = None,
|
49
|
-
eval_y: pd.Series | pd.DataFrame | None = None,
|
50
|
-
) -> Self:
|
51
|
-
self._correlation_selector.variables = find_non_categorical_numeric_columns(df)
|
52
|
-
return super().fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/catboost/catboost_classifier_wrap.py
RENAMED
File without changes
|
File without changes
|
{wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/model/catboost/catboost_regressor_wrap.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{wavetrainer-0.0.49 → wavetrainer-0.0.51}/wavetrainer/reducer/non_categorical_numeric_columns.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|