wavetrainer 0.0.49__tar.gz → 0.0.50__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.49/wavetrainer.egg-info → wavetrainer-0.0.50}/PKG-INFO +3 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/README.md +1 -0
- wavetrainer-0.0.49/wavetrainer.egg-info/requires.txt → wavetrainer-0.0.50/requirements.txt +1 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/setup.py +1 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_model.py +59 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/xgboost_model.py +69 -0
- wavetrainer-0.0.50/wavetrainer/reducer/correlation_reducer.py +112 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50/wavetrainer.egg-info}/PKG-INFO +3 -1
- wavetrainer-0.0.49/requirements.txt → wavetrainer-0.0.50/wavetrainer.egg-info/requires.txt +2 -1
- wavetrainer-0.0.49/wavetrainer/reducer/correlation_reducer.py +0 -52
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/LICENSE +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/MANIFEST.in +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/setup.cfg +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/trainer_test.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/model.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/model_router.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/tabpfn/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/early_stopper.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/combined_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/pca_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/selector/selector.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/trainer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/SOURCES.txt +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/top_level.txt +0 -0
{wavetrainer-0.0.49/wavetrainer.egg-info → wavetrainer-0.0.50}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.0.49
+Version: 0.0.50
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
@@ -26,6 +26,7 @@ Requires-Dist: torch>=2.6.0
 Requires-Dist: tabpfn>=2.0.6
 Requires-Dist: pytest-is-running>=1.5.1
 Requires-Dist: xgboost>=3.0.0
+Requires-Dist: jax>=0.6.1
 
 # wavetrainer
 
@@ -58,6 +59,7 @@ Python 3.11.6:
 - [tabpfn](https://github.com/PriorLabs/TabPFN)
 - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
 - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
+- [jax](https://github.com/jax-ml/jax)
 
 ## Raison D'être :thought_balloon:
 
{wavetrainer-0.0.49 → wavetrainer-0.0.50}/README.md

@@ -29,6 +29,7 @@ Python 3.11.6:
 - [tabpfn](https://github.com/PriorLabs/TabPFN)
 - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
 - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
+- [jax](https://github.com/jax-ml/jax)
 
 ## Raison D'être :thought_balloon:
 
{wavetrainer-0.0.49 → wavetrainer-0.0.50}/setup.py

@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
 setup(
     name='wavetrainer',
-    version='0.0.49',
+    version='0.0.50',
     description='A library for automatically finding the optimal model within feature and hyperparameter space.',
     long_description=long_description,
     long_description_content_type='text/markdown',
{wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_model.py

@@ -18,6 +18,7 @@ from .catboost_regressor_wrap import CatBoostRegressorWrapper
 
 _MODEL_FILENAME = "model.cbm"
 _MODEL_PARAMS_FILENAME = "model_params.json"
+_MODEL_CATEGORICAL_FEATURES_FILENAME = "catboost_categorical_features.json"
 _ITERATIONS_KEY = "iterations"
 _LEARNING_RATE_KEY = "learning_rate"
 _DEPTH_KEY = "depth"
@@ -26,6 +27,11 @@ _BOOSTING_TYPE_KEY = "boosting_type"
 _MODEL_TYPE_KEY = "model_type"
 _EARLY_STOPPING_ROUNDS = "early_stopping_rounds"
 _BEST_ITERATION_KEY = "best_iteration"
+_LOSS_FUNCTION_KEY = "loss_function"
+_DEFAULT_LOSS_FUNCTION = "default"
+_FOCALLOSS_LOSS_FUNCTION = "focalloss"
+_GAMMA_KEY = "focalloss_gamma"
+_ALPHA_KEY = "focalloss_alpha"
 
 
 class CatboostModel(Model):
@@ -42,6 +48,10 @@ class CatboostModel(Model):
     _model_type: None | ModelType
     _early_stopping_rounds: None | int
     _best_iteration: None | int
+    _categorical_features: dict[str, bool]
+    _loss_function: None | str
+    _gamma: None | float
+    _alpha: None | float
 
     @classmethod
     def name(cls) -> str:
@@ -62,6 +72,10 @@ class CatboostModel(Model):
         self._model_type = None
         self._early_stopping_rounds = None
         self._best_iteration = None
+        self._categorical_features = {}
+        self._loss_function = None
+        self._gamma = None
+        self._alpha = None
 
     @property
     def supports_importances(self) -> bool:
@@ -76,7 +90,10 @@ class CatboostModel(Model):
         feature_ids = importances["Feature Id"].to_list()  # type: ignore
         importances = importances["Importances"].to_list()  # type: ignore
         total = sum(importances)
-        return {
+        return {
+            feature_ids[x]: importances[x] / total if total != 0.0 else 0.0
+            for x in range(len(feature_ids))
+        }
 
     def provide_estimator(self):
         return self._provide_catboost()
@@ -105,6 +122,13 @@ class CatboostModel(Model):
         )
         self._early_stopping_rounds = trial.suggest_int(_EARLY_STOPPING_ROUNDS, 10, 500)
         self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
+        loss_function = trial.suggest_categorical(
+            _LOSS_FUNCTION_KEY, [_DEFAULT_LOSS_FUNCTION, _FOCALLOSS_LOSS_FUNCTION]
+        )
+        self._loss_function = loss_function
+        if loss_function == _FOCALLOSS_LOSS_FUNCTION:
+            self._gamma = trial.suggest_float(_GAMMA_KEY, 0.5, 5.0)
+            self._alpha = trial.suggest_float(_ALPHA_KEY, 0.05, 0.95)
 
     def load(self, folder: str) -> None:
         with open(
@@ -119,6 +143,13 @@ class CatboostModel(Model):
         self._model_type = ModelType(params[_MODEL_TYPE_KEY])
         self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS]
         self._best_iteration = params.get(_BEST_ITERATION_KEY)
+        self._loss_function = params.get(_LOSS_FUNCTION_KEY, _DEFAULT_LOSS_FUNCTION)
+        self._gamma = params.get(_GAMMA_KEY)
+        self._alpha = params.get(_ALPHA_KEY)
+        with open(
+            os.path.join(folder, _MODEL_CATEGORICAL_FEATURES_FILENAME), encoding="utf8"
+        ) as handle:
+            self._categorical_features = json.load(handle)
         catboost = self._provide_catboost()
         catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
 
@@ -136,9 +167,18 @@ class CatboostModel(Model):
                 _MODEL_TYPE_KEY: str(self._model_type),
                 _EARLY_STOPPING_ROUNDS: self._early_stopping_rounds,
                 _BEST_ITERATION_KEY: self._best_iteration,
+                _LOSS_FUNCTION_KEY: self._loss_function,
+                _GAMMA_KEY: self._gamma,
+                _ALPHA_KEY: self._alpha,
             },
             handle,
         )
+        with open(
+            os.path.join(folder, _MODEL_CATEGORICAL_FEATURES_FILENAME),
+            "w",
+            encoding="utf8",
+        ) as handle:
+            json.dump(self._categorical_features, handle)
         catboost = self._provide_catboost()
         catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
         trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
@@ -155,6 +195,9 @@ class CatboostModel(Model):
             raise ValueError("y is null.")
         self._model_type = determine_model_type(y)
         catboost = self._provide_catboost()
+        self._categorical_features = {
+            x: True for x in df.select_dtypes(include="category").columns.tolist()
+        }
 
         train_pool = Pool(
             df,
@@ -184,6 +227,10 @@ class CatboostModel(Model):
         return self
 
     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        for categorical_feature_column in self._categorical_features.keys():
+            df[categorical_feature_column] = df[categorical_feature_column].astype(
+                "category"
+            )
         pred_pool = Pool(
             df,
             cat_features=df.select_dtypes(include="category").columns.tolist(),
@@ -217,6 +264,14 @@ class CatboostModel(Model):
         print(
             f"Creating catboost model with depth {self._depth}, boosting type {self._boosting_type}, best iteration {best_iteration}",
         )
+        loss_function = None
+        if (
+            self._loss_function == _FOCALLOSS_LOSS_FUNCTION
+            and self._alpha is not None
+            and self._gamma is not None
+            and self._model_type != ModelType.REGRESSION
+        ):
+            loss_function = f"Focal:focal_alpha={self._alpha};focal_gamma={self._gamma}"
         match self._model_type:
             case ModelType.BINARY:
                 return CatBoostClassifierWrapper(
@@ -229,6 +284,7 @@ class CatboostModel(Model):
                     metric_period=100,
                     task_type="GPU" if torch.cuda.is_available() else "CPU",
                     devices="0" if torch.cuda.is_available() else None,
+                    loss_function=loss_function,
                 )
             case ModelType.REGRESSION:
                 return CatBoostRegressorWrapper(
@@ -253,6 +309,7 @@ class CatboostModel(Model):
                     metric_period=100,
                     task_type="GPU" if torch.cuda.is_available() else "CPU",
                     devices="0" if torch.cuda.is_available() else None,
+                    loss_function=loss_function,
                 )
             case ModelType.MULTI_CLASSIFICATION:
                 return CatBoostClassifierWrapper(
@@ -265,6 +322,7 @@ class CatboostModel(Model):
                     metric_period=100,
                     task_type="GPU" if torch.cuda.is_available() else "CPU",
                     devices="0" if torch.cuda.is_available() else None,
+                    loss_function=loss_function,
                 )
             case _:
                 raise ValueError(f"Unrecognised model type: {self._model_type}")
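Note on the CatBoost change above: no custom objective callable is required, because CatBoost ships a built-in Focal loss that is selected with the parameter string the diff constructs ("Focal:focal_alpha=...;focal_gamma=..."). A minimal sketch of the resulting estimator configuration, with illustrative values standing in for the Optuna-tuned trial parameters:

    # Minimal sketch: select CatBoost's built-in focal loss via a parameter
    # string, as the diff above does. alpha/gamma are illustrative placeholders.
    from catboost import CatBoostClassifier

    alpha, gamma = 0.25, 2.0
    model = CatBoostClassifier(
        loss_function=f"Focal:focal_alpha={alpha};focal_gamma={gamma}",
        iterations=100,
        verbose=False,
    )
    # model.fit(X, y) would then optimize the focal objective instead of Logloss.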
{wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/xgboost_model.py

@@ -5,10 +5,13 @@ import json
 import os
 from typing import Self
 
+import jax.numpy as jnp
+import numpy as np
 import optuna
 import pandas as pd
 import pytest_is_running
 import torch
+from jax import grad, hessian, vmap
 from xgboost import XGBClassifier, XGBRegressor
 from xgboost.callback import TrainingCallback
 from xgboost.core import XGBoostError
@@ -39,6 +42,11 @@ _RATE_DROP_KEY = "rate_drop"
 _SKIP_DROP_KEY = "skip_drop"
 _NUM_BOOST_ROUNDS_KEY = "num_boost_rounds"
 _EARLY_STOPPING_ROUNDS_KEY = "early_stopping_rounds"
+_LOSS_FUNCTION_KEY = "xgboost_loss_function"
+_DEFAULT_LOSS_FUNCTION = "default"
+_FOCALLOSS_LOSS_FUNCTION = "focalloss"
+_FOCALLOSS_GAMMA_KEY = "focalloss_gamma"
+_FOCALLOSS_ALPHA_KEY = "focalloss_alpha"
 
 
 def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
@@ -70,6 +78,9 @@ class XGBoostModel(Model):
     _num_boost_rounds: int | None
     _early_stopping_rounds: int | None
     _best_iteration: int | None
+    _focalloss_alpha: float | None
+    _focalloss_gamma: float | None
+    _loss_function: str | None
 
     @classmethod
     def name(cls) -> str:
@@ -100,6 +111,9 @@ class XGBoostModel(Model):
         self._num_boost_rounds = None
         self._early_stopping_rounds = None
         self._best_iteration = None
+        self._loss_function = None
+        self._focalloss_gamma = None
+        self._focalloss_alpha = None
 
     @property
     def supports_importances(self) -> bool:
@@ -167,6 +181,15 @@ class XGBoostModel(Model):
             _EARLY_STOPPING_ROUNDS_KEY, 50, 500
         )
         self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
+        loss_function = trial.suggest_categorical(
+            _LOSS_FUNCTION_KEY, [_DEFAULT_LOSS_FUNCTION, _FOCALLOSS_LOSS_FUNCTION]
+        )
+        self._loss_function = loss_function
+        if loss_function == _FOCALLOSS_LOSS_FUNCTION:
+            self._focalloss_gamma = trial.suggest_float(_FOCALLOSS_GAMMA_KEY, 0.5, 5.0)
+            self._focalloss_alpha = trial.suggest_float(
+                _FOCALLOSS_ALPHA_KEY, 0.05, 0.95
+            )
 
     def load(self, folder: str) -> None:
         with open(
@@ -191,6 +214,9 @@ class XGBoostModel(Model):
         self._num_boost_rounds = params[_NUM_BOOST_ROUNDS_KEY]
         self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS_KEY]
         self._best_iteration = params.get(_BEST_ITERATION_KEY)
+        self._loss_function = params.get(_LOSS_FUNCTION_KEY, _DEFAULT_LOSS_FUNCTION)
+        self._focalloss_gamma = params.get(_FOCALLOSS_GAMMA_KEY)
+        self._focalloss_alpha = params.get(_FOCALLOSS_ALPHA_KEY)
         bst = self._provide_xgboost()
         bst.load_model(os.path.join(folder, _MODEL_FILENAME))
 
@@ -220,6 +246,9 @@ class XGBoostModel(Model):
                 _SKIP_DROP_KEY: self._skip_drop,
                 _NUM_BOOST_ROUNDS_KEY: self._num_boost_rounds,
                 _EARLY_STOPPING_ROUNDS_KEY: self._early_stopping_rounds,
+                _LOSS_FUNCTION_KEY: self._loss_function,
+                _FOCALLOSS_GAMMA_KEY: self._gamma,
+                _FOCALLOSS_ALPHA_KEY: self._alpha,
             },
             handle,
         )
@@ -328,6 +357,46 @@ class XGBoostModel(Model):
         param["normalize_type"] = self._normalize_type
         param["rate_drop"] = self._rate_drop
         param["skip_drop"] = self._skip_drop
+        if (
+            self._loss_function == _FOCALLOSS_LOSS_FUNCTION
+            and self._focalloss_alpha is not None
+            and self._focalloss_gamma is not None
+        ):
+
+            def focal_loss(alpha=0.25, gamma=2.0):
+                def fl(x, t):
+                    p = 1 / (1 + jnp.exp(-x))
+                    pt = t * p + (1 - t) * (1 - p)
+                    alpha_t = alpha * t + (1 - alpha) * (1 - t)
+                    return (
+                        -alpha_t * (1 - pt) ** gamma * jnp.log(jnp.clip(pt, 1e-8, 1.0))
+                    )
+
+                fl_grad = grad(fl)
+                fl_hess = hessian(fl)
+                grad_batch = vmap(fl_grad)
+                hess_batch = vmap(fl_hess)
+
+                def custom_loss(y_pred, y_true, sample_weight=None):
+                    y_true = jnp.array(y_true)
+                    y_pred = jnp.array(y_pred)
+
+                    grad_vals = grad_batch(y_pred, y_true)
+                    hess_vals = hess_batch(y_pred, y_true)
+
+                    if sample_weight is not None:
+                        sample_weight = jnp.array(sample_weight)
+                        grad_vals *= sample_weight
+                        hess_vals *= sample_weight
+
+                    # Convert to NumPy arrays for XGBoost compatibility
+                    return np.array(grad_vals), np.array(hess_vals)
+
+                return custom_loss
+
+            param["objective"] = focal_loss(
+                alpha=self._focalloss_alpha, gamma=self._focalloss_gamma
+            )
         print(
             f"Creating xgboost model with max_depth {self._max_depth}, best iteration {best_iteration}, booster: {self._booster}",
         )
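For reference, the JAX pattern the new XGBoost objective relies on can be exercised standalone: write the focal loss for a single (logit, label) pair, then let grad, hessian, and vmap derive the batched first and second derivatives that XGBoost's custom-objective API expects. A minimal sketch; the inputs and the alpha/gamma defaults are illustrative only:

    import jax.numpy as jnp
    import numpy as np
    from jax import grad, hessian, vmap

    def fl(x, t, alpha=0.25, gamma=2.0):
        p = 1 / (1 + jnp.exp(-x))                    # sigmoid of the raw margin
        pt = t * p + (1 - t) * (1 - p)               # probability of the true class
        alpha_t = alpha * t + (1 - alpha) * (1 - t)  # class-balancing weight
        return -alpha_t * (1 - pt) ** gamma * jnp.log(jnp.clip(pt, 1e-8, 1.0))

    grad_batch = vmap(grad(fl))     # d(loss)/d(logit), one value per sample
    hess_batch = vmap(hessian(fl))  # second derivative, one value per sample

    y_pred = jnp.array([0.3, -1.2, 2.0])  # raw margins
    y_true = jnp.array([1.0, 0.0, 1.0])
    # XGBoost custom objectives return (grad, hess) as NumPy arrays.
    print(np.array(grad_batch(y_pred, y_true)), np.array(hess_batch(y_pred, y_true)))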
wavetrainer-0.0.50/wavetrainer/reducer/correlation_reducer.py (new file)

@@ -0,0 +1,112 @@
+"""A reducer that removes correlation features."""
+
+# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate
+import json
+import os
+from typing import Self
+
+import numpy as np
+import optuna
+import pandas as pd
+
+from .non_categorical_numeric_columns import \
+    find_non_categorical_numeric_columns
+from .reducer import Reducer
+
+_CORRELATION_REDUCER_FILENAME = "correlation_reducer.json"
+_CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
+
+
+def _get_correlated_features_to_drop(
+    df: pd.DataFrame, threshold: float = 0.85, random_seed: int = 42
+) -> list[str]:
+    """
+    Identify highly correlated features to drop, keeping one per group.
+    NaNs are replaced with a single fixed junk value to allow correlation computation.
+    Columns are processed in sorted order to ensure deterministic output.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame.
+        threshold (float): Correlation threshold above which features are considered redundant.
+        random_seed (int): Seed used to generate the fixed junk value.
+
+    Returns:
+        List[str]: List of column names to drop.
+    """
+    np.random.seed(random_seed)
+
+    # Select and sort numeric columns
+    sorted_cols = sorted(find_non_categorical_numeric_columns(df))
+    df_numeric = df[sorted_cols].copy()
+
+    # Generate and apply a fixed junk value for NaNs
+    junk_value = np.random.uniform(-1e9, 1e9)
+    df_numeric = df_numeric.fillna(junk_value)
+
+    if df_numeric.shape[1] < 2:
+        return []
+
+    # Compute absolute correlation matrix
+    corr_matrix = np.corrcoef(df_numeric.values, rowvar=False)
+    abs_corr = np.abs(corr_matrix)
+
+    # Greedy feature drop based on sorted order
+    to_drop = set()
+    for i in range(len(sorted_cols)):
+        if sorted_cols[i] in to_drop:
+            continue
+        for j in range(i + 1, len(sorted_cols)):
+            if sorted_cols[j] in to_drop:
+                continue
+            if abs_corr[i, j] > threshold:
+                to_drop.add(sorted_cols[j])
+
+    return sorted(to_drop)
+
+
+class CorrelationReducer(Reducer):
+    """A class that removes correlated values from a dataset."""
+
+    _correlation_drop_features: dict[str, bool]
+
+    def __init__(self) -> None:
+        self._threshold = 0.0
+        self._correlation_drop_features = {}
+
+    @classmethod
+    def name(cls) -> str:
+        return "correlation"
+
+    def set_options(
+        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
+    ) -> None:
+        self._threshold = trial.suggest_float(_CORRELATION_REDUCER_THRESHOLD, 0.7, 0.99)
+
+    def load(self, folder: str) -> None:
+        with open(
+            os.path.join(folder, _CORRELATION_REDUCER_FILENAME), encoding="utf8"
+        ) as handle:
+            self._correlation_drop_features = json.load(handle)
+
+    def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
+        with open(
+            os.path.join(folder, _CORRELATION_REDUCER_FILENAME), "w", encoding="utf8"
+        ) as handle:
+            json.dump(self._correlation_drop_features, handle)
+
+    def fit(
+        self,
+        df: pd.DataFrame,
+        y: pd.Series | pd.DataFrame | None = None,
+        w: pd.Series | None = None,
+        eval_x: pd.DataFrame | None = None,
+        eval_y: pd.Series | pd.DataFrame | None = None,
+    ) -> Self:
+        drop_features = _get_correlated_features_to_drop(df, threshold=self._threshold)
+        self._correlation_drop_features = {x: True for x in drop_features}
+        return self
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        return df.drop(
+            columns=list(self._correlation_drop_features.keys()), errors="ignore"
+        )
{wavetrainer-0.0.49 → wavetrainer-0.0.50/wavetrainer.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.0.49
+Version: 0.0.50
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
@@ -26,6 +26,7 @@ Requires-Dist: torch>=2.6.0
 Requires-Dist: tabpfn>=2.0.6
 Requires-Dist: pytest-is-running>=1.5.1
 Requires-Dist: xgboost>=3.0.0
+Requires-Dist: jax>=0.6.1
 
 # wavetrainer
 
@@ -58,6 +59,7 @@ Python 3.11.6:
 - [tabpfn](https://github.com/PriorLabs/TabPFN)
 - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
 - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
+- [jax](https://github.com/jax-ml/jax)
 
 ## Raison D'être :thought_balloon:
 
wavetrainer-0.0.49/wavetrainer/reducer/correlation_reducer.py (deleted)

@@ -1,52 +0,0 @@
-"""A reducer that removes correlation features."""
-
-# pylint: disable=too-many-arguments,too-many-positional-arguments
-from typing import Self
-
-import optuna
-import pandas as pd
-from feature_engine.selection import DropCorrelatedFeatures
-
-from .base_selector_reducer import BaseSelectorReducer
-from .non_categorical_numeric_columns import \
-    find_non_categorical_numeric_columns
-
-_CORRELATION_REDUCER_FILENAME = "correlation_reducer.joblib"
-_CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
-
-
-class CorrelationReducer(BaseSelectorReducer):
-    """A class that removes correlated values from a dataset."""
-
-    def __init__(self) -> None:
-        self._correlation_selector = DropCorrelatedFeatures(missing_values="ignore")
-        super().__init__(
-            self._correlation_selector,
-            _CORRELATION_REDUCER_FILENAME,
-        )
-
-    @classmethod
-    def name(cls) -> str:
-        return "correlation"
-
-    @classmethod
-    def should_raise(cls) -> bool:
-        return False
-
-    def set_options(
-        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
-    ) -> None:
-        self._correlation_selector.threshold = trial.suggest_float(
-            _CORRELATION_REDUCER_THRESHOLD, 0.7, 0.99
-        )
-
-    def fit(
-        self,
-        df: pd.DataFrame,
-        y: pd.Series | pd.DataFrame | None = None,
-        w: pd.Series | None = None,
-        eval_x: pd.DataFrame | None = None,
-        eval_y: pd.Series | pd.DataFrame | None = None,
-    ) -> Self:
-        self._correlation_selector.variables = find_non_categorical_numeric_columns(df)
-        return super().fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
|