wavetrainer 0.0.49__tar.gz → 0.0.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {wavetrainer-0.0.49/wavetrainer.egg-info → wavetrainer-0.0.50}/PKG-INFO +3 -1
  2. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/README.md +1 -0
  3. wavetrainer-0.0.49/wavetrainer.egg-info/requires.txt → wavetrainer-0.0.50/requirements.txt +1 -0
  4. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/setup.py +1 -1
  5. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/__init__.py +1 -1
  6. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_model.py +59 -1
  7. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/xgboost_model.py +69 -0
  8. wavetrainer-0.0.50/wavetrainer/reducer/correlation_reducer.py +112 -0
  9. {wavetrainer-0.0.49 → wavetrainer-0.0.50/wavetrainer.egg-info}/PKG-INFO +3 -1
  10. wavetrainer-0.0.49/requirements.txt → wavetrainer-0.0.50/wavetrainer.egg-info/requires.txt +2 -1
  11. wavetrainer-0.0.49/wavetrainer/reducer/correlation_reducer.py +0 -52
  12. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/LICENSE +0 -0
  13. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/MANIFEST.in +0 -0
  14. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/setup.cfg +0 -0
  15. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/__init__.py +0 -0
  16. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/model/__init__.py +0 -0
  17. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/model/catboost_kwargs_test.py +0 -0
  18. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/trainer_test.py +0 -0
  19. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/__init__.py +0 -0
  20. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/calibrator.py +0 -0
  21. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/calibrator_router.py +0 -0
  22. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
  23. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  24. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/create.py +0 -0
  25. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/exceptions.py +0 -0
  26. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/fit.py +0 -0
  27. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/__init__.py +0 -0
  28. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/__init__.py +0 -0
  29. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
  30. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
  31. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
  32. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/model.py +0 -0
  33. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/model_router.py +0 -0
  34. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/tabpfn/__init__.py +0 -0
  35. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
  36. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/__init__.py +0 -0
  37. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/early_stopper.py +0 -0
  38. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
  39. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model_type.py +0 -0
  40. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/params.py +0 -0
  41. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/__init__.py +0 -0
  42. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  43. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/combined_reducer.py +0 -0
  44. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/constant_reducer.py +0 -0
  45. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  46. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  47. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
  48. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/pca_reducer.py +0 -0
  49. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/reducer.py +0 -0
  50. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
  51. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  52. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/unseen_reducer.py +0 -0
  53. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/selector/__init__.py +0 -0
  54. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/selector/selector.py +0 -0
  55. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/trainer.py +0 -0
  56. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/__init__.py +0 -0
  57. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/class_weights.py +0 -0
  58. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/combined_weights.py +0 -0
  59. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/exponential_weights.py +0 -0
  60. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/linear_weights.py +0 -0
  61. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/noop_weights.py +0 -0
  62. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/sigmoid_weights.py +0 -0
  63. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/weights.py +0 -0
  64. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/weights_router.py +0 -0
  65. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/windower/__init__.py +0 -0
  66. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/windower/windower.py +0 -0
  67. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/SOURCES.txt +0 -0
  68. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/dependency_links.txt +0 -0
  69. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/not-zip-safe +0 -0
  70. {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.49
3
+ Version: 0.0.50
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -26,6 +26,7 @@ Requires-Dist: torch>=2.6.0
26
26
  Requires-Dist: tabpfn>=2.0.6
27
27
  Requires-Dist: pytest-is-running>=1.5.1
28
28
  Requires-Dist: xgboost>=3.0.0
29
+ Requires-Dist: jax>=0.6.1
29
30
 
30
31
  # wavetrainer
31
32
 
@@ -58,6 +59,7 @@ Python 3.11.6:
58
59
  - [tabpfn](https://github.com/PriorLabs/TabPFN)
59
60
  - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
60
61
  - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
62
+ - [jax](https://github.com/jax-ml/jax)
61
63
 
62
64
  ## Raison D'être :thought_balloon:
63
65
 
@@ -29,6 +29,7 @@ Python 3.11.6:
29
29
  - [tabpfn](https://github.com/PriorLabs/TabPFN)
30
30
  - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
31
31
  - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
32
+ - [jax](https://github.com/jax-ml/jax)
32
33
 
33
34
  ## Raison D'être :thought_balloon:
34
35
 
@@ -13,3 +13,4 @@ torch>=2.6.0
13
13
  tabpfn>=2.0.6
14
14
  pytest-is-running>=1.5.1
15
15
  xgboost>=3.0.0
16
+ jax>=0.6.1
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
23
23
 
24
24
  setup(
25
25
  name='wavetrainer',
26
- version='0.0.49',
26
+ version='0.0.50',
27
27
  description='A library for automatically finding the optimal model within feature and hyperparameter space.',
28
28
  long_description=long_description,
29
29
  long_description_content_type='text/markdown',
@@ -2,5 +2,5 @@
2
2
 
3
3
  from .create import create
4
4
 
5
- __VERSION__ = "0.0.49"
5
+ __VERSION__ = "0.0.50"
6
6
  __all__ = ("create",)
@@ -18,6 +18,7 @@ from .catboost_regressor_wrap import CatBoostRegressorWrapper
18
18
 
19
19
  _MODEL_FILENAME = "model.cbm"
20
20
  _MODEL_PARAMS_FILENAME = "model_params.json"
21
+ _MODEL_CATEGORICAL_FEATURES_FILENAME = "catboost_categorical_features.json"
21
22
  _ITERATIONS_KEY = "iterations"
22
23
  _LEARNING_RATE_KEY = "learning_rate"
23
24
  _DEPTH_KEY = "depth"
@@ -26,6 +27,11 @@ _BOOSTING_TYPE_KEY = "boosting_type"
26
27
  _MODEL_TYPE_KEY = "model_type"
27
28
  _EARLY_STOPPING_ROUNDS = "early_stopping_rounds"
28
29
  _BEST_ITERATION_KEY = "best_iteration"
30
+ _LOSS_FUNCTION_KEY = "loss_function"
31
+ _DEFAULT_LOSS_FUNCTION = "default"
32
+ _FOCALLOSS_LOSS_FUNCTION = "focalloss"
33
+ _GAMMA_KEY = "focalloss_gamma"
34
+ _ALPHA_KEY = "focalloss_alpha"
29
35
 
30
36
 
31
37
  class CatboostModel(Model):
@@ -42,6 +48,10 @@ class CatboostModel(Model):
42
48
  _model_type: None | ModelType
43
49
  _early_stopping_rounds: None | int
44
50
  _best_iteration: None | int
51
+ _categorical_features: dict[str, bool]
52
+ _loss_function: None | str
53
+ _gamma: None | float
54
+ _alpha: None | float
45
55
 
46
56
  @classmethod
47
57
  def name(cls) -> str:
@@ -62,6 +72,10 @@ class CatboostModel(Model):
62
72
  self._model_type = None
63
73
  self._early_stopping_rounds = None
64
74
  self._best_iteration = None
75
+ self._categorical_features = {}
76
+ self._loss_function = None
77
+ self._gamma = None
78
+ self._alpha = None
65
79
 
66
80
  @property
67
81
  def supports_importances(self) -> bool:
@@ -76,7 +90,10 @@ class CatboostModel(Model):
76
90
  feature_ids = importances["Feature Id"].to_list() # type: ignore
77
91
  importances = importances["Importances"].to_list() # type: ignore
78
92
  total = sum(importances)
79
- return {feature_ids[x]: importances[x] / total for x in range(len(feature_ids))}
93
+ return {
94
+ feature_ids[x]: importances[x] / total if total != 0.0 else 0.0
95
+ for x in range(len(feature_ids))
96
+ }
80
97
 
81
98
  def provide_estimator(self):
82
99
  return self._provide_catboost()
@@ -105,6 +122,13 @@ class CatboostModel(Model):
105
122
  )
106
123
  self._early_stopping_rounds = trial.suggest_int(_EARLY_STOPPING_ROUNDS, 10, 500)
107
124
  self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
125
+ loss_function = trial.suggest_categorical(
126
+ _LOSS_FUNCTION_KEY, [_DEFAULT_LOSS_FUNCTION, _FOCALLOSS_LOSS_FUNCTION]
127
+ )
128
+ self._loss_function = loss_function
129
+ if loss_function == _FOCALLOSS_LOSS_FUNCTION:
130
+ self._gamma = trial.suggest_float(_GAMMA_KEY, 0.5, 5.0)
131
+ self._alpha = trial.suggest_float(_ALPHA_KEY, 0.05, 0.95)
108
132
 
109
133
  def load(self, folder: str) -> None:
110
134
  with open(
@@ -119,6 +143,13 @@ class CatboostModel(Model):
119
143
  self._model_type = ModelType(params[_MODEL_TYPE_KEY])
120
144
  self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS]
121
145
  self._best_iteration = params.get(_BEST_ITERATION_KEY)
146
+ self._loss_function = params.get(_LOSS_FUNCTION_KEY, _DEFAULT_LOSS_FUNCTION)
147
+ self._gamma = params.get(_GAMMA_KEY)
148
+ self._alpha = params.get(_ALPHA_KEY)
149
+ with open(
150
+ os.path.join(folder, _MODEL_CATEGORICAL_FEATURES_FILENAME), encoding="utf8"
151
+ ) as handle:
152
+ self._categorical_features = json.load(handle)
122
153
  catboost = self._provide_catboost()
123
154
  catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
124
155
 
@@ -136,9 +167,18 @@ class CatboostModel(Model):
136
167
  _MODEL_TYPE_KEY: str(self._model_type),
137
168
  _EARLY_STOPPING_ROUNDS: self._early_stopping_rounds,
138
169
  _BEST_ITERATION_KEY: self._best_iteration,
170
+ _LOSS_FUNCTION_KEY: self._loss_function,
171
+ _GAMMA_KEY: self._gamma,
172
+ _ALPHA_KEY: self._alpha,
139
173
  },
140
174
  handle,
141
175
  )
176
+ with open(
177
+ os.path.join(folder, _MODEL_CATEGORICAL_FEATURES_FILENAME),
178
+ "w",
179
+ encoding="utf8",
180
+ ) as handle:
181
+ json.dump(self._categorical_features, handle)
142
182
  catboost = self._provide_catboost()
143
183
  catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
144
184
  trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
@@ -155,6 +195,9 @@ class CatboostModel(Model):
155
195
  raise ValueError("y is null.")
156
196
  self._model_type = determine_model_type(y)
157
197
  catboost = self._provide_catboost()
198
+ self._categorical_features = {
199
+ x: True for x in df.select_dtypes(include="category").columns.tolist()
200
+ }
158
201
 
159
202
  train_pool = Pool(
160
203
  df,
@@ -184,6 +227,10 @@ class CatboostModel(Model):
184
227
  return self
185
228
 
186
229
  def transform(self, df: pd.DataFrame) -> pd.DataFrame:
230
+ for categorical_feature_column in self._categorical_features.keys():
231
+ df[categorical_feature_column] = df[categorical_feature_column].astype(
232
+ "category"
233
+ )
187
234
  pred_pool = Pool(
188
235
  df,
189
236
  cat_features=df.select_dtypes(include="category").columns.tolist(),
@@ -217,6 +264,14 @@ class CatboostModel(Model):
217
264
  print(
218
265
  f"Creating catboost model with depth {self._depth}, boosting type {self._boosting_type}, best iteration {best_iteration}",
219
266
  )
267
+ loss_function = None
268
+ if (
269
+ self._loss_function == _FOCALLOSS_LOSS_FUNCTION
270
+ and self._alpha is not None
271
+ and self._gamma is not None
272
+ and self._model_type != ModelType.REGRESSION
273
+ ):
274
+ loss_function = f"Focal:focal_alpha={self._alpha};focal_gamma={self._gamma}"
220
275
  match self._model_type:
221
276
  case ModelType.BINARY:
222
277
  return CatBoostClassifierWrapper(
@@ -229,6 +284,7 @@ class CatboostModel(Model):
229
284
  metric_period=100,
230
285
  task_type="GPU" if torch.cuda.is_available() else "CPU",
231
286
  devices="0" if torch.cuda.is_available() else None,
287
+ loss_function=loss_function,
232
288
  )
233
289
  case ModelType.REGRESSION:
234
290
  return CatBoostRegressorWrapper(
@@ -253,6 +309,7 @@ class CatboostModel(Model):
253
309
  metric_period=100,
254
310
  task_type="GPU" if torch.cuda.is_available() else "CPU",
255
311
  devices="0" if torch.cuda.is_available() else None,
312
+ loss_function=loss_function,
256
313
  )
257
314
  case ModelType.MULTI_CLASSIFICATION:
258
315
  return CatBoostClassifierWrapper(
@@ -265,6 +322,7 @@ class CatboostModel(Model):
265
322
  metric_period=100,
266
323
  task_type="GPU" if torch.cuda.is_available() else "CPU",
267
324
  devices="0" if torch.cuda.is_available() else None,
325
+ loss_function=loss_function,
268
326
  )
269
327
  case _:
270
328
  raise ValueError(f"Unrecognised model type: {self._model_type}")
@@ -5,10 +5,13 @@ import json
5
5
  import os
6
6
  from typing import Self
7
7
 
8
+ import jax.numpy as jnp
9
+ import numpy as np
8
10
  import optuna
9
11
  import pandas as pd
10
12
  import pytest_is_running
11
13
  import torch
14
+ from jax import grad, hessian, vmap
12
15
  from xgboost import XGBClassifier, XGBRegressor
13
16
  from xgboost.callback import TrainingCallback
14
17
  from xgboost.core import XGBoostError
@@ -39,6 +42,11 @@ _RATE_DROP_KEY = "rate_drop"
39
42
  _SKIP_DROP_KEY = "skip_drop"
40
43
  _NUM_BOOST_ROUNDS_KEY = "num_boost_rounds"
41
44
  _EARLY_STOPPING_ROUNDS_KEY = "early_stopping_rounds"
45
+ _LOSS_FUNCTION_KEY = "xgboost_loss_function"
46
+ _DEFAULT_LOSS_FUNCTION = "default"
47
+ _FOCALLOSS_LOSS_FUNCTION = "focalloss"
48
+ _FOCALLOSS_GAMMA_KEY = "focalloss_gamma"
49
+ _FOCALLOSS_ALPHA_KEY = "focalloss_alpha"
42
50
 
43
51
 
44
52
  def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
@@ -70,6 +78,9 @@ class XGBoostModel(Model):
70
78
  _num_boost_rounds: int | None
71
79
  _early_stopping_rounds: int | None
72
80
  _best_iteration: int | None
81
+ _focalloss_alpha: float | None
82
+ _focalloss_gamma: float | None
83
+ _loss_function: str | None
73
84
 
74
85
  @classmethod
75
86
  def name(cls) -> str:
@@ -100,6 +111,9 @@ class XGBoostModel(Model):
100
111
  self._num_boost_rounds = None
101
112
  self._early_stopping_rounds = None
102
113
  self._best_iteration = None
114
+ self._loss_function = None
115
+ self._focalloss_gamma = None
116
+ self._focalloss_alpha = None
103
117
 
104
118
  @property
105
119
  def supports_importances(self) -> bool:
@@ -167,6 +181,15 @@ class XGBoostModel(Model):
167
181
  _EARLY_STOPPING_ROUNDS_KEY, 50, 500
168
182
  )
169
183
  self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
184
+ loss_function = trial.suggest_categorical(
185
+ _LOSS_FUNCTION_KEY, [_DEFAULT_LOSS_FUNCTION, _FOCALLOSS_LOSS_FUNCTION]
186
+ )
187
+ self._loss_function = loss_function
188
+ if loss_function == _FOCALLOSS_LOSS_FUNCTION:
189
+ self._focalloss_gamma = trial.suggest_float(_FOCALLOSS_GAMMA_KEY, 0.5, 5.0)
190
+ self._focalloss_alpha = trial.suggest_float(
191
+ _FOCALLOSS_ALPHA_KEY, 0.05, 0.95
192
+ )
170
193
 
171
194
  def load(self, folder: str) -> None:
172
195
  with open(
@@ -191,6 +214,9 @@ class XGBoostModel(Model):
191
214
  self._num_boost_rounds = params[_NUM_BOOST_ROUNDS_KEY]
192
215
  self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS_KEY]
193
216
  self._best_iteration = params.get(_BEST_ITERATION_KEY)
217
+ self._loss_function = params.get(_LOSS_FUNCTION_KEY, _DEFAULT_LOSS_FUNCTION)
218
+ self._focalloss_gamma = params.get(_FOCALLOSS_GAMMA_KEY)
219
+ self._focalloss_alpha = params.get(_FOCALLOSS_ALPHA_KEY)
194
220
  bst = self._provide_xgboost()
195
221
  bst.load_model(os.path.join(folder, _MODEL_FILENAME))
196
222
 
@@ -220,6 +246,9 @@ class XGBoostModel(Model):
220
246
  _SKIP_DROP_KEY: self._skip_drop,
221
247
  _NUM_BOOST_ROUNDS_KEY: self._num_boost_rounds,
222
248
  _EARLY_STOPPING_ROUNDS_KEY: self._early_stopping_rounds,
249
+ _LOSS_FUNCTION_KEY: self._loss_function,
250
+ _FOCALLOSS_GAMMA_KEY: self._gamma,
251
+ _FOCALLOSS_ALPHA_KEY: self._alpha,
223
252
  },
224
253
  handle,
225
254
  )
@@ -328,6 +357,46 @@ class XGBoostModel(Model):
328
357
  param["normalize_type"] = self._normalize_type
329
358
  param["rate_drop"] = self._rate_drop
330
359
  param["skip_drop"] = self._skip_drop
360
+ if (
361
+ self._loss_function == _FOCALLOSS_LOSS_FUNCTION
362
+ and self._focalloss_alpha is not None
363
+ and self._focalloss_gamma is not None
364
+ ):
365
+
366
def focal_loss(alpha=0.25, gamma=2.0):
    """Build a binary focal-loss objective returning per-sample grad and hess.

    The returned callable follows the xgboost scikit-learn custom objective
    convention ``objective(y_true, y_pred, sample_weight=None)``: labels come
    first and raw margins (pre-sigmoid scores) second.

    Args:
        alpha: Weight applied to the positive class; ``1 - alpha`` is applied
            to the negative class.
        gamma: Focusing exponent applied to ``(1 - pt)``.

    Returns:
        A callable producing ``(grad, hess)`` as NumPy arrays, one value per
        sample.

    NOTE(review): the per-sample loss is scalar, so this assumes 1-D margins
    (binary classification) -- confirm before using it for multi-class.
    """
    # Imported locally so this block does not depend on a module-level import.
    from jax.nn import sigmoid

    def fl(x, t):
        # x is the raw margin, t the 0/1 target. jax.nn.sigmoid keeps the
        # derivatives finite where 1 / (1 + exp(-x)) overflows to inf / inf.
        p = sigmoid(x)
        pt = t * p + (1 - t) * (1 - p)
        alpha_t = alpha * t + (1 - alpha) * (1 - t)
        return -alpha_t * (1 - pt) ** gamma * jnp.log(jnp.clip(pt, 1e-8, 1.0))

    # Differentiate w.r.t. the margin (first argument) and map over samples.
    grad_batch = vmap(grad(fl))
    hess_batch = vmap(hessian(fl))

    def custom_loss(y_true, y_pred, sample_weight=None):
        labels = jnp.array(y_true)
        margins = jnp.array(y_pred)

        grad_vals = grad_batch(margins, labels)
        hess_vals = hess_batch(margins, labels)

        if sample_weight is not None:
            weights = jnp.array(sample_weight)
            grad_vals = grad_vals * weights
            hess_vals = hess_vals * weights

        # Convert to NumPy arrays for XGBoost compatibility
        return np.array(grad_vals), np.array(hess_vals)

    return custom_loss
396
+
397
+ param["objective"] = focal_loss(
398
+ alpha=self._focalloss_alpha, gamma=self._focalloss_gamma
399
+ )
331
400
  print(
332
401
  f"Creating xgboost model with max_depth {self._max_depth}, best iteration {best_iteration}, booster: {self._booster}",
333
402
  )
@@ -0,0 +1,112 @@
1
+ """A reducer that removes correlation features."""
2
+
3
+ # pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate
4
+ import json
5
+ import os
6
+ from typing import Self
7
+
8
+ import numpy as np
9
+ import optuna
10
+ import pandas as pd
11
+
12
+ from .non_categorical_numeric_columns import \
13
+ find_non_categorical_numeric_columns
14
+ from .reducer import Reducer
15
+
16
+ _CORRELATION_REDUCER_FILENAME = "correlation_reducer.json"
17
+ _CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
18
+
19
+
20
def _get_correlated_features_to_drop(
    df: pd.DataFrame, threshold: float = 0.85, random_seed: int = 42
) -> list[str]:
    """Identify highly correlated features to drop, keeping one per group.

    NaNs are replaced with a single fixed junk value so that a correlation
    can be computed. Columns are processed in sorted order, so the output is
    deterministic: for each correlated pair the column that sorts first is
    kept and the later one is dropped.

    Args:
        df: Input DataFrame.
        threshold: Absolute correlation above which the later column of a
            pair is considered redundant.
        random_seed: Seed used to generate the fixed junk value.

    Returns:
        Sorted list of column names to drop.
    """
    # Select and sort the candidate columns.
    # presumably the helper returns column labels of df -- verify against it.
    sorted_cols = sorted(find_non_categorical_numeric_columns(df))
    if len(sorted_cols) < 2:
        # Nothing to compare against; skip the copy and the correlation.
        return []

    # A private RandomState draws the same value as seeding the legacy global
    # generator would, without resetting np.random for the rest of the process.
    junk_value = np.random.RandomState(random_seed).uniform(-1e9, 1e9)
    df_numeric = df[sorted_cols].fillna(junk_value)

    # Zero-variance columns give NaN correlations; NaN never exceeds the
    # threshold, so they are kept. Only the divide warnings are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        abs_corr = np.abs(np.corrcoef(df_numeric.values, rowvar=False))

    # Greedy pass in sorted order: a dropped column never causes other drops.
    to_drop: set[str] = set()
    for i, kept_col in enumerate(sorted_cols):
        if kept_col in to_drop:
            continue
        for j in range(i + 1, len(sorted_cols)):
            candidate = sorted_cols[j]
            if candidate in to_drop:
                continue
            if abs_corr[i, j] > threshold:
                to_drop.add(candidate)

    return sorted(to_drop)
65
+
66
+
67
class CorrelationReducer(Reducer):
    """Reducer that drops features found to be highly correlated at fit time."""

    # Names of the columns to drop, stored as a JSON-serialisable mapping.
    _correlation_drop_features: dict[str, bool]

    def __init__(self) -> None:
        self._correlation_drop_features = {}
        self._threshold = 0.0

    @classmethod
    def name(cls) -> str:
        return "correlation"

    def set_options(
        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
    ) -> None:
        # The correlation threshold is a tunable hyperparameter of the trial.
        threshold = trial.suggest_float(_CORRELATION_REDUCER_THRESHOLD, 0.7, 0.99)
        self._threshold = threshold

    def load(self, folder: str) -> None:
        path = os.path.join(folder, _CORRELATION_REDUCER_FILENAME)
        with open(path, encoding="utf8") as handle:
            self._correlation_drop_features = json.load(handle)

    def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
        path = os.path.join(folder, _CORRELATION_REDUCER_FILENAME)
        with open(path, "w", encoding="utf8") as handle:
            json.dump(self._correlation_drop_features, handle)

    def fit(
        self,
        df: pd.DataFrame,
        y: pd.Series | pd.DataFrame | None = None,
        w: pd.Series | None = None,
        eval_x: pd.DataFrame | None = None,
        eval_y: pd.Series | pd.DataFrame | None = None,
    ) -> Self:
        # Only df and the configured threshold are used; the rest is ignored.
        redundant = _get_correlated_features_to_drop(df, threshold=self._threshold)
        self._correlation_drop_features = dict.fromkeys(redundant, True)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        # errors="ignore" tolerates frames that lack some recorded columns.
        doomed = list(self._correlation_drop_features)
        return df.drop(columns=doomed, errors="ignore")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.49
3
+ Version: 0.0.50
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -26,6 +26,7 @@ Requires-Dist: torch>=2.6.0
26
26
  Requires-Dist: tabpfn>=2.0.6
27
27
  Requires-Dist: pytest-is-running>=1.5.1
28
28
  Requires-Dist: xgboost>=3.0.0
29
+ Requires-Dist: jax>=0.6.1
29
30
 
30
31
  # wavetrainer
31
32
 
@@ -58,6 +59,7 @@ Python 3.11.6:
58
59
  - [tabpfn](https://github.com/PriorLabs/TabPFN)
59
60
  - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
60
61
  - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
62
+ - [jax](https://github.com/jax-ml/jax)
61
63
 
62
64
  ## Raison D'être :thought_balloon:
63
65
 
@@ -12,4 +12,5 @@ pytz>=2025.1
12
12
  torch>=2.6.0
13
13
  tabpfn>=2.0.6
14
14
  pytest-is-running>=1.5.1
15
- xgboost>=3.0.0
15
+ xgboost>=3.0.0
16
+ jax>=0.6.1
@@ -1,52 +0,0 @@
1
- """A reducer that removes correlation features."""
2
-
3
- # pylint: disable=too-many-arguments,too-many-positional-arguments
4
- from typing import Self
5
-
6
- import optuna
7
- import pandas as pd
8
- from feature_engine.selection import DropCorrelatedFeatures
9
-
10
- from .base_selector_reducer import BaseSelectorReducer
11
- from .non_categorical_numeric_columns import \
12
- find_non_categorical_numeric_columns
13
-
14
- _CORRELATION_REDUCER_FILENAME = "correlation_reducer.joblib"
15
- _CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
16
-
17
-
18
- class CorrelationReducer(BaseSelectorReducer):
19
- """A class that removes correlated values from a dataset."""
20
-
21
- def __init__(self) -> None:
22
- self._correlation_selector = DropCorrelatedFeatures(missing_values="ignore")
23
- super().__init__(
24
- self._correlation_selector,
25
- _CORRELATION_REDUCER_FILENAME,
26
- )
27
-
28
- @classmethod
29
- def name(cls) -> str:
30
- return "correlation"
31
-
32
- @classmethod
33
- def should_raise(cls) -> bool:
34
- return False
35
-
36
- def set_options(
37
- self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
38
- ) -> None:
39
- self._correlation_selector.threshold = trial.suggest_float(
40
- _CORRELATION_REDUCER_THRESHOLD, 0.7, 0.99
41
- )
42
-
43
- def fit(
44
- self,
45
- df: pd.DataFrame,
46
- y: pd.Series | pd.DataFrame | None = None,
47
- w: pd.Series | None = None,
48
- eval_x: pd.DataFrame | None = None,
49
- eval_y: pd.Series | pd.DataFrame | None = None,
50
- ) -> Self:
51
- self._correlation_selector.variables = find_non_categorical_numeric_columns(df)
52
- return super().fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
File without changes
File without changes
File without changes