wavetrainer 0.1.10__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {wavetrainer-0.1.10/wavetrainer.egg-info → wavetrainer-0.1.12}/PKG-INFO +5 -1
  2. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/README.md +2 -0
  3. wavetrainer-0.1.10/wavetrainer.egg-info/requires.txt → wavetrainer-0.1.12/requirements.txt +2 -0
  4. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/setup.py +1 -1
  5. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/__init__.py +1 -1
  6. wavetrainer-0.1.12/wavetrainer/model/lightgbm/__init__.py +1 -0
  7. wavetrainer-0.1.12/wavetrainer/model/lightgbm/lightgbm_model.py +245 -0
  8. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/model_router.py +2 -0
  9. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/xgboost_model.py +1 -1
  10. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/combined_reducer.py +1 -2
  11. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/trainer.py +69 -12
  12. {wavetrainer-0.1.10 → wavetrainer-0.1.12/wavetrainer.egg-info}/PKG-INFO +5 -1
  13. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/SOURCES.txt +2 -0
  14. wavetrainer-0.1.10/requirements.txt → wavetrainer-0.1.12/wavetrainer.egg-info/requires.txt +3 -1
  15. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/LICENSE +0 -0
  16. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/MANIFEST.in +0 -0
  17. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/setup.cfg +0 -0
  18. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/tests/__init__.py +0 -0
  19. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/tests/model/__init__.py +0 -0
  20. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/tests/model/catboost_kwargs_test.py +0 -0
  21. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/tests/trainer_test.py +0 -0
  22. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/calibrator/__init__.py +0 -0
  23. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/calibrator/calibrator.py +0 -0
  24. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/calibrator/calibrator_router.py +0 -0
  25. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  26. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/create.py +0 -0
  27. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/exceptions.py +0 -0
  28. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/fit.py +0 -0
  29. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/__init__.py +0 -0
  30. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/__init__.py +0 -0
  31. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
  32. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
  33. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/catboost_model.py +0 -0
  34. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
  35. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/model.py +0 -0
  36. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/tabpfn/__init__.py +0 -0
  37. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
  38. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/__init__.py +0 -0
  39. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/early_stopper.py +0 -0
  40. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
  41. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model_type.py +0 -0
  42. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/params.py +0 -0
  43. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/__init__.py +0 -0
  44. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  45. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/constant_reducer.py +0 -0
  46. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/correlation_reducer.py +0 -0
  47. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  48. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  49. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
  50. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/pca_reducer.py +0 -0
  51. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/reducer.py +0 -0
  52. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
  53. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  54. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/unseen_reducer.py +0 -0
  55. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/selector/__init__.py +0 -0
  56. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/selector/selector.py +0 -0
  57. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/__init__.py +0 -0
  58. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/class_weights.py +0 -0
  59. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/combined_weights.py +0 -0
  60. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/exponential_weights.py +0 -0
  61. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/linear_weights.py +0 -0
  62. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/noop_weights.py +0 -0
  63. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/sigmoid_weights.py +0 -0
  64. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/weights.py +0 -0
  65. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/weights_router.py +0 -0
  66. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/windower/__init__.py +0 -0
  67. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/windower/windower.py +0 -0
  68. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/dependency_links.txt +0 -0
  69. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/not-zip-safe +0 -0
  70. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/top_level.txt +0 -0
{wavetrainer-0.1.10/wavetrainer.egg-info → wavetrainer-0.1.12}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: wavetrainer
- Version: 0.1.10
+ Version: 0.1.12
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
  Home-page: https://github.com/8W9aG/wavetrainer
  Author: Will Sackfield
@@ -29,6 +29,8 @@ Requires-Dist: jax>=0.6.1
  Requires-Dist: tabpfn_extensions>=0.0.4
  Requires-Dist: hyperopt>=0.2.7
  Requires-Dist: pycaleva>=0.8.2
+ Requires-Dist: lightgbm>=4.6.0
+ Requires-Dist: kaleido>=0.2.1
 
  # wavetrainer
 
@@ -64,6 +66,8 @@ Python 3.11.6:
  - [tabpfn_extensions](https://github.com/PriorLabs/tabpfn-extensions)
  - [hyperopt](https://github.com/hyperopt/hyperopt)
  - [pycaleva](https://github.com/MartinWeigl/pycaleva)
+ - [lightgbm](https://github.com/microsoft/LightGBM)
+ - [kaleido](https://github.com/plotly/Kaleido)
 
  ## Raison D'être :thought_balloon:
 
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/README.md
@@ -32,6 +32,8 @@ Python 3.11.6:
  - [tabpfn_extensions](https://github.com/PriorLabs/tabpfn-extensions)
  - [hyperopt](https://github.com/hyperopt/hyperopt)
  - [pycaleva](https://github.com/MartinWeigl/pycaleva)
+ - [lightgbm](https://github.com/microsoft/LightGBM)
+ - [kaleido](https://github.com/plotly/Kaleido)
 
  ## Raison D'être :thought_balloon:
 
wavetrainer-0.1.10/wavetrainer.egg-info/requires.txt → wavetrainer-0.1.12/requirements.txt
@@ -16,3 +16,5 @@ jax>=0.6.1
  tabpfn_extensions>=0.0.4
  hyperopt>=0.2.7
  pycaleva>=0.8.2
+ lightgbm>=4.6.0
+ kaleido>=0.2.1
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/setup.py
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
  setup(
      name='wavetrainer',
-     version='0.1.10',
+     version='0.1.12',
      description='A library for automatically finding the optimal model within feature and hyperparameter space.',
      long_description=long_description,
      long_description_content_type='text/markdown',
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/__init__.py
@@ -2,5 +2,5 @@
 
  from .create import create
 
- __VERSION__ = "0.1.10"
+ __VERSION__ = "0.1.12"
  __all__ = ("create",)
wavetrainer-0.1.12/wavetrainer/model/lightgbm/__init__.py
@@ -0,0 +1 @@
+ """The wavetrain lightgbm model module."""
wavetrainer-0.1.12/wavetrainer/model/lightgbm/lightgbm_model.py
@@ -0,0 +1,245 @@
+ """A model that wraps lightgbm."""
+
+ # pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments,too-many-instance-attributes
+ import json
+ import os
+ from typing import Self
+
+ import joblib  # type: ignore
+ import lightgbm as lgb
+ import optuna
+ import pandas as pd
+ import torch
+
+ from ...exceptions import WavetrainException
+ from ...model_type import ModelType, determine_model_type
+ from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
+
+ _BOOSTING_TYPE_KEY = "gbm_boosting_type"
+ _NUM_LEAVES_KEY = "gbm_num_leaves"
+ _MIN_CHILD_SAMPLES_KEY = "gbm_min_child_samples"
+ _MODEL_PARAMS_FILENAME = "model_params.json"
+ _MODEL_FILENAME = "model.pkl"
+ _BEST_ITERATION_KEY = "best_iteration"
+ _EARLY_STOPPING_ROUNDS_KEY = "gbm_early_stopping_rounds"
+ _ITERATIONS_KEY = "gbm_iterations"
+
+
+ class LightGBMModel(Model):
+     """A class that uses lightgbm as a model."""
+
+     _gbm: lgb.LGBMModel | None
+     _boosting_type: str | None
+     _num_leaves: int | None
+     _min_child_samples: int | None
+     _model_type: None | ModelType
+     _best_iteration: None | int
+     _early_stopping_rounds: None | int
+     _iterations: None | int
+
+     @classmethod
+     def name(cls) -> str:
+         return "lightgbm"
+
+     @classmethod
+     def supports_x(cls, df: pd.DataFrame) -> bool:
+         return True
+
+     def __init__(self) -> None:
+         super().__init__()
+         self._gbm = None
+         self._boosting_type = None
+         self._num_leaves = None
+         self._min_child_samples = None
+         self._model_type = None
+         self._best_iteration = None
+         self._early_stopping_rounds = None
+         self._iterations = None
+
+     @property
+     def supports_importances(self) -> bool:
+         return True
+
+     @property
+     def feature_importances(self) -> dict[str, float]:
+         gbm = self._provide_gbm()
+         importances = gbm.feature_importances_
+         names = gbm.feature_name_
+         total_importances = sum(importances)
+         return {
+             names[count]: importance / total_importances
+             for count, importance in enumerate(importances)
+         }
+
+     def provide_estimator(self):
+         return self._provide_gbm()
+
+     def create_estimator(self):
+         return self._create_gbm()
+
+     def reset(self):
+         self._gbm = None
+         self._best_iteration = None
+
+     def convert_df(self, df: pd.DataFrame) -> pd.DataFrame:
+         return df
+
+     def set_options(
+         self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
+     ) -> None:
+         self._boosting_type = trial.suggest_categorical(
+             _BOOSTING_TYPE_KEY, ["gbdt", "dart", "rf"]
+         )
+         self._num_leaves = trial.suggest_int(_NUM_LEAVES_KEY, 2, 256)
+         self._min_child_samples = trial.suggest_int(_MIN_CHILD_SAMPLES_KEY, 5, 100)
+         self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
+         self._early_stopping_rounds = trial.suggest_int(
+             _EARLY_STOPPING_ROUNDS_KEY, 10, 500
+         )
+         self._iterations = trial.suggest_int(_ITERATIONS_KEY, 100, 10000)
+
+     def load(self, folder: str) -> None:
+         with open(
+             os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
+         ) as handle:
+             params = json.load(handle)
+             self._boosting_type = params[_BOOSTING_TYPE_KEY]
+             self._num_leaves = params[_NUM_LEAVES_KEY]
+             self._min_child_samples = params[_MIN_CHILD_SAMPLES_KEY]
+             self._best_iteration = params.get(_BEST_ITERATION_KEY)
+             self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS_KEY]
+             self._iterations = params[_ITERATIONS_KEY]
+         self._gbm = joblib.load(os.path.join(folder, _MODEL_FILENAME))
+
+     def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
+         with open(
+             os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
+         ) as handle:
+             json.dump(
+                 {
+                     _BOOSTING_TYPE_KEY: self._boosting_type,
+                     _NUM_LEAVES_KEY: self._num_leaves,
+                     _MIN_CHILD_SAMPLES_KEY: self._min_child_samples,
+                     _BEST_ITERATION_KEY: self._best_iteration,
+                     _EARLY_STOPPING_ROUNDS_KEY: self._early_stopping_rounds,
+                     _ITERATIONS_KEY: self._iterations,
+                 },
+                 handle,
+             )
+         gbm = self._provide_gbm()
+         joblib.dump(gbm, os.path.join(folder, _MODEL_FILENAME))
+         trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
+
+     def fit(
+         self,
+         df: pd.DataFrame,
+         y: pd.Series | pd.DataFrame | None = None,
+         w: pd.Series | None = None,
+         eval_x: pd.DataFrame | None = None,
+         eval_y: pd.Series | pd.DataFrame | None = None,
+     ) -> Self:
+         if y is None:
+             raise ValueError("y is null.")
+         self._model_type = determine_model_type(y)
+         gbm = self._provide_gbm()
+         early_stopping_rounds = self._early_stopping_rounds
+         if early_stopping_rounds is None:
+             raise ValueError("early_stopping_rounds is null")
+
+         eval_set = None
+         callbacks = []
+         if eval_x is not None and eval_y is not None:
+             eval_set = [(eval_x, eval_y.to_numpy().flatten())]  # type: ignore
+             callbacks = [
+                 lgb.early_stopping(stopping_rounds=early_stopping_rounds),
+             ]
+         if self._best_iteration is not None:
+             eval_set = None
+             callbacks = []
+         try:
+             gbm.fit(
+                 X=df,
+                 y=y.to_numpy().flatten(),
+                 sample_weight=w,
+                 eval_set=eval_set,  # type: ignore
+                 callbacks=callbacks,  # type: ignore
+             )
+         except lgb.basic.LightGBMError as exc:
+             raise WavetrainException() from exc
+         self._best_iteration = gbm.best_iteration_
+         return self
+
+     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+         gbm = self._provide_gbm()
+         pred = gbm.predict(df)
+         pred_df = pd.DataFrame(
+             index=df.index,
+             data={
+                 PREDICTION_COLUMN: pred.flatten(),  # type: ignore
+             },
+         )
+         if self._model_type != ModelType.REGRESSION:
+             proba = gbm.predict_proba(df)  # type: ignore
+             for i in range(proba.shape[1]):
+                 pred_df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
+         return pred_df
+
+     def _provide_gbm(self) -> lgb.LGBMModel:
+         gbm = self._gbm
+         if gbm is None:
+             gbm = self._create_gbm()
+             self._gbm = gbm
+         if gbm is None:
+             raise ValueError("gbm is null")
+         return gbm
+
+     def _create_gbm(self) -> lgb.LGBMModel:
+         best_iteration = self._best_iteration
+         iterations = best_iteration if best_iteration is not None else self._iterations
+         boosting_type = self._boosting_type
+         if boosting_type is None:
+             raise ValueError("boosting_type is null")
+         num_leaves = self._num_leaves
+         if num_leaves is None:
+             raise ValueError("num_leaves is null")
+         min_child_samples = self._min_child_samples
+         if min_child_samples is None:
+             raise ValueError("min_child_samples is null")
+
+         match self._model_type:
+             case ModelType.BINARY:
+                 return lgb.LGBMClassifier(
+                     boosting_type=boosting_type,
+                     num_leaves=num_leaves,
+                     objective="binary",
+                     min_child_samples=min_child_samples,
+                     num_iterations=iterations,
+                     device="gpu" if torch.cuda.is_available() else None,
+                 )
+             case ModelType.REGRESSION:
+                 return lgb.LGBMRegressor(
+                     boosting_type=boosting_type,
+                     num_leaves=num_leaves,
+                     min_child_samples=min_child_samples,
+                     num_iterations=iterations,
+                     device="gpu" if torch.cuda.is_available() else None,
+                 )
+             case ModelType.BINNED_BINARY:
+                 return lgb.LGBMClassifier(
+                     boosting_type=boosting_type,
+                     num_leaves=num_leaves,
+                     objective="binary",
+                     min_child_samples=min_child_samples,
+                     num_iterations=iterations,
+                     device="gpu" if torch.cuda.is_available() else None,
+                 )
+             case ModelType.MULTI_CLASSIFICATION:
+                 return lgb.LGBMClassifier(
+                     boosting_type=boosting_type,
+                     num_leaves=num_leaves,
+                     min_child_samples=min_child_samples,
+                     num_iterations=iterations,
+                     device="gpu" if torch.cuda.is_available() else None,
+                 )
+             case _:
+                 raise ValueError(f"Unrecognised model type: {self._model_type}")
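The new wrapper follows the same Optuna-driven pattern as the existing CatBoost and XGBoost models: set_options samples the gbm_* hyperparameters from a trial, _create_gbm builds the estimator, and LightGBM errors are surfaced as WavetrainException. Below is a minimal sketch of that loop outside wavetrainer; the toy data and study setup are illustrative assumptions, not part of the package:

    import lightgbm as lgb
    import numpy as np
    import optuna
    import pandas as pd

    # Toy binary-classification frame (illustrative only).
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(500, 5)), columns=[f"f{i}" for i in range(5)])
    y = pd.Series((X["f0"] + rng.normal(size=500) > 0).astype(int))

    def objective(trial: optuna.Trial) -> float:
        # Mirrors LightGBMModel.set_options: same keys and search ranges.
        clf = lgb.LGBMClassifier(
            boosting_type=trial.suggest_categorical("gbm_boosting_type", ["gbdt", "dart", "rf"]),
            num_leaves=trial.suggest_int("gbm_num_leaves", 2, 256),
            min_child_samples=trial.suggest_int("gbm_min_child_samples", 5, 100),
            objective="binary",
        )
        try:
            clf.fit(X[:400], y[:400])
        except lgb.basic.LightGBMError:
            # e.g. "rf" boosting without bagging parameters; the wrapper
            # converts this into a WavetrainException and the trial scores badly.
            return 0.0
        return float(clf.score(X[400:], y[400:]))

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10)
    print(study.best_params)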
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/model_router.py
@@ -11,6 +11,7 @@ from sklearn.metrics import accuracy_score  # type: ignore
 
  from ..model_type import ModelType, determine_model_type
  from .catboost.catboost_model import CatboostModel
+ from .lightgbm.lightgbm_model import LightGBMModel
  from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
  from .tabpfn.tabpfn_model import TabPFNModel
  from .xgboost.xgboost_model import XGBoostModel
@@ -22,6 +23,7 @@ _MODELS = {
      CatboostModel.name(): CatboostModel,
      TabPFNModel.name(): TabPFNModel,
      XGBoostModel.name(): XGBoostModel,
+     LightGBMModel.name(): LightGBMModel,
  }
 
 
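Registering the new model is just the import plus one _MODELS entry, because the router dispatches by each class's name(). A stripped-down sketch of that registry pattern, with simplified stand-in classes rather than the real Model subclasses:

    # Simplified stand-ins; the real classes subclass wavetrainer's Model.
    class CatboostModel:
        @classmethod
        def name(cls) -> str:
            return "catboost"

    class LightGBMModel:
        @classmethod
        def name(cls) -> str:
            return "lightgbm"

    _MODELS = {
        CatboostModel.name(): CatboostModel,
        LightGBMModel.name(): LightGBMModel,
    }

    def create_model(name: str):
        # The real router also checks supports_x() against the dataframe.
        return _MODELS[name]()

    print(type(create_model("lightgbm")).__name__)  # LightGBMModel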
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/xgboost_model.py
@@ -53,7 +53,7 @@ def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
      output_df = input_df.copy()
      for col in input_df.select_dtypes(include=["category"]).columns:
          output_df[col] = output_df[col].cat.codes
-     return output_df
+     return output_df.replace([np.inf, -np.inf], np.nan)
 
 
  class XGBoostModel(Model):
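The one-line change sanitizes the frame after categorical codes are assigned: XGBoost handles NaN natively as a missing value, while raw infinities make it reject the input. A small self-contained demonstration of the same replace call, on a toy frame assumed for illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "cat": pd.Categorical(["a", "b", None]),
            "x": [1.0, np.inf, -np.inf],
        }
    )

    out = df.copy()
    for col in df.select_dtypes(include=["category"]).columns:
        out[col] = out[col].cat.codes  # missing category becomes -1

    # Same call as the patched _convert_categoricals: map +/-inf to NaN,
    # which XGBoost treats as an ordinary missing value.
    out = out.replace([np.inf, -np.inf], np.nan)
    print(out)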
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/combined_reducer.py
@@ -2,7 +2,6 @@
 
  # pylint: disable=line-too-long
  import json
- import logging
  import os
  import time
  from typing import Self
@@ -129,6 +128,6 @@ class CombinedReducer(Reducer):
              try:
                  df = reducer.transform(df)
              except ValueError as exc:
-                 logging.warning("Failed to reduce %s", reducer.name())
+                 print("Failed to reduce %s", reducer.name())
                  raise exc
          return df
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/trainer.py
@@ -1,5 +1,6 @@
  """The trainer class."""
 
+ # pylint: disable=line-too-long
  import datetime
  import functools
  import json
@@ -12,12 +13,14 @@ from typing import Self
  import optuna
  import pandas as pd
  import tqdm
- from sklearn.metrics import f1_score, r2_score  # type: ignore
+ from sklearn.metrics import f1_score  # type: ignore
+ from sklearn.metrics import (accuracy_score, brier_score_loss, log_loss,
+                              precision_score, r2_score, recall_score)
 
  from .calibrator.calibrator_router import CalibratorRouter
  from .exceptions import WavetrainException
  from .fit import Fit
- from .model.model import PREDICTION_COLUMN
+ from .model.model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX
  from .model.model_router import ModelRouter
  from .model_type import ModelType, determine_model_type
  from .reducer.combined_reducer import CombinedReducer
@@ -48,6 +51,11 @@ def _assign_bin(timestamp, bins: list[datetime.datetime]) -> int:
      return len(bins) - 2  # Assign to last bin if at the end
 
 
+ def _best_trial(study: optuna.Study) -> optuna.trial.FrozenTrial:
+     best_brier = min(study.best_trials, key=lambda t: t.values[1])
+     return best_brier
+
+
  class Trainer(Fit):
      """A class for training and predicting from an array of data."""
 
@@ -170,7 +178,10 @@
              storage=storage_name,
              load_if_exists=True,
              sampler=restored_sampler,
-             direction=optuna.study.StudyDirection.MAXIMIZE,
+             directions=[
+                 optuna.study.StudyDirection.MAXIMIZE,
+                 optuna.study.StudyDirection.MINIMIZE,
+             ],
          )
 
      def fit(
@@ -210,7 +221,7 @@
              save: bool,
              split_idx: datetime.datetime,
              no_evaluation: bool,
-         ) -> float:
+         ) -> tuple[float, float]:
              print(f"Beginning trial for: {split_idx.isoformat()}")
              trial.set_user_attr(_IDX_USR_ATTR_KEY, split_idx.isoformat())
              folder = os.path.join(
@@ -246,7 +257,7 @@
                      if new_folder:
                          os.removedirs(folder)
                      logging.warning("Y train only contains 1 unique datapoint.")
-                     return _BAD_OUTPUT
+                     return _BAD_OUTPUT, -_BAD_OUTPUT
                  print(f"Windowing took {time.time() - start_windower}")
 
                  # Perform common reductions
@@ -311,10 +322,29 @@
                  )
                  cal_pred[PREDICTION_COLUMN] = y_pred[PREDICTION_COLUMN]
                  output = 0.0
+                 loss = 0.0
                  if determine_model_type(y_series) == ModelType.REGRESSION:
                      output = float(r2_score(y_test, y_pred[[PREDICTION_COLUMN]]))
+                     print(f"R2: {output}")
                  else:
                      output = float(f1_score(y_test, y_pred[[PREDICTION_COLUMN]]))
+                     print(f"F1: {output}")
+                     prob_col = PROBABILITY_COLUMN_PREFIX + str(1)
+                     if prob_col in y_pred.columns.values.tolist():
+                         loss = float(brier_score_loss(y_test, y_pred[[prob_col]]))
+                         print(f"Brier: {loss}")
+                         print(
+                             f"Log Loss: {float(log_loss(y_test.astype(float), y_pred[[prob_col]]))}"
+                         )
+                     print(
+                         f"Accuracy: {float(accuracy_score(y_test, y_pred[[PREDICTION_COLUMN]]))}"
+                     )
+                     print(
+                         f"Precision: {float(precision_score(y_test, y_pred[[PREDICTION_COLUMN]]))}"
+                     )
+                     print(
+                         f"Recall: {float(recall_score(y_test, y_pred[[PREDICTION_COLUMN]]))}"
+                     )
 
                  if save:
                      windower.save(folder, trial)
@@ -332,13 +362,13 @@
                          handle,
                      )
 
-                 return output
+                 return output, loss
              except WavetrainException as exc:
                  print(str(exc))
                  logging.warning(str(exc))
                  if new_folder:
                      os.removedirs(folder)
-                 return _BAD_OUTPUT
+                 return _BAD_OUTPUT, -_BAD_OUTPUT
 
          start_validation_index = (
              dt_index.to_list()[-int(len(dt_index) * self._validation_size) - 1]
@@ -359,7 +389,7 @@
              ].to_list()[0]
          )
 
-         def test_objective(trial: optuna.Trial) -> float:
+         def test_objective(trial: optuna.Trial) -> tuple[float, float]:
              return _fit(
                  trial,
                  test_df,
@@ -382,7 +412,8 @@
              else self._max_train_timeout.total_seconds(),
          )
          while (
-             study.best_trial.value is None or study.best_trial.value == _BAD_OUTPUT
+             _best_trial(study).values is None
+             or _best_trial(study).values == (_BAD_OUTPUT, -_BAD_OUTPUT)
          ) and len(study.trials) < 1000:
              logging.info("Performing extra train")
              study.optimize(
@@ -420,7 +451,7 @@
              if found:
                  last_processed_dt = test_dt
                  _fit(
-                     study.best_trial,
+                     _best_trial(study),
                      test_df.copy(),
                      test_series,
                      True,
@@ -441,7 +472,7 @@
 
          def validate_objctive(
              trial: optuna.Trial, idx: datetime.datetime, series: pd.Series
-         ) -> float:
+         ) -> tuple[float, float]:
              return _fit(trial, test_df.copy(), series, False, idx, False)
 
          study.optimize(
@@ -457,10 +488,36 @@
                  break
 
              _fit(
-                 study.best_trial, test_df.copy(), test_series, True, test_idx, True
+                 _best_trial(study),
+                 test_df.copy(),
+                 test_series,
+                 True,
+                 test_idx,
+                 True,
              )
              last_processed_dt = test_idx
 
+             target_names = ["F1", "Brier"]
+             fig = optuna.visualization.plot_pareto_front(
+                 study, target_names=target_names
+             )
+             fig.write_image(
+                 os.path.join(column_dir, "pareto_frontier.png"),
+                 format="png",
+                 width=800,
+                 height=600,
+             )
+             for target_name in target_names:
+                 fig = optuna.visualization.plot_param_importances(
+                     study, target=lambda t: t.values[0], target_name=target_name
+                 )
+                 fig.write_image(
+                     os.path.join(column_dir, f"{target_name}_frontier.png"),
+                     format="png",
+                     width=800,
+                     height=600,
+                 )
+
          if isinstance(y, pd.Series):
              _fit_column(y)
          else:
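With directions=[MAXIMIZE, MINIMIZE] the study becomes multi-objective (F1 up, Brier loss down), so study.best_trial is no longer defined and the new _best_trial helper picks the lowest-Brier trial off the Pareto front. A minimal sketch of the same selection and kaleido-backed export, using a stand-in objective rather than wavetrainer's _fit:

    import optuna

    def objective(trial: optuna.Trial) -> tuple[float, float]:
        x = trial.suggest_float("x", 0.0, 1.0)
        # Stand-in for (F1 to maximize, Brier loss to minimize).
        return x, (x - 0.5) ** 2

    study = optuna.create_study(
        directions=[
            optuna.study.StudyDirection.MAXIMIZE,
            optuna.study.StudyDirection.MINIMIZE,
        ]
    )
    study.optimize(objective, n_trials=30)

    # Same rule as _best_trial: among the Pareto-optimal trials,
    # take the minimum of the second objective.
    best = min(study.best_trials, key=lambda t: t.values[1])
    print(best.params, best.values)

    # plot_pareto_front returns a plotly figure; write_image needs kaleido,
    # which is why 0.1.12 adds it as a dependency.
    fig = optuna.visualization.plot_pareto_front(study, target_names=["F1", "Brier"])
    fig.write_image("pareto_frontier.png", format="png", width=800, height=600)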
{wavetrainer-0.1.10 → wavetrainer-0.1.12/wavetrainer.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: wavetrainer
- Version: 0.1.10
+ Version: 0.1.12
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
  Home-page: https://github.com/8W9aG/wavetrainer
  Author: Will Sackfield
@@ -29,6 +29,8 @@ Requires-Dist: jax>=0.6.1
  Requires-Dist: tabpfn_extensions>=0.0.4
  Requires-Dist: hyperopt>=0.2.7
  Requires-Dist: pycaleva>=0.8.2
+ Requires-Dist: lightgbm>=4.6.0
+ Requires-Dist: kaleido>=0.2.1
 
  # wavetrainer
 
@@ -64,6 +66,8 @@ Python 3.11.6:
  - [tabpfn_extensions](https://github.com/PriorLabs/tabpfn-extensions)
  - [hyperopt](https://github.com/hyperopt/hyperopt)
  - [pycaleva](https://github.com/MartinWeigl/pycaleva)
+ - [lightgbm](https://github.com/microsoft/LightGBM)
+ - [kaleido](https://github.com/plotly/Kaleido)
 
  ## Raison D'être :thought_balloon:
 
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/SOURCES.txt
@@ -32,6 +32,8 @@ wavetrainer/model/catboost/catboost_classifier_wrap.py
  wavetrainer/model/catboost/catboost_kwargs.py
  wavetrainer/model/catboost/catboost_model.py
  wavetrainer/model/catboost/catboost_regressor_wrap.py
+ wavetrainer/model/lightgbm/__init__.py
+ wavetrainer/model/lightgbm/lightgbm_model.py
  wavetrainer/model/tabpfn/__init__.py
  wavetrainer/model/tabpfn/tabpfn_model.py
  wavetrainer/model/xgboost/__init__.py
wavetrainer-0.1.10/requirements.txt → wavetrainer-0.1.12/wavetrainer.egg-info/requires.txt
@@ -15,4 +15,6 @@ xgboost>=3.0.0
  jax>=0.6.1
  tabpfn_extensions>=0.0.4
  hyperopt>=0.2.7
- pycaleva>=0.8.2
+ pycaleva>=0.8.2
+ lightgbm>=4.6.0
+ kaleido>=0.2.1