wavetrainer 0.0.3__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.3/wavetrainer.egg-info → wavetrainer-0.0.5}/PKG-INFO +1 -1
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/setup.py +1 -1
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/tests/trainer_test.py +1 -1
- wavetrainer-0.0.5/wavetrainer/__init__.py +6 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/mapie_calibrator.py +13 -8
- wavetrainer-0.0.5/wavetrainer/model/catboost_model.py +187 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/model/model.py +6 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/model/model_router.py +7 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/model_type.py +6 -6
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/combined_reducer.py +10 -2
- wavetrainer-0.0.5/wavetrainer/reducer/nonnumeric_reducer.py +39 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/selector/selector.py +19 -13
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/trainer.py +15 -7
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/class_weights.py +1 -1
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/windower/windower.py +5 -2
- {wavetrainer-0.0.3 → wavetrainer-0.0.5/wavetrainer.egg-info}/PKG-INFO +1 -1
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/SOURCES.txt +1 -1
- wavetrainer-0.0.3/wavetrainer/__init__.py +0 -10
- wavetrainer-0.0.3/wavetrainer/load.py +0 -8
- wavetrainer-0.0.3/wavetrainer/model/catboost_model.py +0 -80
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/LICENSE +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/MANIFEST.in +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/README.md +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/requirements.txt +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/setup.cfg +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/tests/__init__.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/correlation_reducer.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/requires.txt +0 -0
- {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
|
|
23
23
|
|
24
24
|
setup(
|
25
25
|
name='wavetrainer',
|
26
|
-
version='0.0.
|
26
|
+
version='0.0.5',
|
27
27
|
description='A library for automatically finding the optimal model within feature and hyperparameter space.',
|
28
28
|
long_description=long_description,
|
29
29
|
long_description_content_type='text/markdown',
|
@@ -13,7 +13,7 @@ class TestTrainer(unittest.TestCase):
|
|
13
13
|
|
14
14
|
def test_trainer(self):
|
15
15
|
with tempfile.TemporaryDirectory() as tmpdir:
|
16
|
-
trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=
|
16
|
+
trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=1)
|
17
17
|
x_data = [i for i in range(100)]
|
18
18
|
x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
|
19
19
|
df = pd.DataFrame(
|
@@ -49,12 +49,17 @@ class MAPIECalibrator(Calibrator):
|
|
49
49
|
return self
|
50
50
|
|
51
51
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
52
|
-
alpha = [
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
)
|
52
|
+
alpha = []
|
53
|
+
for potential_alpha in [0.05, 0.32]:
|
54
|
+
if len(df) > int(1.0 / potential_alpha):
|
55
|
+
alpha.append(potential_alpha)
|
56
|
+
if alpha:
|
57
|
+
_, y_pis = self._mapie.predict(df, alpha=alpha)
|
58
|
+
for i in range(y_pis.shape[1]):
|
59
|
+
if i >= len(alpha):
|
60
|
+
continue
|
61
|
+
for ii in range(y_pis.shape[2]):
|
62
|
+
alpha_val = alpha[i]
|
63
|
+
values = y_pis[:, i, ii].flatten().tolist()
|
64
|
+
df[f"{PROBABILITY_COLUMN_PREFIX}{alpha_val}_{ii == 1}"] = values
|
60
65
|
return df
|
@@ -0,0 +1,187 @@
|
|
1
|
+
"""A model that wraps catboost."""
|
2
|
+
|
3
|
+
import json
|
4
|
+
import os
|
5
|
+
from typing import Any, Self
|
6
|
+
|
7
|
+
import optuna
|
8
|
+
import pandas as pd
|
9
|
+
from catboost import CatBoostClassifier # type: ignore
|
10
|
+
from catboost import CatBoost, CatBoostRegressor, Pool
|
11
|
+
|
12
|
+
from ..model_type import ModelType, determine_model_type
|
13
|
+
from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
14
|
+
|
15
|
+
_MODEL_FILENAME = "model.cbm"
|
16
|
+
_MODEL_PARAMS_FILENAME = "model_params.json"
|
17
|
+
_ITERATIONS_KEY = "iterations"
|
18
|
+
_LEARNING_RATE_KEY = "learning_rate"
|
19
|
+
_DEPTH_KEY = "depth"
|
20
|
+
_L2_LEAF_REG_KEY = "l2_leaf_reg"
|
21
|
+
_BOOSTING_TYPE_KEY = "boosting_type"
|
22
|
+
_MODEL_TYPE_KEY = "model_type"
|
23
|
+
|
24
|
+
|
25
|
+
class CatboostModel(Model):
|
26
|
+
"""A class that uses Catboost as a model."""
|
27
|
+
|
28
|
+
_catboost: CatBoost | None
|
29
|
+
_iterations: None | int
|
30
|
+
_learning_rate: None | float
|
31
|
+
_depth: None | int
|
32
|
+
_l2_leaf_reg: None | float
|
33
|
+
_boosting_type: None | str
|
34
|
+
_model_type: None | ModelType
|
35
|
+
|
36
|
+
@classmethod
|
37
|
+
def name(cls) -> str:
|
38
|
+
return "catboost"
|
39
|
+
|
40
|
+
def __init__(self) -> None:
|
41
|
+
super().__init__()
|
42
|
+
self._catboost = None
|
43
|
+
self._iterations = None
|
44
|
+
self._learning_rate = None
|
45
|
+
self._depth = None
|
46
|
+
self._l2_leaf_reg = None
|
47
|
+
self._boosting_type = None
|
48
|
+
self._model_type = None
|
49
|
+
|
50
|
+
@property
|
51
|
+
def estimator(self) -> Any:
|
52
|
+
return self._provide_catboost()
|
53
|
+
|
54
|
+
def pre_fit(self, y: pd.Series | pd.DataFrame | None):
|
55
|
+
if y is None:
|
56
|
+
raise ValueError("y is null.")
|
57
|
+
self._model_type = determine_model_type(y)
|
58
|
+
|
59
|
+
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
60
|
+
self._iterations = trial.suggest_int(_ITERATIONS_KEY, 100, 10000)
|
61
|
+
self._learning_rate = trial.suggest_float(_LEARNING_RATE_KEY, 0.001, 0.3)
|
62
|
+
self._depth = trial.suggest_int(_DEPTH_KEY, 1, 12)
|
63
|
+
self._l2_leaf_reg = trial.suggest_float(_L2_LEAF_REG_KEY, 3.0, 50.0)
|
64
|
+
self._boosting_type = trial.suggest_categorical(
|
65
|
+
_BOOSTING_TYPE_KEY, ["Ordered", "Plain"]
|
66
|
+
)
|
67
|
+
|
68
|
+
def load(self, folder: str) -> None:
|
69
|
+
with open(
|
70
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
|
71
|
+
) as handle:
|
72
|
+
params = json.load(handle)
|
73
|
+
self._iterations = params[_ITERATIONS_KEY]
|
74
|
+
self._learning_rate = params[_LEARNING_RATE_KEY]
|
75
|
+
self._depth = params[_DEPTH_KEY]
|
76
|
+
self._l2_leaf_reg = params[_L2_LEAF_REG_KEY]
|
77
|
+
self._boosting_type = params[_BOOSTING_TYPE_KEY]
|
78
|
+
self._model_type = ModelType(params[_MODEL_TYPE_KEY])
|
79
|
+
catboost = self._provide_catboost()
|
80
|
+
catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
|
81
|
+
|
82
|
+
def save(self, folder: str) -> None:
|
83
|
+
with open(
|
84
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
|
85
|
+
) as handle:
|
86
|
+
json.dump(
|
87
|
+
{
|
88
|
+
_ITERATIONS_KEY: self._iterations,
|
89
|
+
_LEARNING_RATE_KEY: self._learning_rate,
|
90
|
+
_DEPTH_KEY: self._depth,
|
91
|
+
_L2_LEAF_REG_KEY: self._l2_leaf_reg,
|
92
|
+
_BOOSTING_TYPE_KEY: self._boosting_type,
|
93
|
+
_MODEL_TYPE_KEY: str(self._model_type),
|
94
|
+
},
|
95
|
+
handle,
|
96
|
+
)
|
97
|
+
catboost = self._provide_catboost()
|
98
|
+
catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
|
99
|
+
|
100
|
+
def fit(
|
101
|
+
self,
|
102
|
+
df: pd.DataFrame,
|
103
|
+
y: pd.Series | pd.DataFrame | None = None,
|
104
|
+
w: pd.Series | None = None,
|
105
|
+
) -> Self:
|
106
|
+
if y is None:
|
107
|
+
raise ValueError("y is null.")
|
108
|
+
self._model_type = determine_model_type(y)
|
109
|
+
catboost = self._provide_catboost()
|
110
|
+
|
111
|
+
train_pool = Pool(
|
112
|
+
df,
|
113
|
+
label=y,
|
114
|
+
weight=w,
|
115
|
+
)
|
116
|
+
catboost.fit(
|
117
|
+
train_pool,
|
118
|
+
early_stopping_rounds=100,
|
119
|
+
verbose=False,
|
120
|
+
metric_period=100,
|
121
|
+
)
|
122
|
+
return self
|
123
|
+
|
124
|
+
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
125
|
+
pred_pool = Pool(df)
|
126
|
+
catboost = self._provide_catboost()
|
127
|
+
pred = catboost.predict(pred_pool)
|
128
|
+
df = pd.DataFrame(
|
129
|
+
index=df.index,
|
130
|
+
data={
|
131
|
+
PREDICTION_COLUMN: pred.flatten(),
|
132
|
+
},
|
133
|
+
)
|
134
|
+
if self._model_type != ModelType.REGRESSION:
|
135
|
+
proba = catboost.predict_proba(pred_pool) # type: ignore
|
136
|
+
for i in range(proba.shape[1]):
|
137
|
+
df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
|
138
|
+
return df
|
139
|
+
|
140
|
+
def _provide_catboost(self) -> CatBoost:
|
141
|
+
catboost = self._catboost
|
142
|
+
if catboost is None:
|
143
|
+
match self._model_type:
|
144
|
+
case ModelType.BINARY:
|
145
|
+
catboost = CatBoostClassifier(
|
146
|
+
iterations=self._iterations,
|
147
|
+
learning_rate=self._learning_rate,
|
148
|
+
depth=self._depth,
|
149
|
+
l2_leaf_reg=self._l2_leaf_reg,
|
150
|
+
boosting_type=self._boosting_type,
|
151
|
+
early_stopping_rounds=100,
|
152
|
+
metric_period=100,
|
153
|
+
)
|
154
|
+
case ModelType.REGRESSION:
|
155
|
+
catboost = CatBoostRegressor(
|
156
|
+
iterations=self._iterations,
|
157
|
+
learning_rate=self._learning_rate,
|
158
|
+
depth=self._depth,
|
159
|
+
l2_leaf_reg=self._l2_leaf_reg,
|
160
|
+
boosting_type=self._boosting_type,
|
161
|
+
early_stopping_rounds=100,
|
162
|
+
metric_period=100,
|
163
|
+
)
|
164
|
+
case ModelType.BINNED_BINARY:
|
165
|
+
catboost = CatBoostClassifier(
|
166
|
+
iterations=self._iterations,
|
167
|
+
learning_rate=self._learning_rate,
|
168
|
+
depth=self._depth,
|
169
|
+
l2_leaf_reg=self._l2_leaf_reg,
|
170
|
+
boosting_type=self._boosting_type,
|
171
|
+
early_stopping_rounds=100,
|
172
|
+
metric_period=100,
|
173
|
+
)
|
174
|
+
case ModelType.MULTI_CLASSIFICATION:
|
175
|
+
catboost = CatBoostClassifier(
|
176
|
+
iterations=self._iterations,
|
177
|
+
learning_rate=self._learning_rate,
|
178
|
+
depth=self._depth,
|
179
|
+
l2_leaf_reg=self._l2_leaf_reg,
|
180
|
+
boosting_type=self._boosting_type,
|
181
|
+
early_stopping_rounds=100,
|
182
|
+
metric_period=100,
|
183
|
+
)
|
184
|
+
self._catboost = catboost
|
185
|
+
if catboost is None:
|
186
|
+
raise ValueError("catboost is null")
|
187
|
+
return catboost
|
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
from typing import Any
|
4
4
|
|
5
|
+
import pandas as pd
|
6
|
+
|
5
7
|
from ..fit import Fit
|
6
8
|
from ..params import Params
|
7
9
|
|
@@ -21,3 +23,7 @@ class Model(Params, Fit):
|
|
21
23
|
def estimator(self) -> Any:
|
22
24
|
"""The estimator backing the model."""
|
23
25
|
raise NotImplementedError("estimator not implemented in parent class.")
|
26
|
+
|
27
|
+
def pre_fit(self, y: pd.Series | pd.DataFrame | None) -> None:
|
28
|
+
"""A call to make sure the model is prepared for the target type."""
|
29
|
+
raise NotImplementedError("pre_fit not implemented in parent class.")
|
@@ -37,10 +37,17 @@ class ModelRouter(Model):
|
|
37
37
|
raise ValueError("model is null")
|
38
38
|
return model.estimator
|
39
39
|
|
40
|
+
def pre_fit(self, y: pd.Series | pd.DataFrame | None):
|
41
|
+
model = self._model
|
42
|
+
if model is None:
|
43
|
+
raise ValueError("model is null")
|
44
|
+
model.pre_fit(y)
|
45
|
+
|
40
46
|
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
41
47
|
self._model = _MODELS[
|
42
48
|
trial.suggest_categorical("model", list(_MODELS.keys()))
|
43
49
|
]()
|
50
|
+
self._model.set_options(trial)
|
44
51
|
|
45
52
|
def load(self, folder: str) -> None:
|
46
53
|
with open(os.path.join(folder, _MODEL_ROUTER_FILE), encoding="utf8") as handle:
|
@@ -1,17 +1,17 @@
|
|
1
1
|
"""An enum to define the model type."""
|
2
2
|
|
3
|
-
from enum import
|
3
|
+
from enum import StrEnum, auto
|
4
4
|
|
5
5
|
import pandas as pd
|
6
6
|
|
7
7
|
|
8
|
-
class ModelType(
|
8
|
+
class ModelType(StrEnum):
|
9
9
|
"""The type of model being run."""
|
10
10
|
|
11
|
-
BINARY =
|
12
|
-
REGRESSION =
|
13
|
-
BINNED_BINARY =
|
14
|
-
MULTI_CLASSIFICATION =
|
11
|
+
BINARY = auto()
|
12
|
+
REGRESSION = auto()
|
13
|
+
BINNED_BINARY = auto()
|
14
|
+
MULTI_CLASSIFICATION = auto()
|
15
15
|
|
16
16
|
|
17
17
|
def determine_model_type(y: pd.Series | pd.DataFrame) -> ModelType:
|
@@ -10,6 +10,7 @@ import pandas as pd
|
|
10
10
|
from .constant_reducer import ConstantReducer
|
11
11
|
from .correlation_reducer import CorrelationReducer
|
12
12
|
from .duplicate_reducer import DuplicateReducer
|
13
|
+
from .nonnumeric_reducer import NonNumericReducer
|
13
14
|
from .reducer import Reducer
|
14
15
|
|
15
16
|
_COMBINED_REDUCER_FILE = "combined_reducer.json"
|
@@ -21,7 +22,12 @@ class CombinedReducer(Reducer):
|
|
21
22
|
|
22
23
|
def __init__(self):
|
23
24
|
super().__init__()
|
24
|
-
self._reducers = [
|
25
|
+
self._reducers = [
|
26
|
+
ConstantReducer(),
|
27
|
+
DuplicateReducer(),
|
28
|
+
CorrelationReducer(),
|
29
|
+
NonNumericReducer(),
|
30
|
+
]
|
25
31
|
|
26
32
|
@classmethod
|
27
33
|
def name(cls) -> str:
|
@@ -44,6 +50,8 @@ class CombinedReducer(Reducer):
|
|
44
50
|
self._reducers.append(DuplicateReducer())
|
45
51
|
elif reducer_name == CorrelationReducer.name():
|
46
52
|
self._reducers.append(CorrelationReducer())
|
53
|
+
elif reducer_name == NonNumericReducer.name():
|
54
|
+
self._reducers.append(NonNumericReducer())
|
47
55
|
for reducer in self._reducers:
|
48
56
|
reducer.load(folder)
|
49
57
|
|
@@ -67,7 +75,7 @@ class CombinedReducer(Reducer):
|
|
67
75
|
w: pd.Series | None = None,
|
68
76
|
) -> Self:
|
69
77
|
for reducer in self._reducers:
|
70
|
-
reducer.
|
78
|
+
df = reducer.fit_transform(df)
|
71
79
|
return self
|
72
80
|
|
73
81
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
@@ -0,0 +1,39 @@
|
|
1
|
+
"""A reducer that removes non-numeric columns."""
|
2
|
+
|
3
|
+
from typing import Self
|
4
|
+
|
5
|
+
import optuna
|
6
|
+
import pandas as pd
|
7
|
+
|
8
|
+
from .reducer import Reducer
|
9
|
+
|
10
|
+
|
11
|
+
class NonNumericReducer(Reducer):
|
12
|
+
"""A class that removes non numeric columns from a dataframe."""
|
13
|
+
|
14
|
+
@classmethod
|
15
|
+
def name(cls) -> str:
|
16
|
+
return "nonnumeric"
|
17
|
+
|
18
|
+
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
19
|
+
pass
|
20
|
+
|
21
|
+
def load(self, folder: str) -> None:
|
22
|
+
pass
|
23
|
+
|
24
|
+
def save(self, folder: str) -> None:
|
25
|
+
pass
|
26
|
+
|
27
|
+
def fit(
|
28
|
+
self,
|
29
|
+
df: pd.DataFrame,
|
30
|
+
y: pd.Series | pd.DataFrame | None = None,
|
31
|
+
w: pd.Series | None = None,
|
32
|
+
) -> Self:
|
33
|
+
return self
|
34
|
+
|
35
|
+
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
36
|
+
categorical_cols = df.select_dtypes(include="category").columns.tolist()
|
37
|
+
numeric_cols = df.select_dtypes(include="number").columns.tolist()
|
38
|
+
keep_cols = categorical_cols + numeric_cols
|
39
|
+
return df[keep_cols]
|
@@ -19,24 +19,18 @@ _SELECTOR_FILE = "selector.joblib"
|
|
19
19
|
class Selector(Params, Fit):
|
20
20
|
"""The selector class."""
|
21
21
|
|
22
|
-
|
22
|
+
_selector: RFE | None
|
23
|
+
|
24
|
+
def __init__(self, model: Model):
|
23
25
|
super().__init__()
|
24
26
|
self._model = model
|
25
27
|
self._feature_ratio = 0.0
|
26
28
|
self._steps = 0
|
27
|
-
|
28
|
-
self._selector = RFE(
|
29
|
-
model.estimator,
|
30
|
-
n_features_to_select=n_features_to_select,
|
31
|
-
step=self._steps,
|
32
|
-
verbose=1,
|
33
|
-
)
|
29
|
+
self._selector = None
|
34
30
|
|
35
31
|
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
36
32
|
self._feature_ratio = trial.suggest_float("feature_ratio", 0.0, 1.0)
|
37
|
-
|
38
|
-
self._steps = steps
|
39
|
-
self._selector.step = steps
|
33
|
+
self._steps = trial.suggest_int("steps", 1, 16)
|
40
34
|
|
41
35
|
def load(self, folder: str) -> None:
|
42
36
|
self._selector = joblib.load(os.path.join(folder, _SELECTOR_FILE))
|
@@ -50,9 +44,18 @@ class Selector(Params, Fit):
|
|
50
44
|
y: pd.Series | pd.DataFrame | None = None,
|
51
45
|
w: pd.Series | None = None,
|
52
46
|
) -> Self:
|
47
|
+
self._model.pre_fit(y)
|
53
48
|
if not isinstance(y, pd.Series):
|
54
49
|
raise ValueError("y is not a series.")
|
55
|
-
|
50
|
+
n_features_to_select = max(1, int(len(df.columns) * self._feature_ratio))
|
51
|
+
self._selector = RFE(
|
52
|
+
self._model.estimator,
|
53
|
+
n_features_to_select=n_features_to_select,
|
54
|
+
step=max(
|
55
|
+
1,
|
56
|
+
int((len(df.columns) - n_features_to_select) / self._steps),
|
57
|
+
),
|
58
|
+
)
|
56
59
|
try:
|
57
60
|
self._selector.fit(df, y=y, sample_weight=w)
|
58
61
|
except ValueError as exc:
|
@@ -61,8 +64,11 @@ class Selector(Params, Fit):
|
|
61
64
|
return self
|
62
65
|
|
63
66
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
67
|
+
selector = self._selector
|
68
|
+
if selector is None:
|
69
|
+
raise ValueError("selector is null.")
|
64
70
|
try:
|
65
|
-
return df[
|
71
|
+
return df[selector.get_feature_names_out()]
|
66
72
|
except AttributeError as exc:
|
67
73
|
# Catch issues with 1 feature as a reduction target.
|
68
74
|
logging.warning(str(exc))
|
@@ -11,7 +11,7 @@ from typing import Self
|
|
11
11
|
import optuna
|
12
12
|
import pandas as pd
|
13
13
|
import tqdm
|
14
|
-
from sklearn.metrics import
|
14
|
+
from sklearn.metrics import f1_score, mean_absolute_error # type: ignore
|
15
15
|
|
16
16
|
from .calibrator.calibrator_router import CalibratorRouter
|
17
17
|
from .exceptions import WavetrainException
|
@@ -215,7 +215,7 @@ class Trainer(Fit):
|
|
215
215
|
model.set_options(trial)
|
216
216
|
|
217
217
|
# Train
|
218
|
-
selector = Selector(model
|
218
|
+
selector = Selector(model)
|
219
219
|
selector.set_options(trial)
|
220
220
|
selector.fit(x_train, y=y_train, w=w)
|
221
221
|
x_train = selector.transform(x_train)
|
@@ -243,7 +243,7 @@ class Trainer(Fit):
|
|
243
243
|
y_pred = model.transform(x_test)
|
244
244
|
y_pred = calibrator.transform(y_pred)
|
245
245
|
if determine_model_type(y_series) == ModelType.REGRESSION:
|
246
|
-
return
|
246
|
+
return mean_absolute_error(y_test, y_pred[[PREDICTION_COLUMN]])
|
247
247
|
return f1_score(y_test, y_pred[[PREDICTION_COLUMN]])
|
248
248
|
except WavetrainException as exc:
|
249
249
|
logging.warning(str(exc))
|
@@ -252,14 +252,16 @@ class Trainer(Fit):
|
|
252
252
|
start_validation_index = (
|
253
253
|
dt_index[-int(len(dt_index) * self._validation_size) - 1]
|
254
254
|
if isinstance(self._validation_size, float)
|
255
|
-
else dt_index[
|
255
|
+
else dt_index[
|
256
|
+
dt_index >= (dt_index.to_list()[-1] - self._validation_size)
|
257
|
+
].to_list()[0]
|
256
258
|
)
|
257
259
|
test_df = df[dt_index < start_validation_index]
|
258
260
|
test_dt_index = (
|
259
261
|
test_df.index if self._dt_column is None else test_df[self._dt_column]
|
260
262
|
)
|
261
263
|
start_test_index = (
|
262
|
-
test_dt_index[-int(len(test_dt_index) * self._test_size)]
|
264
|
+
test_dt_index.to_list()[-int(len(test_dt_index) * self._test_size)]
|
263
265
|
if isinstance(self._test_size, float)
|
264
266
|
else test_dt_index[test_dt_index >= self._test_size][0]
|
265
267
|
)
|
@@ -284,9 +286,15 @@ class Trainer(Fit):
|
|
284
286
|
train_len = len(df[dt_index < start_test_index])
|
285
287
|
test_len = len(df.loc[start_test_index:start_validation_index])
|
286
288
|
|
289
|
+
last_processed_dt = None
|
287
290
|
for count, test_idx in tqdm.tqdm(
|
288
|
-
enumerate(
|
291
|
+
enumerate(test_dt_index[test_dt_index >= start_test_index])
|
289
292
|
):
|
293
|
+
if (
|
294
|
+
last_processed_dt is not None
|
295
|
+
and test_idx < last_processed_dt + self._walkforward_timedelta
|
296
|
+
):
|
297
|
+
continue
|
290
298
|
test_dt = test_idx.to_pydatetime()
|
291
299
|
found = False
|
292
300
|
for trial in study.trials:
|
@@ -371,7 +379,7 @@ class Trainer(Fit):
|
|
371
379
|
model = ModelRouter()
|
372
380
|
model.load(folder)
|
373
381
|
|
374
|
-
selector = Selector(model
|
382
|
+
selector = Selector(model)
|
375
383
|
selector.load(folder)
|
376
384
|
|
377
385
|
calibrator = CalibratorRouter(model)
|
@@ -47,7 +47,7 @@ class ClassWeights(Weights):
|
|
47
47
|
self._class_weights = {}
|
48
48
|
return self
|
49
49
|
|
50
|
-
arr =
|
50
|
+
arr = y.astype(int).to_numpy().flatten().astype(float)
|
51
51
|
unique_vals = np.unique(arr)
|
52
52
|
w_arr = compute_class_weight(
|
53
53
|
class_weight="balanced", classes=unique_vals, y=arr
|
@@ -52,7 +52,9 @@ class Windower(Params, Fit):
|
|
52
52
|
lookback_ratio = self._lookback_ratio
|
53
53
|
if lookback_ratio is None:
|
54
54
|
raise ValueError("lookback_ratio is null")
|
55
|
-
dt_index =
|
55
|
+
dt_index = (
|
56
|
+
df.index if self._dt_column is None else df[self._dt_column].to_list()
|
57
|
+
)
|
56
58
|
start_idx = dt_index[int(len(df) * lookback_ratio)]
|
57
59
|
end_idx = dt_index[-1]
|
58
60
|
td = end_idx.to_pydatetime() - start_idx.to_pydatetime()
|
@@ -66,5 +68,6 @@ class Windower(Params, Fit):
|
|
66
68
|
dt_index = df.index if self._dt_column is None else df[self._dt_column]
|
67
69
|
return df[
|
68
70
|
dt_index
|
69
|
-
>= dt_index[-1].to_pydatetime()
|
71
|
+
>= dt_index.to_list()[-1].to_pydatetime()
|
72
|
+
- datetime.timedelta(seconds=lookback)
|
70
73
|
]
|
@@ -9,7 +9,6 @@ wavetrainer/__init__.py
|
|
9
9
|
wavetrainer/create.py
|
10
10
|
wavetrainer/exceptions.py
|
11
11
|
wavetrainer/fit.py
|
12
|
-
wavetrainer/load.py
|
13
12
|
wavetrainer/model_type.py
|
14
13
|
wavetrainer/params.py
|
15
14
|
wavetrainer/trainer.py
|
@@ -34,6 +33,7 @@ wavetrainer/reducer/combined_reducer.py
|
|
34
33
|
wavetrainer/reducer/constant_reducer.py
|
35
34
|
wavetrainer/reducer/correlation_reducer.py
|
36
35
|
wavetrainer/reducer/duplicate_reducer.py
|
36
|
+
wavetrainer/reducer/nonnumeric_reducer.py
|
37
37
|
wavetrainer/reducer/reducer.py
|
38
38
|
wavetrainer/selector/__init__.py
|
39
39
|
wavetrainer/selector/selector.py
|
@@ -1,80 +0,0 @@
|
|
1
|
-
"""A model that wraps catboost."""
|
2
|
-
|
3
|
-
import os
|
4
|
-
from typing import Any, Self
|
5
|
-
|
6
|
-
import optuna
|
7
|
-
import pandas as pd
|
8
|
-
from catboost import CatBoostClassifier, Pool # type: ignore
|
9
|
-
|
10
|
-
from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
11
|
-
|
12
|
-
_MODEL_FILENAME = "model.cbm"
|
13
|
-
|
14
|
-
|
15
|
-
class CatboostModel(Model):
|
16
|
-
"""A class that uses Catboost as a model."""
|
17
|
-
|
18
|
-
@classmethod
|
19
|
-
def name(cls) -> str:
|
20
|
-
return "catboost"
|
21
|
-
|
22
|
-
def __init__(self) -> None:
|
23
|
-
super().__init__()
|
24
|
-
self._catboost = CatBoostClassifier()
|
25
|
-
|
26
|
-
@property
|
27
|
-
def estimator(self) -> Any:
|
28
|
-
return self._catboost
|
29
|
-
|
30
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
31
|
-
iterations = trial.suggest_int("iterations", 100, 10000)
|
32
|
-
learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3)
|
33
|
-
depth = trial.suggest_int("depth", 1, 12)
|
34
|
-
l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 3.0, 50.0)
|
35
|
-
boosting_type = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
|
36
|
-
self._catboost.set_params(
|
37
|
-
iterations=iterations,
|
38
|
-
learning_rate=learning_rate,
|
39
|
-
depth=depth,
|
40
|
-
l2_leaf_reg=l2_leaf_reg,
|
41
|
-
boosting_type=boosting_type,
|
42
|
-
early_stopping_rounds=100,
|
43
|
-
)
|
44
|
-
|
45
|
-
def load(self, folder: str) -> None:
|
46
|
-
self._catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
|
47
|
-
|
48
|
-
def save(self, folder: str) -> None:
|
49
|
-
self._catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
|
50
|
-
|
51
|
-
def fit(
|
52
|
-
self,
|
53
|
-
df: pd.DataFrame,
|
54
|
-
y: pd.Series | pd.DataFrame | None = None,
|
55
|
-
w: pd.Series | None = None,
|
56
|
-
) -> Self:
|
57
|
-
train_pool = Pool(
|
58
|
-
df,
|
59
|
-
label=y,
|
60
|
-
weight=w,
|
61
|
-
)
|
62
|
-
self._catboost.fit(
|
63
|
-
train_pool,
|
64
|
-
early_stopping_rounds=100,
|
65
|
-
)
|
66
|
-
return self
|
67
|
-
|
68
|
-
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
69
|
-
pred_pool = Pool(df)
|
70
|
-
pred = self._catboost.predict(pred_pool)
|
71
|
-
proba = self._catboost.predict_proba(pred_pool)
|
72
|
-
df = pd.DataFrame(
|
73
|
-
index=df.index,
|
74
|
-
data={
|
75
|
-
PREDICTION_COLUMN: pred.flatten(),
|
76
|
-
},
|
77
|
-
)
|
78
|
-
for i in range(proba.shape[1]):
|
79
|
-
df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
|
80
|
-
return df
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|