wavetrainer 0.0.4__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.4/wavetrainer.egg-info → wavetrainer-0.0.6}/PKG-INFO +1 -3
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/README.md +0 -1
- wavetrainer-0.0.4/wavetrainer.egg-info/requires.txt → wavetrainer-0.0.6/requirements.txt +1 -2
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/setup.py +1 -1
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/tests/trainer_test.py +1 -1
- wavetrainer-0.0.6/wavetrainer/__init__.py +6 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/calibrator/calibrator_router.py +5 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/calibrator/mapie_calibrator.py +24 -8
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/calibrator/vennabers_calibrator.py +4 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/fit.py +8 -1
- wavetrainer-0.0.6/wavetrainer/model/catboost_classifier_wrap.py +15 -0
- wavetrainer-0.0.6/wavetrainer/model/catboost_kwargs.py +35 -0
- wavetrainer-0.0.6/wavetrainer/model/catboost_model.py +209 -0
- wavetrainer-0.0.6/wavetrainer/model/catboost_regressor_wrap.py +13 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/model/model.py +12 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/model/model_router.py +18 -1
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/model_type.py +6 -6
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/reducer/base_selector_reducer.py +4 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/reducer/combined_reducer.py +4 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/reducer/nonnumeric_reducer.py +4 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/selector/selector.py +26 -14
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/trainer.py +18 -8
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/weights/class_weights.py +4 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/weights/combined_weights.py +4 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/weights/exponential_weights.py +4 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/weights/linear_weights.py +3 -1
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/weights/noop_weights.py +3 -1
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/weights/sigmoid_weights.py +3 -1
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/weights/weights_router.py +4 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/windower/windower.py +4 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6/wavetrainer.egg-info}/PKG-INFO +1 -3
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer.egg-info/SOURCES.txt +3 -1
- wavetrainer-0.0.4/requirements.txt → wavetrainer-0.0.6/wavetrainer.egg-info/requires.txt +0 -1
- wavetrainer-0.0.4/wavetrainer/__init__.py +0 -10
- wavetrainer-0.0.4/wavetrainer/load.py +0 -8
- wavetrainer-0.0.4/wavetrainer/model/catboost_model.py +0 -80
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/LICENSE +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/MANIFEST.in +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/setup.cfg +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/tests/__init__.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/reducer/correlation_reducer.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.4 → wavetrainer-0.0.6}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.4
|
3
|
+
Version: 0.0.6
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -21,7 +21,6 @@ Requires-Dist: scipy>=1.15.2
|
|
21
21
|
Requires-Dist: catboost>=1.2.7
|
22
22
|
Requires-Dist: venn-abers>=1.4.6
|
23
23
|
Requires-Dist: mapie>=0.9.2
|
24
|
-
Requires-Dist: shapiq>=1.2.2
|
25
24
|
|
26
25
|
# wavetrainer
|
27
26
|
|
@@ -49,7 +48,6 @@ Python 3.11.6:
|
|
49
48
|
- [catboost](https://catboost.ai/)
|
50
49
|
- [venn-abers](https://github.com/ip200/venn-abers)
|
51
50
|
- [mapie](https://mapie.readthedocs.io/en/stable/)
|
52
|
-
- [shapiq](https://github.com/mmschlk/shapiq)
|
53
51
|
|
54
52
|
## Raison D'être :thought_balloon:
|
55
53
|
|
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
|
|
23
23
|
|
24
24
|
setup(
|
25
25
|
name='wavetrainer',
|
26
|
-
version='0.0.4',
|
26
|
+
version='0.0.6',
|
27
27
|
description='A library for automatically finding the optimal model within feature and hyperparameter space.',
|
28
28
|
long_description=long_description,
|
29
29
|
long_description_content_type='text/markdown',
|
@@ -13,7 +13,7 @@ class TestTrainer(unittest.TestCase):
|
|
13
13
|
|
14
14
|
def test_trainer(self):
|
15
15
|
with tempfile.TemporaryDirectory() as tmpdir:
|
16
|
-
trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=
|
16
|
+
trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=1)
|
17
17
|
x_data = [i for i in range(100)]
|
18
18
|
x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
|
19
19
|
df = pd.DataFrame(
|
@@ -24,6 +24,8 @@ _CALIBRATORS = {
|
|
24
24
|
class CalibratorRouter(Calibrator):
|
25
25
|
"""A router that routes to a different calibrator class."""
|
26
26
|
|
27
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
28
|
+
|
27
29
|
_calibrator: Calibrator | None
|
28
30
|
|
29
31
|
def __init__(self, model: Model):
|
@@ -66,7 +68,10 @@ class CalibratorRouter(Calibrator):
|
|
66
68
|
df: pd.DataFrame,
|
67
69
|
y: pd.Series | pd.DataFrame | None = None,
|
68
70
|
w: pd.Series | None = None,
|
71
|
+
eval_x: pd.DataFrame | None = None,
|
72
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
69
73
|
) -> Self:
|
74
|
+
# pylint: disable=no-else-return
|
70
75
|
calibrator: Calibrator | None = None
|
71
76
|
if determine_model_type(df) == ModelType.REGRESSION:
|
72
77
|
calibrator = MAPIECalibrator(self._model)
|
@@ -1,11 +1,13 @@
|
|
1
1
|
"""A calibrator that implements MAPIE."""
|
2
2
|
|
3
|
+
import logging
|
3
4
|
import os
|
4
5
|
from typing import Self
|
5
6
|
|
6
7
|
import joblib # type: ignore
|
7
8
|
import optuna
|
8
9
|
import pandas as pd
|
10
|
+
import sklearn # type: ignore
|
9
11
|
from mapie.regression import MapieRegressor # type: ignore
|
10
12
|
|
11
13
|
from ..model.model import PROBABILITY_COLUMN_PREFIX, Model
|
@@ -17,6 +19,8 @@ _CALIBRATOR_FILENAME = "mapie.joblib"
|
|
17
19
|
class MAPIECalibrator(Calibrator):
|
18
20
|
"""A class that uses MAPIE as a calibrator."""
|
19
21
|
|
22
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
23
|
+
|
20
24
|
def __init__(self, model: Model):
|
21
25
|
super().__init__(model)
|
22
26
|
self._mapie = MapieRegressor(model.estimator, method="plus")
|
@@ -39,22 +43,34 @@ class MAPIECalibrator(Calibrator):
|
|
39
43
|
df: pd.DataFrame,
|
40
44
|
y: pd.Series | pd.DataFrame | None = None,
|
41
45
|
w: pd.Series | None = None,
|
46
|
+
eval_x: pd.DataFrame | None = None,
|
47
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
42
48
|
) -> Self:
|
43
49
|
mapie = self._mapie
|
44
50
|
if mapie is None:
|
45
51
|
raise ValueError("mapie is null")
|
46
52
|
if y is None:
|
47
53
|
raise ValueError("y is null")
|
54
|
+
if len(df) <= 5:
|
55
|
+
return self
|
48
56
|
mapie.fit(df.to_numpy(), y.to_numpy())
|
49
57
|
return self
|
50
58
|
|
51
59
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
)
|
60
|
+
try:
|
61
|
+
alpha = []
|
62
|
+
for potential_alpha in [0.05, 0.32]:
|
63
|
+
if len(df) > int(1.0 / potential_alpha):
|
64
|
+
alpha.append(potential_alpha)
|
65
|
+
if alpha:
|
66
|
+
_, y_pis = self._mapie.predict(df, alpha=alpha)
|
67
|
+
for i in range(y_pis.shape[1]):
|
68
|
+
if i >= len(alpha):
|
69
|
+
continue
|
70
|
+
for ii in range(y_pis.shape[2]):
|
71
|
+
alpha_val = alpha[i]
|
72
|
+
values = y_pis[:, i, ii].flatten().tolist()
|
73
|
+
df[f"{PROBABILITY_COLUMN_PREFIX}{alpha_val}_{ii == 1}"] = values
|
74
|
+
except sklearn.exceptions.NotFittedError as exc: # type: ignore
|
75
|
+
logging.warning(str(exc))
|
60
76
|
return df
|
@@ -17,6 +17,8 @@ _CALIBRATOR_FILENAME = "vennabers.joblib"
|
|
17
17
|
class VennabersCalibrator(Calibrator):
|
18
18
|
"""A class that uses venn abers as a calibrator."""
|
19
19
|
|
20
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
21
|
+
|
20
22
|
def __init__(self, model: Model):
|
21
23
|
super().__init__(model)
|
22
24
|
self._vennabers = VennAbers()
|
@@ -39,6 +41,8 @@ class VennabersCalibrator(Calibrator):
|
|
39
41
|
df: pd.DataFrame,
|
40
42
|
y: pd.Series | pd.DataFrame | None = None,
|
41
43
|
w: pd.Series | None = None,
|
44
|
+
eval_x: pd.DataFrame | None = None,
|
45
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
42
46
|
) -> Self:
|
43
47
|
vennabers = self._vennabers
|
44
48
|
if vennabers is None:
|
@@ -8,11 +8,15 @@ import pandas as pd
|
|
8
8
|
class Fit:
|
9
9
|
"""The prototype fit class."""
|
10
10
|
|
11
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
12
|
+
|
11
13
|
def fit(
|
12
14
|
self,
|
13
15
|
df: pd.DataFrame,
|
14
16
|
y: pd.Series | pd.DataFrame | None = None,
|
15
17
|
w: pd.Series | None = None,
|
18
|
+
eval_x: pd.DataFrame | None = None,
|
19
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
16
20
|
) -> Self:
|
17
21
|
"""Fit the dataframe."""
|
18
22
|
raise NotImplementedError("fit not implemented in parent class.")
|
@@ -25,6 +29,9 @@ class Fit:
|
|
25
29
|
self,
|
26
30
|
df: pd.DataFrame,
|
27
31
|
y: pd.Series | pd.DataFrame | None = None,
|
32
|
+
w: pd.Series | None = None,
|
33
|
+
eval_x: pd.DataFrame | None = None,
|
34
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
28
35
|
) -> pd.DataFrame:
|
29
36
|
"""Fit and then trasnfrom the dataframe."""
|
30
|
-
return self.fit(df, y=y).transform(df)
|
37
|
+
return self.fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y).transform(df)
|
@@ -0,0 +1,15 @@
|
|
1
|
+
"""A wrapper for catboost classifier to handle some edge cases."""
|
2
|
+
|
3
|
+
# pylint: disable=duplicate-code
|
4
|
+
|
5
|
+
from catboost import CatBoostClassifier # type: ignore
|
6
|
+
|
7
|
+
from .catboost_kwargs import handle_fit_kwargs
|
8
|
+
|
9
|
+
|
10
|
+
class CatBoostClassifierWrapper(CatBoostClassifier):
|
11
|
+
"""A wrapper for the catboost classifier."""
|
12
|
+
|
13
|
+
def fit(self, *args, **kwargs):
|
14
|
+
kwargs = handle_fit_kwargs(*args, **kwargs)
|
15
|
+
return super().fit(*args, **kwargs)
|
@@ -0,0 +1,35 @@
|
|
1
|
+
"""A list of constant catboost kwargs."""
|
2
|
+
|
3
|
+
from typing import Any
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
from catboost import Pool # type: ignore
|
7
|
+
|
8
|
+
ORIGINAL_X = "original_x"
|
9
|
+
EVAL_SET = "eval_set"
|
10
|
+
|
11
|
+
|
12
|
+
def handle_fit_kwargs(*args, **kwargs) -> dict[str, Any]:
|
13
|
+
"""Handles keyword args coming into a catboost fit method."""
|
14
|
+
if ORIGINAL_X in kwargs:
|
15
|
+
df = kwargs[ORIGINAL_X]
|
16
|
+
eval_x, eval_y = kwargs[EVAL_SET]
|
17
|
+
fit_x = args[0]
|
18
|
+
fix_x_cp = fit_x.copy()
|
19
|
+
|
20
|
+
# Stupid code to ensure eval is feature equivalent to train data
|
21
|
+
included_columns = []
|
22
|
+
for i in range(fix_x_cp.shape[1]):
|
23
|
+
arr_col_values = fix_x_cp[:, i]
|
24
|
+
for col in df.columns:
|
25
|
+
df_col_values = df[col].values
|
26
|
+
if np.allclose(df_col_values, arr_col_values, equal_nan=True):
|
27
|
+
included_columns.append(col)
|
28
|
+
df = df.drop(col, axis=1)
|
29
|
+
break
|
30
|
+
|
31
|
+
eval_x = eval_x[included_columns]
|
32
|
+
kwargs[EVAL_SET] = Pool(eval_x, label=eval_y)
|
33
|
+
|
34
|
+
del kwargs[ORIGINAL_X]
|
35
|
+
return kwargs
|
@@ -0,0 +1,209 @@
|
|
1
|
+
"""A model that wraps catboost."""
|
2
|
+
|
3
|
+
import json
|
4
|
+
import os
|
5
|
+
from typing import Any, Self
|
6
|
+
|
7
|
+
import optuna
|
8
|
+
import pandas as pd
|
9
|
+
from catboost import CatBoost, Pool # type: ignore
|
10
|
+
|
11
|
+
from ..model_type import ModelType, determine_model_type
|
12
|
+
from .catboost_classifier_wrap import CatBoostClassifierWrapper
|
13
|
+
from .catboost_kwargs import EVAL_SET, ORIGINAL_X
|
14
|
+
from .catboost_regressor_wrap import CatBoostRegressorWrapper
|
15
|
+
from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
16
|
+
|
17
|
+
_MODEL_FILENAME = "model.cbm"
|
18
|
+
_MODEL_PARAMS_FILENAME = "model_params.json"
|
19
|
+
_ITERATIONS_KEY = "iterations"
|
20
|
+
_LEARNING_RATE_KEY = "learning_rate"
|
21
|
+
_DEPTH_KEY = "depth"
|
22
|
+
_L2_LEAF_REG_KEY = "l2_leaf_reg"
|
23
|
+
_BOOSTING_TYPE_KEY = "boosting_type"
|
24
|
+
_MODEL_TYPE_KEY = "model_type"
|
25
|
+
|
26
|
+
|
27
|
+
class CatboostModel(Model):
|
28
|
+
"""A class that uses Catboost as a model."""
|
29
|
+
|
30
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
31
|
+
|
32
|
+
_catboost: CatBoost | None
|
33
|
+
_iterations: None | int
|
34
|
+
_learning_rate: None | float
|
35
|
+
_depth: None | int
|
36
|
+
_l2_leaf_reg: None | float
|
37
|
+
_boosting_type: None | str
|
38
|
+
_model_type: None | ModelType
|
39
|
+
|
40
|
+
@classmethod
|
41
|
+
def name(cls) -> str:
|
42
|
+
return "catboost"
|
43
|
+
|
44
|
+
def __init__(self) -> None:
|
45
|
+
super().__init__()
|
46
|
+
self._catboost = None
|
47
|
+
self._iterations = None
|
48
|
+
self._learning_rate = None
|
49
|
+
self._depth = None
|
50
|
+
self._l2_leaf_reg = None
|
51
|
+
self._boosting_type = None
|
52
|
+
self._model_type = None
|
53
|
+
|
54
|
+
@property
|
55
|
+
def estimator(self) -> Any:
|
56
|
+
return self._provide_catboost()
|
57
|
+
|
58
|
+
def pre_fit(
|
59
|
+
self,
|
60
|
+
df: pd.DataFrame,
|
61
|
+
y: pd.Series | pd.DataFrame | None,
|
62
|
+
eval_x: pd.DataFrame | None = None,
|
63
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
64
|
+
):
|
65
|
+
if y is None:
|
66
|
+
raise ValueError("y is null.")
|
67
|
+
self._model_type = determine_model_type(y)
|
68
|
+
return {
|
69
|
+
EVAL_SET: (eval_x, eval_y),
|
70
|
+
"cat_features": df.select_dtypes(include="category").columns.tolist(),
|
71
|
+
ORIGINAL_X: df,
|
72
|
+
}
|
73
|
+
|
74
|
+
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
75
|
+
self._iterations = trial.suggest_int(_ITERATIONS_KEY, 100, 10000)
|
76
|
+
self._learning_rate = trial.suggest_float(_LEARNING_RATE_KEY, 0.001, 0.3)
|
77
|
+
self._depth = trial.suggest_int(_DEPTH_KEY, 1, 12)
|
78
|
+
self._l2_leaf_reg = trial.suggest_float(_L2_LEAF_REG_KEY, 3.0, 50.0)
|
79
|
+
self._boosting_type = trial.suggest_categorical(
|
80
|
+
_BOOSTING_TYPE_KEY, ["Ordered", "Plain"]
|
81
|
+
)
|
82
|
+
|
83
|
+
def load(self, folder: str) -> None:
|
84
|
+
with open(
|
85
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
|
86
|
+
) as handle:
|
87
|
+
params = json.load(handle)
|
88
|
+
self._iterations = params[_ITERATIONS_KEY]
|
89
|
+
self._learning_rate = params[_LEARNING_RATE_KEY]
|
90
|
+
self._depth = params[_DEPTH_KEY]
|
91
|
+
self._l2_leaf_reg = params[_L2_LEAF_REG_KEY]
|
92
|
+
self._boosting_type = params[_BOOSTING_TYPE_KEY]
|
93
|
+
self._model_type = ModelType(params[_MODEL_TYPE_KEY])
|
94
|
+
catboost = self._provide_catboost()
|
95
|
+
catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
|
96
|
+
|
97
|
+
def save(self, folder: str) -> None:
|
98
|
+
with open(
|
99
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
|
100
|
+
) as handle:
|
101
|
+
json.dump(
|
102
|
+
{
|
103
|
+
_ITERATIONS_KEY: self._iterations,
|
104
|
+
_LEARNING_RATE_KEY: self._learning_rate,
|
105
|
+
_DEPTH_KEY: self._depth,
|
106
|
+
_L2_LEAF_REG_KEY: self._l2_leaf_reg,
|
107
|
+
_BOOSTING_TYPE_KEY: self._boosting_type,
|
108
|
+
_MODEL_TYPE_KEY: str(self._model_type),
|
109
|
+
},
|
110
|
+
handle,
|
111
|
+
)
|
112
|
+
catboost = self._provide_catboost()
|
113
|
+
catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
|
114
|
+
|
115
|
+
def fit(
|
116
|
+
self,
|
117
|
+
df: pd.DataFrame,
|
118
|
+
y: pd.Series | pd.DataFrame | None = None,
|
119
|
+
w: pd.Series | None = None,
|
120
|
+
eval_x: pd.DataFrame | None = None,
|
121
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
122
|
+
) -> Self:
|
123
|
+
if y is None:
|
124
|
+
raise ValueError("y is null.")
|
125
|
+
self._model_type = determine_model_type(y)
|
126
|
+
catboost = self._provide_catboost()
|
127
|
+
|
128
|
+
train_pool = Pool(
|
129
|
+
df,
|
130
|
+
label=y,
|
131
|
+
weight=w,
|
132
|
+
)
|
133
|
+
eval_pool = Pool(
|
134
|
+
eval_x,
|
135
|
+
label=eval_y,
|
136
|
+
)
|
137
|
+
catboost.fit(
|
138
|
+
train_pool,
|
139
|
+
early_stopping_rounds=100,
|
140
|
+
verbose=False,
|
141
|
+
metric_period=100,
|
142
|
+
eval_set=eval_pool,
|
143
|
+
)
|
144
|
+
return self
|
145
|
+
|
146
|
+
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
147
|
+
pred_pool = Pool(df)
|
148
|
+
catboost = self._provide_catboost()
|
149
|
+
pred = catboost.predict(pred_pool)
|
150
|
+
df = pd.DataFrame(
|
151
|
+
index=df.index,
|
152
|
+
data={
|
153
|
+
PREDICTION_COLUMN: pred.flatten(),
|
154
|
+
},
|
155
|
+
)
|
156
|
+
if self._model_type != ModelType.REGRESSION:
|
157
|
+
proba = catboost.predict_proba(pred_pool) # type: ignore
|
158
|
+
for i in range(proba.shape[1]):
|
159
|
+
df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
|
160
|
+
return df
|
161
|
+
|
162
|
+
def _provide_catboost(self) -> CatBoost:
|
163
|
+
catboost = self._catboost
|
164
|
+
if catboost is None:
|
165
|
+
match self._model_type:
|
166
|
+
case ModelType.BINARY:
|
167
|
+
catboost = CatBoostClassifierWrapper(
|
168
|
+
iterations=self._iterations,
|
169
|
+
learning_rate=self._learning_rate,
|
170
|
+
depth=self._depth,
|
171
|
+
l2_leaf_reg=self._l2_leaf_reg,
|
172
|
+
boosting_type=self._boosting_type,
|
173
|
+
early_stopping_rounds=100,
|
174
|
+
metric_period=100,
|
175
|
+
)
|
176
|
+
case ModelType.REGRESSION:
|
177
|
+
catboost = CatBoostRegressorWrapper(
|
178
|
+
iterations=self._iterations,
|
179
|
+
learning_rate=self._learning_rate,
|
180
|
+
depth=self._depth,
|
181
|
+
l2_leaf_reg=self._l2_leaf_reg,
|
182
|
+
boosting_type=self._boosting_type,
|
183
|
+
early_stopping_rounds=100,
|
184
|
+
metric_period=100,
|
185
|
+
)
|
186
|
+
case ModelType.BINNED_BINARY:
|
187
|
+
catboost = CatBoostClassifierWrapper(
|
188
|
+
iterations=self._iterations,
|
189
|
+
learning_rate=self._learning_rate,
|
190
|
+
depth=self._depth,
|
191
|
+
l2_leaf_reg=self._l2_leaf_reg,
|
192
|
+
boosting_type=self._boosting_type,
|
193
|
+
early_stopping_rounds=100,
|
194
|
+
metric_period=100,
|
195
|
+
)
|
196
|
+
case ModelType.MULTI_CLASSIFICATION:
|
197
|
+
catboost = CatBoostClassifierWrapper(
|
198
|
+
iterations=self._iterations,
|
199
|
+
learning_rate=self._learning_rate,
|
200
|
+
depth=self._depth,
|
201
|
+
l2_leaf_reg=self._l2_leaf_reg,
|
202
|
+
boosting_type=self._boosting_type,
|
203
|
+
early_stopping_rounds=100,
|
204
|
+
metric_period=100,
|
205
|
+
)
|
206
|
+
self._catboost = catboost
|
207
|
+
if catboost is None:
|
208
|
+
raise ValueError("catboost is null")
|
209
|
+
return catboost
|
@@ -0,0 +1,13 @@
|
|
1
|
+
"""A wrapper for catboost regressor to handle some edge cases."""
|
2
|
+
|
3
|
+
from catboost import CatBoostRegressor # type: ignore
|
4
|
+
|
5
|
+
from .catboost_kwargs import handle_fit_kwargs
|
6
|
+
|
7
|
+
|
8
|
+
class CatBoostRegressorWrapper(CatBoostRegressor):
|
9
|
+
"""A wrapper for the catboost regressor."""
|
10
|
+
|
11
|
+
def fit(self, *args, **kwargs):
|
12
|
+
kwargs = handle_fit_kwargs(*args, **kwargs)
|
13
|
+
return super().fit(*args, **kwargs)
|
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
from typing import Any
|
4
4
|
|
5
|
+
import pandas as pd
|
6
|
+
|
5
7
|
from ..fit import Fit
|
6
8
|
from ..params import Params
|
7
9
|
|
@@ -21,3 +23,13 @@ class Model(Params, Fit):
|
|
21
23
|
def estimator(self) -> Any:
|
22
24
|
"""The estimator backing the model."""
|
23
25
|
raise NotImplementedError("estimator not implemented in parent class.")
|
26
|
+
|
27
|
+
def pre_fit(
|
28
|
+
self,
|
29
|
+
df: pd.DataFrame,
|
30
|
+
y: pd.Series | pd.DataFrame | None,
|
31
|
+
eval_x: pd.DataFrame | None = None,
|
32
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
33
|
+
) -> dict[str, Any]:
|
34
|
+
"""A call to make sure the model is prepared for the target type."""
|
35
|
+
raise NotImplementedError("pre_fit not implemented in parent class.")
|
@@ -20,6 +20,8 @@ _MODELS = {
|
|
20
20
|
class ModelRouter(Model):
|
21
21
|
"""A router that routes to a different weights class."""
|
22
22
|
|
23
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
24
|
+
|
23
25
|
_model: Model | None
|
24
26
|
|
25
27
|
def __init__(self) -> None:
|
@@ -37,10 +39,23 @@ class ModelRouter(Model):
|
|
37
39
|
raise ValueError("model is null")
|
38
40
|
return model.estimator
|
39
41
|
|
42
|
+
def pre_fit(
|
43
|
+
self,
|
44
|
+
df: pd.DataFrame,
|
45
|
+
y: pd.Series | pd.DataFrame | None,
|
46
|
+
eval_x: pd.DataFrame | None = None,
|
47
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
48
|
+
) -> dict[str, Any]:
|
49
|
+
model = self._model
|
50
|
+
if model is None:
|
51
|
+
raise ValueError("model is null")
|
52
|
+
return model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y)
|
53
|
+
|
40
54
|
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
41
55
|
self._model = _MODELS[
|
42
56
|
trial.suggest_categorical("model", list(_MODELS.keys()))
|
43
57
|
]()
|
58
|
+
self._model.set_options(trial)
|
44
59
|
|
45
60
|
def load(self, folder: str) -> None:
|
46
61
|
with open(os.path.join(folder, _MODEL_ROUTER_FILE), encoding="utf8") as handle:
|
@@ -69,11 +84,13 @@ class ModelRouter(Model):
|
|
69
84
|
df: pd.DataFrame,
|
70
85
|
y: pd.Series | pd.DataFrame | None = None,
|
71
86
|
w: pd.Series | None = None,
|
87
|
+
eval_x: pd.DataFrame | None = None,
|
88
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
72
89
|
) -> Self:
|
73
90
|
model = self._model
|
74
91
|
if model is None:
|
75
92
|
raise ValueError("model is null")
|
76
|
-
model.fit(df, y=y, w=w)
|
93
|
+
model.fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
|
77
94
|
return self
|
78
95
|
|
79
96
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
@@ -1,17 +1,17 @@
|
|
1
1
|
"""An enum to define the model type."""
|
2
2
|
|
3
|
-
from enum import
|
3
|
+
from enum import StrEnum, auto
|
4
4
|
|
5
5
|
import pandas as pd
|
6
6
|
|
7
7
|
|
8
|
-
class ModelType(
|
8
|
+
class ModelType(StrEnum):
|
9
9
|
"""The type of model being run."""
|
10
10
|
|
11
|
-
BINARY =
|
12
|
-
REGRESSION =
|
13
|
-
BINNED_BINARY =
|
14
|
-
MULTI_CLASSIFICATION =
|
11
|
+
BINARY = auto()
|
12
|
+
REGRESSION = auto()
|
13
|
+
BINNED_BINARY = auto()
|
14
|
+
MULTI_CLASSIFICATION = auto()
|
15
15
|
|
16
16
|
|
17
17
|
def determine_model_type(y: pd.Series | pd.DataFrame) -> ModelType:
|
@@ -15,6 +15,8 @@ from .reducer import Reducer
|
|
15
15
|
class BaseSelectorReducer(Reducer):
|
16
16
|
"""A class that uses the base selector from the feature engine."""
|
17
17
|
|
18
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
19
|
+
|
18
20
|
def __init__(self, base_selector: BaseSelector, file_name: str) -> None:
|
19
21
|
super().__init__()
|
20
22
|
self._base_selector = base_selector
|
@@ -40,6 +42,8 @@ class BaseSelectorReducer(Reducer):
|
|
40
42
|
df: pd.DataFrame,
|
41
43
|
y: pd.Series | pd.DataFrame | None = None,
|
42
44
|
w: pd.Series | None = None,
|
45
|
+
eval_x: pd.DataFrame | None = None,
|
46
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
43
47
|
) -> Self:
|
44
48
|
try:
|
45
49
|
self._base_selector.fit(df) # type: ignore
|
@@ -20,6 +20,8 @@ _REDUCERS_KEY = "reducers"
|
|
20
20
|
class CombinedReducer(Reducer):
|
21
21
|
"""A reducer that combines a series of reducers."""
|
22
22
|
|
23
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
24
|
+
|
23
25
|
def __init__(self):
|
24
26
|
super().__init__()
|
25
27
|
self._reducers = [
|
@@ -73,6 +75,8 @@ class CombinedReducer(Reducer):
|
|
73
75
|
df: pd.DataFrame,
|
74
76
|
y: pd.Series | pd.DataFrame | None = None,
|
75
77
|
w: pd.Series | None = None,
|
78
|
+
eval_x: pd.DataFrame | None = None,
|
79
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
76
80
|
) -> Self:
|
77
81
|
for reducer in self._reducers:
|
78
82
|
df = reducer.fit_transform(df)
|
@@ -11,6 +11,8 @@ from .reducer import Reducer
|
|
11
11
|
class NonNumericReducer(Reducer):
|
12
12
|
"""A class that removes non numeric columns from a dataframe."""
|
13
13
|
|
14
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
15
|
+
|
14
16
|
@classmethod
|
15
17
|
def name(cls) -> str:
|
16
18
|
return "nonnumeric"
|
@@ -29,6 +31,8 @@ class NonNumericReducer(Reducer):
|
|
29
31
|
df: pd.DataFrame,
|
30
32
|
y: pd.Series | pd.DataFrame | None = None,
|
31
33
|
w: pd.Series | None = None,
|
34
|
+
eval_x: pd.DataFrame | None = None,
|
35
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
32
36
|
) -> Self:
|
33
37
|
return self
|
34
38
|
|
@@ -7,6 +7,7 @@ from typing import Self
|
|
7
7
|
import joblib # type: ignore
|
8
8
|
import optuna
|
9
9
|
import pandas as pd
|
10
|
+
import sklearn # type: ignore
|
10
11
|
from sklearn.feature_selection import RFE # type: ignore
|
11
12
|
|
12
13
|
from ..fit import Fit
|
@@ -19,24 +20,20 @@ _SELECTOR_FILE = "selector.joblib"
|
|
19
20
|
class Selector(Params, Fit):
|
20
21
|
"""The selector class."""
|
21
22
|
|
22
|
-
|
23
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
24
|
+
|
25
|
+
_selector: RFE | None
|
26
|
+
|
27
|
+
def __init__(self, model: Model):
|
23
28
|
super().__init__()
|
24
29
|
self._model = model
|
25
30
|
self._feature_ratio = 0.0
|
26
31
|
self._steps = 0
|
27
|
-
|
28
|
-
self._selector = RFE(
|
29
|
-
model.estimator,
|
30
|
-
n_features_to_select=n_features_to_select,
|
31
|
-
step=self._steps,
|
32
|
-
verbose=1,
|
33
|
-
)
|
32
|
+
self._selector = None
|
34
33
|
|
35
34
|
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
36
35
|
self._feature_ratio = trial.suggest_float("feature_ratio", 0.0, 1.0)
|
37
|
-
|
38
|
-
self._steps = steps
|
39
|
-
self._selector.step = steps
|
36
|
+
self._steps = trial.suggest_int("steps", 1, 16)
|
40
37
|
|
41
38
|
def load(self, folder: str) -> None:
|
42
39
|
self._selector = joblib.load(os.path.join(folder, _SELECTOR_FILE))
|
@@ -49,20 +46,35 @@ class Selector(Params, Fit):
|
|
49
46
|
df: pd.DataFrame,
|
50
47
|
y: pd.Series | pd.DataFrame | None = None,
|
51
48
|
w: pd.Series | None = None,
|
49
|
+
eval_x: pd.DataFrame | None = None,
|
50
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
52
51
|
) -> Self:
|
52
|
+
sklearn.set_config(enable_metadata_routing=False)
|
53
|
+
model_kwargs = self._model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y)
|
53
54
|
if not isinstance(y, pd.Series):
|
54
55
|
raise ValueError("y is not a series.")
|
55
|
-
|
56
|
+
n_features_to_select = max(1, int(len(df.columns) * self._feature_ratio))
|
57
|
+
self._selector = RFE(
|
58
|
+
self._model.estimator,
|
59
|
+
n_features_to_select=n_features_to_select,
|
60
|
+
step=max(
|
61
|
+
1,
|
62
|
+
int((len(df.columns) - n_features_to_select) / self._steps),
|
63
|
+
),
|
64
|
+
)
|
56
65
|
try:
|
57
|
-
self._selector.fit(df, y=y, sample_weight=w)
|
66
|
+
self._selector.fit(df, y=y, sample_weight=w, **model_kwargs)
|
58
67
|
except ValueError as exc:
|
59
68
|
# Catch issues with 1 feature as a reduction target.
|
60
69
|
logging.warning(str(exc))
|
61
70
|
return self
|
62
71
|
|
63
72
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
73
|
+
selector = self._selector
|
74
|
+
if selector is None:
|
75
|
+
raise ValueError("selector is null.")
|
64
76
|
try:
|
65
|
-
return df[
|
77
|
+
return df[selector.get_feature_names_out()]
|
66
78
|
except AttributeError as exc:
|
67
79
|
# Catch issues with 1 feature as a reduction target.
|
68
80
|
logging.warning(str(exc))
|
@@ -11,7 +11,7 @@ from typing import Self
|
|
11
11
|
import optuna
|
12
12
|
import pandas as pd
|
13
13
|
import tqdm
|
14
|
-
from sklearn.metrics import
|
14
|
+
from sklearn.metrics import f1_score, r2_score # type: ignore
|
15
15
|
|
16
16
|
from .calibrator.calibrator_router import CalibratorRouter
|
17
17
|
from .exceptions import WavetrainException
|
@@ -158,6 +158,8 @@ class Trainer(Fit):
|
|
158
158
|
df: pd.DataFrame,
|
159
159
|
y: pd.Series | pd.DataFrame | None = None,
|
160
160
|
w: pd.Series | None = None,
|
161
|
+
eval_x: pd.DataFrame | None = None,
|
162
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
161
163
|
) -> Self:
|
162
164
|
"""Perform a train on the data to fit to the targets."""
|
163
165
|
if y is None:
|
@@ -215,12 +217,14 @@ class Trainer(Fit):
|
|
215
217
|
model.set_options(trial)
|
216
218
|
|
217
219
|
# Train
|
218
|
-
selector = Selector(model
|
220
|
+
selector = Selector(model)
|
219
221
|
selector.set_options(trial)
|
220
|
-
selector.fit(x_train, y=y_train, w=w)
|
222
|
+
selector.fit(x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test)
|
221
223
|
x_train = selector.transform(x_train)
|
222
224
|
x_test = selector.transform(x_test)
|
223
|
-
x_pred = model.fit_transform(
|
225
|
+
x_pred = model.fit_transform(
|
226
|
+
x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test
|
227
|
+
)
|
224
228
|
|
225
229
|
# Calibrate
|
226
230
|
calibrator = CalibratorRouter(model)
|
@@ -243,8 +247,8 @@ class Trainer(Fit):
|
|
243
247
|
y_pred = model.transform(x_test)
|
244
248
|
y_pred = calibrator.transform(y_pred)
|
245
249
|
if determine_model_type(y_series) == ModelType.REGRESSION:
|
246
|
-
return
|
247
|
-
return f1_score(y_test, y_pred[[PREDICTION_COLUMN]])
|
250
|
+
return float(r2_score(y_test, y_pred[[PREDICTION_COLUMN]]))
|
251
|
+
return float(f1_score(y_test, y_pred[[PREDICTION_COLUMN]]))
|
248
252
|
except WavetrainException as exc:
|
249
253
|
logging.warning(str(exc))
|
250
254
|
return -1.0
|
@@ -286,9 +290,15 @@ class Trainer(Fit):
|
|
286
290
|
train_len = len(df[dt_index < start_test_index])
|
287
291
|
test_len = len(df.loc[start_test_index:start_validation_index])
|
288
292
|
|
293
|
+
last_processed_dt = None
|
289
294
|
for count, test_idx in tqdm.tqdm(
|
290
|
-
enumerate(
|
295
|
+
enumerate(test_dt_index[test_dt_index >= start_test_index])
|
291
296
|
):
|
297
|
+
if (
|
298
|
+
last_processed_dt is not None
|
299
|
+
and test_idx < last_processed_dt + self._walkforward_timedelta
|
300
|
+
):
|
301
|
+
continue
|
292
302
|
test_dt = test_idx.to_pydatetime()
|
293
303
|
found = False
|
294
304
|
for trial in study.trials:
|
@@ -373,7 +383,7 @@ class Trainer(Fit):
|
|
373
383
|
model = ModelRouter()
|
374
384
|
model.load(folder)
|
375
385
|
|
376
|
-
selector = Selector(model
|
386
|
+
selector = Selector(model)
|
377
387
|
selector.load(folder)
|
378
388
|
|
379
389
|
calibrator = CalibratorRouter(model)
|
@@ -14,6 +14,8 @@ from .weights import WEIGHTS_COLUMN, Weights
|
|
14
14
|
class ClassWeights(Weights):
|
15
15
|
"""Class weight class."""
|
16
16
|
|
17
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
18
|
+
|
17
19
|
_class_weights: dict[Any, float]
|
18
20
|
|
19
21
|
def __init__(self) -> None:
|
@@ -39,6 +41,8 @@ class ClassWeights(Weights):
|
|
39
41
|
df: pd.DataFrame,
|
40
42
|
y: pd.Series | pd.DataFrame | None = None,
|
41
43
|
w: pd.Series | None = None,
|
44
|
+
eval_x: pd.DataFrame | None = None,
|
45
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
42
46
|
) -> Self:
|
43
47
|
if not isinstance(y, pd.Series):
|
44
48
|
raise ValueError("y is not a series.")
|
@@ -13,6 +13,8 @@ from .weights_router import WeightsRouter
|
|
13
13
|
class CombinedWeights(Weights):
|
14
14
|
"""A weights class that combines multiple weights."""
|
15
15
|
|
16
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
17
|
+
|
16
18
|
def __init__(self) -> None:
|
17
19
|
super().__init__()
|
18
20
|
self._weights = [WeightsRouter(), ClassWeights()]
|
@@ -38,6 +40,8 @@ class CombinedWeights(Weights):
|
|
38
40
|
df: pd.DataFrame,
|
39
41
|
y: pd.Series | pd.DataFrame | None = None,
|
40
42
|
w: pd.Series | None = None,
|
43
|
+
eval_x: pd.DataFrame | None = None,
|
44
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
41
45
|
) -> Self:
|
42
46
|
for weights in self._weights:
|
43
47
|
weights.fit(df, y=y)
|
@@ -12,6 +12,8 @@ from .weights import WEIGHTS_COLUMN, Weights
|
|
12
12
|
class ExponentialWeights(Weights):
|
13
13
|
"""Exponential weight class."""
|
14
14
|
|
15
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
16
|
+
|
15
17
|
@classmethod
|
16
18
|
def name(cls) -> str:
|
17
19
|
"""The name of the weight class."""
|
@@ -31,6 +33,8 @@ class ExponentialWeights(Weights):
|
|
31
33
|
df: pd.DataFrame,
|
32
34
|
y: pd.Series | pd.DataFrame | None = None,
|
33
35
|
w: pd.Series | None = None,
|
36
|
+
eval_x: pd.DataFrame | None = None,
|
37
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
34
38
|
) -> Self:
|
35
39
|
return self
|
36
40
|
|
@@ -12,7 +12,7 @@ from .weights import WEIGHTS_COLUMN, Weights
|
|
12
12
|
class LinearWeights(Weights):
|
13
13
|
"""Linear weight class."""
|
14
14
|
|
15
|
-
# pylint: disable=duplicate-code
|
15
|
+
# pylint: disable=duplicate-code,too-many-positional-arguments,too-many-arguments
|
16
16
|
|
17
17
|
@classmethod
|
18
18
|
def name(cls) -> str:
|
@@ -33,6 +33,8 @@ class LinearWeights(Weights):
|
|
33
33
|
df: pd.DataFrame,
|
34
34
|
y: pd.Series | pd.DataFrame | None = None,
|
35
35
|
w: pd.Series | None = None,
|
36
|
+
eval_x: pd.DataFrame | None = None,
|
37
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
36
38
|
) -> Self:
|
37
39
|
return self
|
38
40
|
|
@@ -12,7 +12,7 @@ from .weights import WEIGHTS_COLUMN, Weights
|
|
12
12
|
class NoopWeights(Weights):
|
13
13
|
"""Noop weight class."""
|
14
14
|
|
15
|
-
# pylint: disable=duplicate-code
|
15
|
+
# pylint: disable=duplicate-code,too-many-positional-arguments,too-many-arguments
|
16
16
|
|
17
17
|
@classmethod
|
18
18
|
def name(cls) -> str:
|
@@ -33,6 +33,8 @@ class NoopWeights(Weights):
|
|
33
33
|
df: pd.DataFrame,
|
34
34
|
y: pd.Series | pd.DataFrame | None = None,
|
35
35
|
w: pd.Series | None = None,
|
36
|
+
eval_x: pd.DataFrame | None = None,
|
37
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
36
38
|
) -> Self:
|
37
39
|
return self
|
38
40
|
|
@@ -13,7 +13,7 @@ from .weights import WEIGHTS_COLUMN, Weights
|
|
13
13
|
class SigmoidWeights(Weights):
|
14
14
|
"""Sigmoid weight class."""
|
15
15
|
|
16
|
-
# pylint: disable=duplicate-code
|
16
|
+
# pylint: disable=duplicate-code,too-many-positional-arguments,too-many-arguments
|
17
17
|
|
18
18
|
@classmethod
|
19
19
|
def name(cls) -> str:
|
@@ -34,6 +34,8 @@ class SigmoidWeights(Weights):
|
|
34
34
|
df: pd.DataFrame,
|
35
35
|
y: pd.Series | pd.DataFrame | None = None,
|
36
36
|
w: pd.Series | None = None,
|
37
|
+
eval_x: pd.DataFrame | None = None,
|
38
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
37
39
|
) -> Self:
|
38
40
|
return self
|
39
41
|
|
@@ -26,6 +26,8 @@ _WEIGHTS = {
|
|
26
26
|
class WeightsRouter(Weights):
|
27
27
|
"""A router that routes to a different weights class."""
|
28
28
|
|
29
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
30
|
+
|
29
31
|
_weights: Weights | None
|
30
32
|
|
31
33
|
def __init__(self) -> None:
|
@@ -71,6 +73,8 @@ class WeightsRouter(Weights):
|
|
71
73
|
df: pd.DataFrame,
|
72
74
|
y: pd.Series | pd.DataFrame | None = None,
|
73
75
|
w: pd.Series | None = None,
|
76
|
+
eval_x: pd.DataFrame | None = None,
|
77
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
74
78
|
) -> Self:
|
75
79
|
return self
|
76
80
|
|
@@ -18,6 +18,8 @@ _LOOKBACK_KEY = "lookback"
|
|
18
18
|
class Windower(Params, Fit):
|
19
19
|
"""The windower class."""
|
20
20
|
|
21
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
22
|
+
|
21
23
|
_lookback_ratio: float | None
|
22
24
|
|
23
25
|
def __init__(self, dt_column: str | None):
|
@@ -48,6 +50,8 @@ class Windower(Params, Fit):
|
|
48
50
|
df: pd.DataFrame,
|
49
51
|
y: pd.Series | pd.DataFrame | None = None,
|
50
52
|
w: pd.Series | None = None,
|
53
|
+
eval_x: pd.DataFrame | None = None,
|
54
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
51
55
|
) -> Self:
|
52
56
|
lookback_ratio = self._lookback_ratio
|
53
57
|
if lookback_ratio is None:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.4
|
3
|
+
Version: 0.0.6
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -21,7 +21,6 @@ Requires-Dist: scipy>=1.15.2
|
|
21
21
|
Requires-Dist: catboost>=1.2.7
|
22
22
|
Requires-Dist: venn-abers>=1.4.6
|
23
23
|
Requires-Dist: mapie>=0.9.2
|
24
|
-
Requires-Dist: shapiq>=1.2.2
|
25
24
|
|
26
25
|
# wavetrainer
|
27
26
|
|
@@ -49,7 +48,6 @@ Python 3.11.6:
|
|
49
48
|
- [catboost](https://catboost.ai/)
|
50
49
|
- [venn-abers](https://github.com/ip200/venn-abers)
|
51
50
|
- [mapie](https://mapie.readthedocs.io/en/stable/)
|
52
|
-
- [shapiq](https://github.com/mmschlk/shapiq)
|
53
51
|
|
54
52
|
## Raison D'être :thought_balloon:
|
55
53
|
|
@@ -9,7 +9,6 @@ wavetrainer/__init__.py
|
|
9
9
|
wavetrainer/create.py
|
10
10
|
wavetrainer/exceptions.py
|
11
11
|
wavetrainer/fit.py
|
12
|
-
wavetrainer/load.py
|
13
12
|
wavetrainer/model_type.py
|
14
13
|
wavetrainer/params.py
|
15
14
|
wavetrainer/trainer.py
|
@@ -25,7 +24,10 @@ wavetrainer/calibrator/calibrator_router.py
|
|
25
24
|
wavetrainer/calibrator/mapie_calibrator.py
|
26
25
|
wavetrainer/calibrator/vennabers_calibrator.py
|
27
26
|
wavetrainer/model/__init__.py
|
27
|
+
wavetrainer/model/catboost_classifier_wrap.py
|
28
|
+
wavetrainer/model/catboost_kwargs.py
|
28
29
|
wavetrainer/model/catboost_model.py
|
30
|
+
wavetrainer/model/catboost_regressor_wrap.py
|
29
31
|
wavetrainer/model/model.py
|
30
32
|
wavetrainer/model/model_router.py
|
31
33
|
wavetrainer/reducer/__init__.py
|
@@ -1,80 +0,0 @@
|
|
1
|
-
"""A model that wraps catboost."""
|
2
|
-
|
3
|
-
import os
|
4
|
-
from typing import Any, Self
|
5
|
-
|
6
|
-
import optuna
|
7
|
-
import pandas as pd
|
8
|
-
from catboost import CatBoostClassifier, Pool # type: ignore
|
9
|
-
|
10
|
-
from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
11
|
-
|
12
|
-
_MODEL_FILENAME = "model.cbm"
|
13
|
-
|
14
|
-
|
15
|
-
class CatboostModel(Model):
|
16
|
-
"""A class that uses Catboost as a model."""
|
17
|
-
|
18
|
-
@classmethod
|
19
|
-
def name(cls) -> str:
|
20
|
-
return "catboost"
|
21
|
-
|
22
|
-
def __init__(self) -> None:
|
23
|
-
super().__init__()
|
24
|
-
self._catboost = CatBoostClassifier()
|
25
|
-
|
26
|
-
@property
|
27
|
-
def estimator(self) -> Any:
|
28
|
-
return self._catboost
|
29
|
-
|
30
|
-
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
31
|
-
iterations = trial.suggest_int("iterations", 100, 10000)
|
32
|
-
learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3)
|
33
|
-
depth = trial.suggest_int("depth", 1, 12)
|
34
|
-
l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 3.0, 50.0)
|
35
|
-
boosting_type = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
|
36
|
-
self._catboost.set_params(
|
37
|
-
iterations=iterations,
|
38
|
-
learning_rate=learning_rate,
|
39
|
-
depth=depth,
|
40
|
-
l2_leaf_reg=l2_leaf_reg,
|
41
|
-
boosting_type=boosting_type,
|
42
|
-
early_stopping_rounds=100,
|
43
|
-
)
|
44
|
-
|
45
|
-
def load(self, folder: str) -> None:
|
46
|
-
self._catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
|
47
|
-
|
48
|
-
def save(self, folder: str) -> None:
|
49
|
-
self._catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
|
50
|
-
|
51
|
-
def fit(
|
52
|
-
self,
|
53
|
-
df: pd.DataFrame,
|
54
|
-
y: pd.Series | pd.DataFrame | None = None,
|
55
|
-
w: pd.Series | None = None,
|
56
|
-
) -> Self:
|
57
|
-
train_pool = Pool(
|
58
|
-
df,
|
59
|
-
label=y,
|
60
|
-
weight=w,
|
61
|
-
)
|
62
|
-
self._catboost.fit(
|
63
|
-
train_pool,
|
64
|
-
early_stopping_rounds=100,
|
65
|
-
)
|
66
|
-
return self
|
67
|
-
|
68
|
-
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
69
|
-
pred_pool = Pool(df)
|
70
|
-
pred = self._catboost.predict(pred_pool)
|
71
|
-
proba = self._catboost.predict_proba(pred_pool)
|
72
|
-
df = pd.DataFrame(
|
73
|
-
index=df.index,
|
74
|
-
data={
|
75
|
-
PREDICTION_COLUMN: pred.flatten(),
|
76
|
-
},
|
77
|
-
)
|
78
|
-
for i in range(proba.shape[1]):
|
79
|
-
df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
|
80
|
-
return df
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|