wavetrainer 0.0.3__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {wavetrainer-0.0.3/wavetrainer.egg-info → wavetrainer-0.0.5}/PKG-INFO +1 -1
  2. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/setup.py +1 -1
  3. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/tests/trainer_test.py +1 -1
  4. wavetrainer-0.0.5/wavetrainer/__init__.py +6 -0
  5. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/mapie_calibrator.py +13 -8
  6. wavetrainer-0.0.5/wavetrainer/model/catboost_model.py +187 -0
  7. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/model/model.py +6 -0
  8. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/model/model_router.py +7 -0
  9. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/model_type.py +6 -6
  10. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/combined_reducer.py +10 -2
  11. wavetrainer-0.0.5/wavetrainer/reducer/nonnumeric_reducer.py +39 -0
  12. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/selector/selector.py +19 -13
  13. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/trainer.py +15 -7
  14. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/class_weights.py +1 -1
  15. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/windower/windower.py +5 -2
  16. {wavetrainer-0.0.3 → wavetrainer-0.0.5/wavetrainer.egg-info}/PKG-INFO +1 -1
  17. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/SOURCES.txt +1 -1
  18. wavetrainer-0.0.3/wavetrainer/__init__.py +0 -10
  19. wavetrainer-0.0.3/wavetrainer/load.py +0 -8
  20. wavetrainer-0.0.3/wavetrainer/model/catboost_model.py +0 -80
  21. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/LICENSE +0 -0
  22. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/MANIFEST.in +0 -0
  23. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/README.md +0 -0
  24. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/requirements.txt +0 -0
  25. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/setup.cfg +0 -0
  26. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/tests/__init__.py +0 -0
  27. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/__init__.py +0 -0
  28. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/calibrator.py +0 -0
  29. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/calibrator_router.py +0 -0
  30. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  31. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/create.py +0 -0
  32. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/exceptions.py +0 -0
  33. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/fit.py +0 -0
  34. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/model/__init__.py +0 -0
  35. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/params.py +0 -0
  36. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/__init__.py +0 -0
  37. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  38. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/constant_reducer.py +0 -0
  39. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/correlation_reducer.py +0 -0
  40. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  41. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/reducer/reducer.py +0 -0
  42. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/selector/__init__.py +0 -0
  43. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/__init__.py +0 -0
  44. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/combined_weights.py +0 -0
  45. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/exponential_weights.py +0 -0
  46. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/linear_weights.py +0 -0
  47. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/noop_weights.py +0 -0
  48. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/sigmoid_weights.py +0 -0
  49. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/weights.py +0 -0
  50. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/weights/weights_router.py +0 -0
  51. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer/windower/__init__.py +0 -0
  52. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/dependency_links.txt +0 -0
  53. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/not-zip-safe +0 -0
  54. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/requires.txt +0 -0
  55. {wavetrainer-0.0.3 → wavetrainer-0.0.5}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.3
3
+ Version: 0.0.5
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
23
23
 
24
24
  setup(
25
25
  name='wavetrainer',
26
- version='0.0.3',
26
+ version='0.0.5',
27
27
  description='A library for automatically finding the optimal model within feature and hyperparameter space.',
28
28
  long_description=long_description,
29
29
  long_description_content_type='text/markdown',
@@ -13,7 +13,7 @@ class TestTrainer(unittest.TestCase):
13
13
 
14
14
  def test_trainer(self):
15
15
  with tempfile.TemporaryDirectory() as tmpdir:
16
- trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=1), trials=10)
16
+ trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=1)
17
17
  x_data = [i for i in range(100)]
18
18
  x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
19
19
  df = pd.DataFrame(
@@ -0,0 +1,6 @@
1
+ """The wavetrain main module."""
2
+
3
+ from .create import create
4
+
5
+ __VERSION__ = "0.0.5"
6
+ __all__ = ("create",)
@@ -49,12 +49,17 @@ class MAPIECalibrator(Calibrator):
49
49
  return self
50
50
 
51
51
  def transform(self, df: pd.DataFrame) -> pd.DataFrame:
52
- alpha = [0.05, 0.32]
53
- _, y_pis = self._mapie.predict(df, alpha=alpha)
54
- df = pd.DataFrame(data=None, index=df.index)
55
- for i in range(y_pis.shape[1]):
56
- for ii in range(y_pis.shape[2]):
57
- df[f"{PROBABILITY_COLUMN_PREFIX}{alpha[i]}_{ii == 1}"] = (
58
- y_pis[:, i, ii].flatten().tolist()
59
- )
52
+ alpha = []
53
+ for potential_alpha in [0.05, 0.32]:
54
+ if len(df) > int(1.0 / potential_alpha):
55
+ alpha.append(potential_alpha)
56
+ if alpha:
57
+ _, y_pis = self._mapie.predict(df, alpha=alpha)
58
+ for i in range(y_pis.shape[1]):
59
+ if i >= len(alpha):
60
+ continue
61
+ for ii in range(y_pis.shape[2]):
62
+ alpha_val = alpha[i]
63
+ values = y_pis[:, i, ii].flatten().tolist()
64
+ df[f"{PROBABILITY_COLUMN_PREFIX}{alpha_val}_{ii == 1}"] = values
60
65
  return df
@@ -0,0 +1,187 @@
1
+ """A model that wraps catboost."""
2
+
3
+ import json
4
+ import os
5
+ from typing import Any, Self
6
+
7
+ import optuna
8
+ import pandas as pd
9
+ from catboost import CatBoostClassifier # type: ignore
10
+ from catboost import CatBoost, CatBoostRegressor, Pool
11
+
12
+ from ..model_type import ModelType, determine_model_type
13
+ from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
14
+
15
+ _MODEL_FILENAME = "model.cbm"
16
+ _MODEL_PARAMS_FILENAME = "model_params.json"
17
+ _ITERATIONS_KEY = "iterations"
18
+ _LEARNING_RATE_KEY = "learning_rate"
19
+ _DEPTH_KEY = "depth"
20
+ _L2_LEAF_REG_KEY = "l2_leaf_reg"
21
+ _BOOSTING_TYPE_KEY = "boosting_type"
22
+ _MODEL_TYPE_KEY = "model_type"
23
+
24
+
25
+ class CatboostModel(Model):
26
+ """A class that uses Catboost as a model."""
27
+
28
+ _catboost: CatBoost | None
29
+ _iterations: None | int
30
+ _learning_rate: None | float
31
+ _depth: None | int
32
+ _l2_leaf_reg: None | float
33
+ _boosting_type: None | str
34
+ _model_type: None | ModelType
35
+
36
+ @classmethod
37
+ def name(cls) -> str:
38
+ return "catboost"
39
+
40
+ def __init__(self) -> None:
41
+ super().__init__()
42
+ self._catboost = None
43
+ self._iterations = None
44
+ self._learning_rate = None
45
+ self._depth = None
46
+ self._l2_leaf_reg = None
47
+ self._boosting_type = None
48
+ self._model_type = None
49
+
50
+ @property
51
+ def estimator(self) -> Any:
52
+ return self._provide_catboost()
53
+
54
+ def pre_fit(self, y: pd.Series | pd.DataFrame | None):
55
+ if y is None:
56
+ raise ValueError("y is null.")
57
+ self._model_type = determine_model_type(y)
58
+
59
+ def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
60
+ self._iterations = trial.suggest_int(_ITERATIONS_KEY, 100, 10000)
61
+ self._learning_rate = trial.suggest_float(_LEARNING_RATE_KEY, 0.001, 0.3)
62
+ self._depth = trial.suggest_int(_DEPTH_KEY, 1, 12)
63
+ self._l2_leaf_reg = trial.suggest_float(_L2_LEAF_REG_KEY, 3.0, 50.0)
64
+ self._boosting_type = trial.suggest_categorical(
65
+ _BOOSTING_TYPE_KEY, ["Ordered", "Plain"]
66
+ )
67
+
68
+ def load(self, folder: str) -> None:
69
+ with open(
70
+ os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
71
+ ) as handle:
72
+ params = json.load(handle)
73
+ self._iterations = params[_ITERATIONS_KEY]
74
+ self._learning_rate = params[_LEARNING_RATE_KEY]
75
+ self._depth = params[_DEPTH_KEY]
76
+ self._l2_leaf_reg = params[_L2_LEAF_REG_KEY]
77
+ self._boosting_type = params[_BOOSTING_TYPE_KEY]
78
+ self._model_type = ModelType(params[_MODEL_TYPE_KEY])
79
+ catboost = self._provide_catboost()
80
+ catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
81
+
82
+ def save(self, folder: str) -> None:
83
+ with open(
84
+ os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
85
+ ) as handle:
86
+ json.dump(
87
+ {
88
+ _ITERATIONS_KEY: self._iterations,
89
+ _LEARNING_RATE_KEY: self._learning_rate,
90
+ _DEPTH_KEY: self._depth,
91
+ _L2_LEAF_REG_KEY: self._l2_leaf_reg,
92
+ _BOOSTING_TYPE_KEY: self._boosting_type,
93
+ _MODEL_TYPE_KEY: str(self._model_type),
94
+ },
95
+ handle,
96
+ )
97
+ catboost = self._provide_catboost()
98
+ catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
99
+
100
+ def fit(
101
+ self,
102
+ df: pd.DataFrame,
103
+ y: pd.Series | pd.DataFrame | None = None,
104
+ w: pd.Series | None = None,
105
+ ) -> Self:
106
+ if y is None:
107
+ raise ValueError("y is null.")
108
+ self._model_type = determine_model_type(y)
109
+ catboost = self._provide_catboost()
110
+
111
+ train_pool = Pool(
112
+ df,
113
+ label=y,
114
+ weight=w,
115
+ )
116
+ catboost.fit(
117
+ train_pool,
118
+ early_stopping_rounds=100,
119
+ verbose=False,
120
+ metric_period=100,
121
+ )
122
+ return self
123
+
124
+ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
125
+ pred_pool = Pool(df)
126
+ catboost = self._provide_catboost()
127
+ pred = catboost.predict(pred_pool)
128
+ df = pd.DataFrame(
129
+ index=df.index,
130
+ data={
131
+ PREDICTION_COLUMN: pred.flatten(),
132
+ },
133
+ )
134
+ if self._model_type != ModelType.REGRESSION:
135
+ proba = catboost.predict_proba(pred_pool) # type: ignore
136
+ for i in range(proba.shape[1]):
137
+ df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
138
+ return df
139
+
140
+ def _provide_catboost(self) -> CatBoost:
141
+ catboost = self._catboost
142
+ if catboost is None:
143
+ match self._model_type:
144
+ case ModelType.BINARY:
145
+ catboost = CatBoostClassifier(
146
+ iterations=self._iterations,
147
+ learning_rate=self._learning_rate,
148
+ depth=self._depth,
149
+ l2_leaf_reg=self._l2_leaf_reg,
150
+ boosting_type=self._boosting_type,
151
+ early_stopping_rounds=100,
152
+ metric_period=100,
153
+ )
154
+ case ModelType.REGRESSION:
155
+ catboost = CatBoostRegressor(
156
+ iterations=self._iterations,
157
+ learning_rate=self._learning_rate,
158
+ depth=self._depth,
159
+ l2_leaf_reg=self._l2_leaf_reg,
160
+ boosting_type=self._boosting_type,
161
+ early_stopping_rounds=100,
162
+ metric_period=100,
163
+ )
164
+ case ModelType.BINNED_BINARY:
165
+ catboost = CatBoostClassifier(
166
+ iterations=self._iterations,
167
+ learning_rate=self._learning_rate,
168
+ depth=self._depth,
169
+ l2_leaf_reg=self._l2_leaf_reg,
170
+ boosting_type=self._boosting_type,
171
+ early_stopping_rounds=100,
172
+ metric_period=100,
173
+ )
174
+ case ModelType.MULTI_CLASSIFICATION:
175
+ catboost = CatBoostClassifier(
176
+ iterations=self._iterations,
177
+ learning_rate=self._learning_rate,
178
+ depth=self._depth,
179
+ l2_leaf_reg=self._l2_leaf_reg,
180
+ boosting_type=self._boosting_type,
181
+ early_stopping_rounds=100,
182
+ metric_period=100,
183
+ )
184
+ self._catboost = catboost
185
+ if catboost is None:
186
+ raise ValueError("catboost is null")
187
+ return catboost
@@ -2,6 +2,8 @@
2
2
 
3
3
  from typing import Any
4
4
 
5
+ import pandas as pd
6
+
5
7
  from ..fit import Fit
6
8
  from ..params import Params
7
9
 
@@ -21,3 +23,7 @@ class Model(Params, Fit):
21
23
  def estimator(self) -> Any:
22
24
  """The estimator backing the model."""
23
25
  raise NotImplementedError("estimator not implemented in parent class.")
26
+
27
+ def pre_fit(self, y: pd.Series | pd.DataFrame | None) -> None:
28
+ """A call to make sure the model is prepared for the target type."""
29
+ raise NotImplementedError("pre_fit not implemented in parent class.")
@@ -37,10 +37,17 @@ class ModelRouter(Model):
37
37
  raise ValueError("model is null")
38
38
  return model.estimator
39
39
 
40
+ def pre_fit(self, y: pd.Series | pd.DataFrame | None):
41
+ model = self._model
42
+ if model is None:
43
+ raise ValueError("model is null")
44
+ model.pre_fit(y)
45
+
40
46
  def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
41
47
  self._model = _MODELS[
42
48
  trial.suggest_categorical("model", list(_MODELS.keys()))
43
49
  ]()
50
+ self._model.set_options(trial)
44
51
 
45
52
  def load(self, folder: str) -> None:
46
53
  with open(os.path.join(folder, _MODEL_ROUTER_FILE), encoding="utf8") as handle:
@@ -1,17 +1,17 @@
1
1
  """An enum to define the model type."""
2
2
 
3
- from enum import Enum
3
+ from enum import StrEnum, auto
4
4
 
5
5
  import pandas as pd
6
6
 
7
7
 
8
- class ModelType(Enum):
8
+ class ModelType(StrEnum):
9
9
  """The type of model being run."""
10
10
 
11
- BINARY = 1
12
- REGRESSION = 2
13
- BINNED_BINARY = 3
14
- MULTI_CLASSIFICATION = 4
11
+ BINARY = auto()
12
+ REGRESSION = auto()
13
+ BINNED_BINARY = auto()
14
+ MULTI_CLASSIFICATION = auto()
15
15
 
16
16
 
17
17
  def determine_model_type(y: pd.Series | pd.DataFrame) -> ModelType:
@@ -10,6 +10,7 @@ import pandas as pd
10
10
  from .constant_reducer import ConstantReducer
11
11
  from .correlation_reducer import CorrelationReducer
12
12
  from .duplicate_reducer import DuplicateReducer
13
+ from .nonnumeric_reducer import NonNumericReducer
13
14
  from .reducer import Reducer
14
15
 
15
16
  _COMBINED_REDUCER_FILE = "combined_reducer.json"
@@ -21,7 +22,12 @@ class CombinedReducer(Reducer):
21
22
 
22
23
  def __init__(self):
23
24
  super().__init__()
24
- self._reducers = [ConstantReducer(), DuplicateReducer(), CorrelationReducer()]
25
+ self._reducers = [
26
+ ConstantReducer(),
27
+ DuplicateReducer(),
28
+ CorrelationReducer(),
29
+ NonNumericReducer(),
30
+ ]
25
31
 
26
32
  @classmethod
27
33
  def name(cls) -> str:
@@ -44,6 +50,8 @@ class CombinedReducer(Reducer):
44
50
  self._reducers.append(DuplicateReducer())
45
51
  elif reducer_name == CorrelationReducer.name():
46
52
  self._reducers.append(CorrelationReducer())
53
+ elif reducer_name == NonNumericReducer.name():
54
+ self._reducers.append(NonNumericReducer())
47
55
  for reducer in self._reducers:
48
56
  reducer.load(folder)
49
57
 
@@ -67,7 +75,7 @@ class CombinedReducer(Reducer):
67
75
  w: pd.Series | None = None,
68
76
  ) -> Self:
69
77
  for reducer in self._reducers:
70
- reducer.fit(df)
78
+ df = reducer.fit_transform(df)
71
79
  return self
72
80
 
73
81
  def transform(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -0,0 +1,39 @@
1
+ """A reducer that removes non-numeric columns."""
2
+
3
+ from typing import Self
4
+
5
+ import optuna
6
+ import pandas as pd
7
+
8
+ from .reducer import Reducer
9
+
10
+
11
+ class NonNumericReducer(Reducer):
12
+ """A class that removes non numeric columns from a dataframe."""
13
+
14
+ @classmethod
15
+ def name(cls) -> str:
16
+ return "nonnumeric"
17
+
18
+ def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
19
+ pass
20
+
21
+ def load(self, folder: str) -> None:
22
+ pass
23
+
24
+ def save(self, folder: str) -> None:
25
+ pass
26
+
27
+ def fit(
28
+ self,
29
+ df: pd.DataFrame,
30
+ y: pd.Series | pd.DataFrame | None = None,
31
+ w: pd.Series | None = None,
32
+ ) -> Self:
33
+ return self
34
+
35
+ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
36
+ categorical_cols = df.select_dtypes(include="category").columns.tolist()
37
+ numeric_cols = df.select_dtypes(include="number").columns.tolist()
38
+ keep_cols = categorical_cols + numeric_cols
39
+ return df[keep_cols]
@@ -19,24 +19,18 @@ _SELECTOR_FILE = "selector.joblib"
19
19
  class Selector(Params, Fit):
20
20
  """The selector class."""
21
21
 
22
- def __init__(self, model: Model, total_features: int):
22
+ _selector: RFE | None
23
+
24
+ def __init__(self, model: Model):
23
25
  super().__init__()
24
26
  self._model = model
25
27
  self._feature_ratio = 0.0
26
28
  self._steps = 0
27
- n_features_to_select = max(1, total_features * self._feature_ratio)
28
- self._selector = RFE(
29
- model.estimator,
30
- n_features_to_select=n_features_to_select,
31
- step=self._steps,
32
- verbose=1,
33
- )
29
+ self._selector = None
34
30
 
35
31
  def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
36
32
  self._feature_ratio = trial.suggest_float("feature_ratio", 0.0, 1.0)
37
- steps = trial.suggest_int("steps", 1, 16)
38
- self._steps = steps
39
- self._selector.step = steps
33
+ self._steps = trial.suggest_int("steps", 1, 16)
40
34
 
41
35
  def load(self, folder: str) -> None:
42
36
  self._selector = joblib.load(os.path.join(folder, _SELECTOR_FILE))
@@ -50,9 +44,18 @@ class Selector(Params, Fit):
50
44
  y: pd.Series | pd.DataFrame | None = None,
51
45
  w: pd.Series | None = None,
52
46
  ) -> Self:
47
+ self._model.pre_fit(y)
53
48
  if not isinstance(y, pd.Series):
54
49
  raise ValueError("y is not a series.")
55
-
50
+ n_features_to_select = max(1, int(len(df.columns) * self._feature_ratio))
51
+ self._selector = RFE(
52
+ self._model.estimator,
53
+ n_features_to_select=n_features_to_select,
54
+ step=max(
55
+ 1,
56
+ int((len(df.columns) - n_features_to_select) / self._steps),
57
+ ),
58
+ )
56
59
  try:
57
60
  self._selector.fit(df, y=y, sample_weight=w)
58
61
  except ValueError as exc:
@@ -61,8 +64,11 @@ class Selector(Params, Fit):
61
64
  return self
62
65
 
63
66
  def transform(self, df: pd.DataFrame) -> pd.DataFrame:
67
+ selector = self._selector
68
+ if selector is None:
69
+ raise ValueError("selector is null.")
64
70
  try:
65
- return df[self._selector.get_feature_names_out()]
71
+ return df[selector.get_feature_names_out()]
66
72
  except AttributeError as exc:
67
73
  # Catch issues with 1 feature as a reduction target.
68
74
  logging.warning(str(exc))
@@ -11,7 +11,7 @@ from typing import Self
11
11
  import optuna
12
12
  import pandas as pd
13
13
  import tqdm
14
- from sklearn.metrics import accuracy_score, f1_score # type: ignore
14
+ from sklearn.metrics import f1_score, mean_absolute_error # type: ignore
15
15
 
16
16
  from .calibrator.calibrator_router import CalibratorRouter
17
17
  from .exceptions import WavetrainException
@@ -215,7 +215,7 @@ class Trainer(Fit):
215
215
  model.set_options(trial)
216
216
 
217
217
  # Train
218
- selector = Selector(model, len(x_train.columns.values))
218
+ selector = Selector(model)
219
219
  selector.set_options(trial)
220
220
  selector.fit(x_train, y=y_train, w=w)
221
221
  x_train = selector.transform(x_train)
@@ -243,7 +243,7 @@ class Trainer(Fit):
243
243
  y_pred = model.transform(x_test)
244
244
  y_pred = calibrator.transform(y_pred)
245
245
  if determine_model_type(y_series) == ModelType.REGRESSION:
246
- return accuracy_score(y_test, y_pred[[PREDICTION_COLUMN]])
246
+ return mean_absolute_error(y_test, y_pred[[PREDICTION_COLUMN]])
247
247
  return f1_score(y_test, y_pred[[PREDICTION_COLUMN]])
248
248
  except WavetrainException as exc:
249
249
  logging.warning(str(exc))
@@ -252,14 +252,16 @@ class Trainer(Fit):
252
252
  start_validation_index = (
253
253
  dt_index[-int(len(dt_index) * self._validation_size) - 1]
254
254
  if isinstance(self._validation_size, float)
255
- else dt_index[dt_index >= self._validation_size][0]
255
+ else dt_index[
256
+ dt_index >= (dt_index.to_list()[-1] - self._validation_size)
257
+ ].to_list()[0]
256
258
  )
257
259
  test_df = df[dt_index < start_validation_index]
258
260
  test_dt_index = (
259
261
  test_df.index if self._dt_column is None else test_df[self._dt_column]
260
262
  )
261
263
  start_test_index = (
262
- test_dt_index[-int(len(test_dt_index) * self._test_size)]
264
+ test_dt_index.to_list()[-int(len(test_dt_index) * self._test_size)]
263
265
  if isinstance(self._test_size, float)
264
266
  else test_dt_index[test_dt_index >= self._test_size][0]
265
267
  )
@@ -284,9 +286,15 @@ class Trainer(Fit):
284
286
  train_len = len(df[dt_index < start_test_index])
285
287
  test_len = len(df.loc[start_test_index:start_validation_index])
286
288
 
289
+ last_processed_dt = None
287
290
  for count, test_idx in tqdm.tqdm(
288
- enumerate(df[dt_index >= start_test_index].index)
291
+ enumerate(test_dt_index[test_dt_index >= start_test_index])
289
292
  ):
293
+ if (
294
+ last_processed_dt is not None
295
+ and test_idx < last_processed_dt + self._walkforward_timedelta
296
+ ):
297
+ continue
290
298
  test_dt = test_idx.to_pydatetime()
291
299
  found = False
292
300
  for trial in study.trials:
@@ -371,7 +379,7 @@ class Trainer(Fit):
371
379
  model = ModelRouter()
372
380
  model.load(folder)
373
381
 
374
- selector = Selector(model, len(df.columns.values))
382
+ selector = Selector(model)
375
383
  selector.load(folder)
376
384
 
377
385
  calibrator = CalibratorRouter(model)
@@ -47,7 +47,7 @@ class ClassWeights(Weights):
47
47
  self._class_weights = {}
48
48
  return self
49
49
 
50
- arr = df.astype(int).to_numpy().flatten().astype(float)
50
+ arr = y.astype(int).to_numpy().flatten().astype(float)
51
51
  unique_vals = np.unique(arr)
52
52
  w_arr = compute_class_weight(
53
53
  class_weight="balanced", classes=unique_vals, y=arr
@@ -52,7 +52,9 @@ class Windower(Params, Fit):
52
52
  lookback_ratio = self._lookback_ratio
53
53
  if lookback_ratio is None:
54
54
  raise ValueError("lookback_ratio is null")
55
- dt_index = df.index if self._dt_column is None else df[self._dt_column]
55
+ dt_index = (
56
+ df.index if self._dt_column is None else df[self._dt_column].to_list()
57
+ )
56
58
  start_idx = dt_index[int(len(df) * lookback_ratio)]
57
59
  end_idx = dt_index[-1]
58
60
  td = end_idx.to_pydatetime() - start_idx.to_pydatetime()
@@ -66,5 +68,6 @@ class Windower(Params, Fit):
66
68
  dt_index = df.index if self._dt_column is None else df[self._dt_column]
67
69
  return df[
68
70
  dt_index
69
- >= dt_index[-1].to_pydatetime() - datetime.timedelta(seconds=lookback)
71
+ >= dt_index.to_list()[-1].to_pydatetime()
72
+ - datetime.timedelta(seconds=lookback)
70
73
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.3
3
+ Version: 0.0.5
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -9,7 +9,6 @@ wavetrainer/__init__.py
9
9
  wavetrainer/create.py
10
10
  wavetrainer/exceptions.py
11
11
  wavetrainer/fit.py
12
- wavetrainer/load.py
13
12
  wavetrainer/model_type.py
14
13
  wavetrainer/params.py
15
14
  wavetrainer/trainer.py
@@ -34,6 +33,7 @@ wavetrainer/reducer/combined_reducer.py
34
33
  wavetrainer/reducer/constant_reducer.py
35
34
  wavetrainer/reducer/correlation_reducer.py
36
35
  wavetrainer/reducer/duplicate_reducer.py
36
+ wavetrainer/reducer/nonnumeric_reducer.py
37
37
  wavetrainer/reducer/reducer.py
38
38
  wavetrainer/selector/__init__.py
39
39
  wavetrainer/selector/selector.py
@@ -1,10 +0,0 @@
1
- """The wavetrain main module."""
2
-
3
- from .create import create
4
- from .load import load
5
-
6
- __VERSION__ = "0.0.3"
7
- __all__ = (
8
- "create",
9
- "load",
10
- )
@@ -1,8 +0,0 @@
1
- """The function for loading the trainer state from disk."""
2
-
3
- from .trainer import Trainer
4
-
5
-
6
- def load(folder: str) -> Trainer:
7
- """Loads the trainer from the folder."""
8
- raise NotImplementedError("load isn't implemented.")
@@ -1,80 +0,0 @@
1
- """A model that wraps catboost."""
2
-
3
- import os
4
- from typing import Any, Self
5
-
6
- import optuna
7
- import pandas as pd
8
- from catboost import CatBoostClassifier, Pool # type: ignore
9
-
10
- from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
11
-
12
- _MODEL_FILENAME = "model.cbm"
13
-
14
-
15
- class CatboostModel(Model):
16
- """A class that uses Catboost as a model."""
17
-
18
- @classmethod
19
- def name(cls) -> str:
20
- return "catboost"
21
-
22
- def __init__(self) -> None:
23
- super().__init__()
24
- self._catboost = CatBoostClassifier()
25
-
26
- @property
27
- def estimator(self) -> Any:
28
- return self._catboost
29
-
30
- def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
31
- iterations = trial.suggest_int("iterations", 100, 10000)
32
- learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3)
33
- depth = trial.suggest_int("depth", 1, 12)
34
- l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 3.0, 50.0)
35
- boosting_type = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
36
- self._catboost.set_params(
37
- iterations=iterations,
38
- learning_rate=learning_rate,
39
- depth=depth,
40
- l2_leaf_reg=l2_leaf_reg,
41
- boosting_type=boosting_type,
42
- early_stopping_rounds=100,
43
- )
44
-
45
- def load(self, folder: str) -> None:
46
- self._catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
47
-
48
- def save(self, folder: str) -> None:
49
- self._catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
50
-
51
- def fit(
52
- self,
53
- df: pd.DataFrame,
54
- y: pd.Series | pd.DataFrame | None = None,
55
- w: pd.Series | None = None,
56
- ) -> Self:
57
- train_pool = Pool(
58
- df,
59
- label=y,
60
- weight=w,
61
- )
62
- self._catboost.fit(
63
- train_pool,
64
- early_stopping_rounds=100,
65
- )
66
- return self
67
-
68
- def transform(self, df: pd.DataFrame) -> pd.DataFrame:
69
- pred_pool = Pool(df)
70
- pred = self._catboost.predict(pred_pool)
71
- proba = self._catboost.predict_proba(pred_pool)
72
- df = pd.DataFrame(
73
- index=df.index,
74
- data={
75
- PREDICTION_COLUMN: pred.flatten(),
76
- },
77
- )
78
- for i in range(proba.shape[1]):
79
- df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
80
- return df
File without changes
File without changes
File without changes
File without changes