wavetrainer 0.0.8__tar.gz → 0.0.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.8/wavetrainer.egg-info → wavetrainer-0.0.10}/PKG-INFO +1 -1
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/setup.py +1 -1
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/tests/trainer_test.py +22 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/model/catboost_model.py +18 -7
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/reducer/base_selector_reducer.py +13 -1
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/reducer/correlation_reducer.py +4 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/selector/selector.py +4 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/trainer.py +31 -10
- {wavetrainer-0.0.8 → wavetrainer-0.0.10/wavetrainer.egg-info}/PKG-INFO +1 -1
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/LICENSE +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/MANIFEST.in +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/README.md +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/requirements.txt +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/setup.cfg +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/tests/__init__.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/model/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/model/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/model/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/model/model.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/model/model_router.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/reducer/combined_reducer.py +1 -1
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer.egg-info/SOURCES.txt +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer.egg-info/requires.txt +0 -0
- {wavetrainer-0.0.8 → wavetrainer-0.0.10}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
|
|
23
23
|
|
24
24
|
setup(
|
25
25
|
name='wavetrainer',
|
26
|
-
version='0.0.8',
|
26
|
+
version='0.0.10',
|
27
27
|
description='A library for automatically finding the optimal model within feature and hyperparameter space.',
|
28
28
|
long_description=long_description,
|
29
29
|
long_description_content_type='text/markdown',
|
@@ -37,3 +37,25 @@ class TestTrainer(unittest.TestCase):
|
|
37
37
|
df = trainer.transform(df)
|
38
38
|
print("df:")
|
39
39
|
print(df)
|
40
|
+
|
41
|
+
def test_trainer_dt_column(self):
|
42
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
43
|
+
trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=1, dt_column="dt_column")
|
44
|
+
x_data = [i for i in range(100)]
|
45
|
+
x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
|
46
|
+
df = pd.DataFrame(
|
47
|
+
data={
|
48
|
+
"column1": x_data,
|
49
|
+
"dt_column": x_index,
|
50
|
+
},
|
51
|
+
)
|
52
|
+
y = pd.DataFrame(
|
53
|
+
data={
|
54
|
+
"y": [x % 2 == 0 for x in x_data],
|
55
|
+
},
|
56
|
+
index=df.index,
|
57
|
+
)
|
58
|
+
trainer.fit(df, y=y)
|
59
|
+
df = trainer.transform(df)
|
60
|
+
print("df:")
|
61
|
+
print(df)
|
@@ -23,12 +23,13 @@ _DEPTH_KEY = "depth"
|
|
23
23
|
_L2_LEAF_REG_KEY = "l2_leaf_reg"
|
24
24
|
_BOOSTING_TYPE_KEY = "boosting_type"
|
25
25
|
_MODEL_TYPE_KEY = "model_type"
|
26
|
+
_EARLY_STOPPING_ROUNDS = "early_stopping_rounds"
|
26
27
|
|
27
28
|
|
28
29
|
class CatboostModel(Model):
|
29
30
|
"""A class that uses Catboost as a model."""
|
30
31
|
|
31
|
-
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
32
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments,too-many-instance-attributes
|
32
33
|
|
33
34
|
_catboost: CatBoost | None
|
34
35
|
_iterations: None | int
|
@@ -37,6 +38,7 @@ class CatboostModel(Model):
|
|
37
38
|
_l2_leaf_reg: None | float
|
38
39
|
_boosting_type: None | str
|
39
40
|
_model_type: None | ModelType
|
41
|
+
_early_stopping_rounds: None | int
|
40
42
|
|
41
43
|
@classmethod
|
42
44
|
def name(cls) -> str:
|
@@ -51,6 +53,7 @@ class CatboostModel(Model):
|
|
51
53
|
self._l2_leaf_reg = None
|
52
54
|
self._boosting_type = None
|
53
55
|
self._model_type = None
|
56
|
+
self._early_stopping_rounds = None
|
54
57
|
|
55
58
|
@property
|
56
59
|
def estimator(self) -> Any:
|
@@ -80,6 +83,9 @@ class CatboostModel(Model):
|
|
80
83
|
self._boosting_type = trial.suggest_categorical(
|
81
84
|
_BOOSTING_TYPE_KEY, ["Ordered", "Plain"]
|
82
85
|
)
|
86
|
+
self._early_stopping_rounds = trial.suggest_int(
|
87
|
+
_EARLY_STOPPING_ROUNDS, 10, 1000
|
88
|
+
)
|
83
89
|
|
84
90
|
def load(self, folder: str) -> None:
|
85
91
|
with open(
|
@@ -92,6 +98,7 @@ class CatboostModel(Model):
|
|
92
98
|
self._l2_leaf_reg = params[_L2_LEAF_REG_KEY]
|
93
99
|
self._boosting_type = params[_BOOSTING_TYPE_KEY]
|
94
100
|
self._model_type = ModelType(params[_MODEL_TYPE_KEY])
|
101
|
+
self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS]
|
95
102
|
catboost = self._provide_catboost()
|
96
103
|
catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
|
97
104
|
|
@@ -107,6 +114,7 @@ class CatboostModel(Model):
|
|
107
114
|
_L2_LEAF_REG_KEY: self._l2_leaf_reg,
|
108
115
|
_BOOSTING_TYPE_KEY: self._boosting_type,
|
109
116
|
_MODEL_TYPE_KEY: str(self._model_type),
|
117
|
+
_EARLY_STOPPING_ROUNDS: self._early_stopping_rounds,
|
110
118
|
},
|
111
119
|
handle,
|
112
120
|
)
|
@@ -141,7 +149,7 @@ class CatboostModel(Model):
|
|
141
149
|
)
|
142
150
|
catboost.fit(
|
143
151
|
train_pool,
|
144
|
-
early_stopping_rounds=
|
152
|
+
early_stopping_rounds=self._early_stopping_rounds,
|
145
153
|
verbose=False,
|
146
154
|
metric_period=100,
|
147
155
|
eval_set=eval_pool,
|
@@ -149,7 +157,10 @@ class CatboostModel(Model):
|
|
149
157
|
return self
|
150
158
|
|
151
159
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
152
|
-
pred_pool = Pool(
|
160
|
+
pred_pool = Pool(
|
161
|
+
df,
|
162
|
+
cat_features=df.select_dtypes(include="category").columns.tolist(),
|
163
|
+
)
|
153
164
|
catboost = self._provide_catboost()
|
154
165
|
pred = catboost.predict(pred_pool)
|
155
166
|
df = pd.DataFrame(
|
@@ -175,7 +186,7 @@ class CatboostModel(Model):
|
|
175
186
|
depth=self._depth,
|
176
187
|
l2_leaf_reg=self._l2_leaf_reg,
|
177
188
|
boosting_type=self._boosting_type,
|
178
|
-
early_stopping_rounds=
|
189
|
+
early_stopping_rounds=self._early_stopping_rounds,
|
179
190
|
metric_period=100,
|
180
191
|
)
|
181
192
|
case ModelType.REGRESSION:
|
@@ -185,7 +196,7 @@ class CatboostModel(Model):
|
|
185
196
|
depth=self._depth,
|
186
197
|
l2_leaf_reg=self._l2_leaf_reg,
|
187
198
|
boosting_type=self._boosting_type,
|
188
|
-
early_stopping_rounds=
|
199
|
+
early_stopping_rounds=self._early_stopping_rounds,
|
189
200
|
metric_period=100,
|
190
201
|
)
|
191
202
|
case ModelType.BINNED_BINARY:
|
@@ -195,7 +206,7 @@ class CatboostModel(Model):
|
|
195
206
|
depth=self._depth,
|
196
207
|
l2_leaf_reg=self._l2_leaf_reg,
|
197
208
|
boosting_type=self._boosting_type,
|
198
|
-
early_stopping_rounds=
|
209
|
+
early_stopping_rounds=self._early_stopping_rounds,
|
199
210
|
metric_period=100,
|
200
211
|
)
|
201
212
|
case ModelType.MULTI_CLASSIFICATION:
|
@@ -205,7 +216,7 @@ class CatboostModel(Model):
|
|
205
216
|
depth=self._depth,
|
206
217
|
l2_leaf_reg=self._l2_leaf_reg,
|
207
218
|
boosting_type=self._boosting_type,
|
208
|
-
early_stopping_rounds=
|
219
|
+
early_stopping_rounds=self._early_stopping_rounds,
|
209
220
|
metric_period=100,
|
210
221
|
)
|
211
222
|
self._catboost = catboost
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""A reducer that uses a base selector from the feature engine."""
|
2
2
|
|
3
|
+
import logging
|
3
4
|
import os
|
4
5
|
from typing import Self
|
5
6
|
|
@@ -26,6 +27,11 @@ class BaseSelectorReducer(Reducer):
|
|
26
27
|
def name(cls) -> str:
|
27
28
|
raise NotImplementedError("name not implemented in parent class.")
|
28
29
|
|
30
|
+
@classmethod
|
31
|
+
def should_raise(cls) -> bool:
|
32
|
+
"""Whether the class should raise its exception if it encounters it."""
|
33
|
+
return True
|
34
|
+
|
29
35
|
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
30
36
|
pass
|
31
37
|
|
@@ -45,11 +51,17 @@ class BaseSelectorReducer(Reducer):
|
|
45
51
|
eval_x: pd.DataFrame | None = None,
|
46
52
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
47
53
|
) -> Self:
|
54
|
+
if len(df.columns) <= 1:
|
55
|
+
return self
|
48
56
|
try:
|
49
57
|
self._base_selector.fit(df) # type: ignore
|
50
58
|
except ValueError as exc:
|
51
|
-
|
59
|
+
logging.warning(str(exc))
|
60
|
+
if self.should_raise():
|
61
|
+
raise WavetrainException() from exc
|
52
62
|
return self
|
53
63
|
|
54
64
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
65
|
+
if len(df.columns) <= 1:
|
66
|
+
return df
|
55
67
|
return self._base_selector.transform(df)
|
@@ -53,6 +53,8 @@ class Selector(Params, Fit):
|
|
53
53
|
model_kwargs = self._model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y)
|
54
54
|
if not isinstance(y, pd.Series):
|
55
55
|
raise ValueError("y is not a series.")
|
56
|
+
if len(df.columns) <= 1:
|
57
|
+
return self
|
56
58
|
n_features_to_select = max(1, int(len(df.columns) * self._feature_ratio))
|
57
59
|
self._selector = RFE(
|
58
60
|
self._model.estimator,
|
@@ -70,6 +72,8 @@ class Selector(Params, Fit):
|
|
70
72
|
return self
|
71
73
|
|
72
74
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
75
|
+
if len(df.columns) <= 1:
|
76
|
+
return df
|
73
77
|
selector = self._selector
|
74
78
|
if selector is None:
|
75
79
|
raise ValueError("selector is null.")
|
@@ -165,7 +165,11 @@ class Trainer(Fit):
|
|
165
165
|
if y is None:
|
166
166
|
return self
|
167
167
|
|
168
|
-
dt_index =
|
168
|
+
dt_index = (
|
169
|
+
df.index
|
170
|
+
if self._dt_column is None
|
171
|
+
else pd.DatetimeIndex(pd.to_datetime(df[self._dt_column]))
|
172
|
+
)
|
169
173
|
|
170
174
|
def _fit_column(y_series: pd.Series):
|
171
175
|
column_dir = os.path.join(self._folder, str(y_series.name))
|
@@ -184,10 +188,10 @@ class Trainer(Fit):
|
|
184
188
|
trial.set_user_attr(_IDX_USR_ATTR_KEY, split_idx.isoformat())
|
185
189
|
|
186
190
|
train_dt_index = dt_index[: len(x)]
|
187
|
-
x_train = x[train_dt_index < split_idx]
|
188
|
-
x_test = x[train_dt_index >= split_idx]
|
189
|
-
y_train = y_series[train_dt_index < split_idx]
|
190
|
-
y_test = y_series[train_dt_index >= split_idx]
|
191
|
+
x_train = x[train_dt_index < split_idx] # type: ignore
|
192
|
+
x_test = x[train_dt_index >= split_idx] # type: ignore
|
193
|
+
y_train = y_series[train_dt_index < split_idx] # type: ignore
|
194
|
+
y_test = y_series[train_dt_index >= split_idx] # type: ignore
|
191
195
|
|
192
196
|
try:
|
193
197
|
# Window the data
|
@@ -250,14 +254,15 @@ class Trainer(Fit):
|
|
250
254
|
return float(r2_score(y_test, y_pred[[PREDICTION_COLUMN]]))
|
251
255
|
return float(f1_score(y_test, y_pred[[PREDICTION_COLUMN]]))
|
252
256
|
except WavetrainException as exc:
|
257
|
+
logging.warning("WE DID NOT END UP TRAINING ANYTHING!!!!!")
|
253
258
|
logging.warning(str(exc))
|
254
259
|
return -1.0
|
255
260
|
|
256
261
|
start_validation_index = (
|
257
|
-
dt_index[-int(len(dt_index) * self._validation_size) - 1]
|
262
|
+
dt_index.to_list()[-int(len(dt_index) * self._validation_size) - 1]
|
258
263
|
if isinstance(self._validation_size, float)
|
259
264
|
else dt_index[
|
260
|
-
dt_index >= (dt_index.to_list()[-1] - self._validation_size)
|
265
|
+
dt_index >= (dt_index.to_list()[-1] - self._validation_size) # type: ignore
|
261
266
|
].to_list()[0]
|
262
267
|
)
|
263
268
|
test_df = df[dt_index < start_validation_index]
|
@@ -288,7 +293,12 @@ class Trainer(Fit):
|
|
288
293
|
)
|
289
294
|
|
290
295
|
train_len = len(df[dt_index < start_test_index])
|
291
|
-
test_len = len(
|
296
|
+
test_len = len(
|
297
|
+
dt_index[
|
298
|
+
(dt_index >= start_test_index)
|
299
|
+
& (dt_index <= start_validation_index)
|
300
|
+
]
|
301
|
+
)
|
292
302
|
|
293
303
|
last_processed_dt = None
|
294
304
|
for count, test_idx in tqdm.tqdm(
|
@@ -341,7 +351,11 @@ class Trainer(Fit):
|
|
341
351
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
342
352
|
"""Predict the expected values of the data."""
|
343
353
|
feature_columns = df.columns.values
|
344
|
-
dt_index =
|
354
|
+
dt_index = (
|
355
|
+
df.index
|
356
|
+
if self._dt_column is None
|
357
|
+
else pd.DatetimeIndex(pd.to_datetime(df[self._dt_column]))
|
358
|
+
)
|
345
359
|
|
346
360
|
for column in os.listdir(self._folder):
|
347
361
|
column_path = os.path.join(self._folder, column)
|
@@ -353,6 +367,8 @@ class Trainer(Fit):
|
|
353
367
|
if not os.path.isdir(date_path):
|
354
368
|
continue
|
355
369
|
dates.append(datetime.datetime.fromisoformat(date_str))
|
370
|
+
if not dates:
|
371
|
+
raise ValueError(f"no dates found for {column}.")
|
356
372
|
bins: list[datetime.datetime] = sorted(
|
357
373
|
[dt_index.min().to_pydatetime()]
|
358
374
|
+ dates
|
@@ -371,7 +387,12 @@ class Trainer(Fit):
|
|
371
387
|
column: str,
|
372
388
|
dates: list[datetime.datetime],
|
373
389
|
) -> pd.DataFrame:
|
374
|
-
|
390
|
+
group_dt_index = (
|
391
|
+
group.index
|
392
|
+
if self._dt_column is None
|
393
|
+
else pd.DatetimeIndex(pd.to_datetime(group[self._dt_column]))
|
394
|
+
)
|
395
|
+
filtered_dates = [x for x in dates if x < group_dt_index.max()]
|
375
396
|
if not filtered_dates:
|
376
397
|
filtered_dates = [dates[-1]]
|
377
398
|
date_str = dates[-1].isoformat()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|