wavetrainer 0.0.24__tar.gz → 0.0.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.24/wavetrainer.egg-info → wavetrainer-0.0.26}/PKG-INFO +11 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/README.md +5 -0
- wavetrainer-0.0.24/wavetrainer.egg-info/requires.txt → wavetrainer-0.0.26/requirements.txt +5 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/setup.py +1 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/tests/trainer_test.py +1 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/create.py +2 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/model/catboost_model.py +6 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/model/model.py +9 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/model/model_router.py +14 -5
- wavetrainer-0.0.26/wavetrainer/model/tabpfn_model.py +145 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/reducer/combined_reducer.py +6 -1
- wavetrainer-0.0.26/wavetrainer/reducer/pca_reducer.py +77 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/selector/selector.py +6 -3
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/trainer.py +9 -3
- {wavetrainer-0.0.24 → wavetrainer-0.0.26/wavetrainer.egg-info}/PKG-INFO +11 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer.egg-info/SOURCES.txt +2 -0
- wavetrainer-0.0.24/requirements.txt → wavetrainer-0.0.26/wavetrainer.egg-info/requires.txt +6 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/LICENSE +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/MANIFEST.in +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/setup.cfg +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/tests/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/model/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/model/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/model/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/reducer/correlation_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.26}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.24
|
3
|
+
Version: 0.0.26
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -23,6 +23,11 @@ Requires-Dist: venn-abers>=1.4.6
|
|
23
23
|
Requires-Dist: mapie>=0.9.2
|
24
24
|
Requires-Dist: pytz>=2025.1
|
25
25
|
Requires-Dist: torch>=2.6.0
|
26
|
+
Requires-Dist: tabpfn>=2.0.6
|
27
|
+
Requires-Dist: tabpfn-extensions>=0.0.4
|
28
|
+
Requires-Dist: shap>=0.47.2
|
29
|
+
Requires-Dist: hyperopt>=0.2.7
|
30
|
+
Requires-Dist: pytest-is-running>=1.5.1
|
26
31
|
|
27
32
|
# wavetrainer
|
28
33
|
|
@@ -52,6 +57,11 @@ Python 3.11.6:
|
|
52
57
|
- [mapie](https://mapie.readthedocs.io/en/stable/)
|
53
58
|
- [pytz](https://pythonhosted.org/pytz/)
|
54
59
|
- [torch](https://pytorch.org/)
|
60
|
+
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
61
|
+
- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
|
62
|
+
- [shap](https://shap.readthedocs.io/en/latest/)
|
63
|
+
- [hyperopt](https://hyperopt.github.io/hyperopt/)
|
64
|
+
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
55
65
|
|
56
66
|
## Raison D'être :thought_balloon:
|
57
67
|
|
@@ -26,6 +26,11 @@ Python 3.11.6:
|
|
26
26
|
- [mapie](https://mapie.readthedocs.io/en/stable/)
|
27
27
|
- [pytz](https://pythonhosted.org/pytz/)
|
28
28
|
- [torch](https://pytorch.org/)
|
29
|
+
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
30
|
+
- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
|
31
|
+
- [shap](https://shap.readthedocs.io/en/latest/)
|
32
|
+
- [hyperopt](https://hyperopt.github.io/hyperopt/)
|
33
|
+
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
29
34
|
|
30
35
|
## Raison D'être :thought_balloon:
|
31
36
|
|
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
|
|
23
23
|
|
24
24
|
setup(
|
25
25
|
name='wavetrainer',
|
26
|
-
version='0.0.24',
|
26
|
+
version='0.0.26',
|
27
27
|
description='A library for automatically finding the optimal model within feature and hyperparameter space.',
|
28
28
|
long_description=long_description,
|
29
29
|
long_description_content_type='text/markdown',
|
@@ -14,7 +14,7 @@ class TestTrainer(unittest.TestCase):
|
|
14
14
|
def test_trainer(self):
|
15
15
|
with tempfile.TemporaryDirectory() as tmpdir:
|
16
16
|
trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=1)
|
17
|
-
x_data = [i for i in range(
|
17
|
+
x_data = [i for i in range(101)]
|
18
18
|
x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
|
19
19
|
df = pd.DataFrame(
|
20
20
|
data={
|
@@ -15,6 +15,7 @@ def create(
|
|
15
15
|
dt_column: str | None = None,
|
16
16
|
max_train_timeout: datetime.timedelta | None = None,
|
17
17
|
cutoff_dt: datetime.datetime | None = None,
|
18
|
+
max_features: int | None = None,
|
18
19
|
) -> Trainer:
|
19
20
|
"""Create a trainer."""
|
20
21
|
return Trainer(
|
@@ -25,4 +26,5 @@ def create(
|
|
25
26
|
dt_column=dt_column,
|
26
27
|
max_train_timeout=max_train_timeout,
|
27
28
|
cutoff_dt=cutoff_dt,
|
29
|
+
max_features=max_features,
|
28
30
|
)
|
@@ -61,12 +61,17 @@ class CatboostModel(Model):
|
|
61
61
|
def estimator(self) -> Any:
|
62
62
|
return self._provide_catboost()
|
63
63
|
|
64
|
+
@property
|
65
|
+
def supports_importances(self) -> bool:
|
66
|
+
return True
|
67
|
+
|
64
68
|
def pre_fit(
|
65
69
|
self,
|
66
70
|
df: pd.DataFrame,
|
67
71
|
y: pd.Series | pd.DataFrame | None,
|
68
72
|
eval_x: pd.DataFrame | None = None,
|
69
73
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
74
|
+
w: pd.Series | None = None,
|
70
75
|
):
|
71
76
|
if y is None:
|
72
77
|
raise ValueError("y is null.")
|
@@ -75,6 +80,7 @@ class CatboostModel(Model):
|
|
75
80
|
EVAL_SET_ARG_KEY: (eval_x, eval_y),
|
76
81
|
CAT_FEATURES_ARG_KEY: df.select_dtypes(include="category").columns.tolist(),
|
77
82
|
ORIGINAL_X_ARG_KEY: df,
|
83
|
+
"sample_weight": w,
|
78
84
|
}
|
79
85
|
|
80
86
|
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""The prototype model class."""
|
2
2
|
|
3
|
+
# pylint: disable=too-many-arguments,too-many-positional-arguments
|
3
4
|
from typing import Any
|
4
5
|
|
5
6
|
import pandas as pd
|
@@ -24,12 +25,20 @@ class Model(Params, Fit):
|
|
24
25
|
"""The estimator backing the model."""
|
25
26
|
raise NotImplementedError("estimator not implemented in parent class.")
|
26
27
|
|
28
|
+
@property
|
29
|
+
def supports_importances(self) -> bool:
|
30
|
+
"""Whether this model supports feature importances."""
|
31
|
+
raise NotImplementedError(
|
32
|
+
"supports_importances not implemented in parent class."
|
33
|
+
)
|
34
|
+
|
27
35
|
def pre_fit(
|
28
36
|
self,
|
29
37
|
df: pd.DataFrame,
|
30
38
|
y: pd.Series | pd.DataFrame | None,
|
31
39
|
eval_x: pd.DataFrame | None = None,
|
32
40
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
41
|
+
w: pd.Series | None = None,
|
33
42
|
) -> dict[str, Any]:
|
34
43
|
"""A call to make sure the model is prepared for the target type."""
|
35
44
|
raise NotImplementedError("pre_fit not implemented in parent class.")
|
@@ -9,11 +9,13 @@ import pandas as pd
|
|
9
9
|
|
10
10
|
from .catboost_model import CatboostModel
|
11
11
|
from .model import Model
|
12
|
+
from .tabpfn_model import TabPFNModel
|
12
13
|
|
13
14
|
_MODEL_ROUTER_FILE = "model_router.json"
|
14
15
|
_MODEL_KEY = "model"
|
15
16
|
_MODELS = {
|
16
17
|
CatboostModel.name(): CatboostModel,
|
18
|
+
TabPFNModel.name(): TabPFNModel,
|
17
19
|
}
|
18
20
|
|
19
21
|
|
@@ -39,23 +41,30 @@ class ModelRouter(Model):
|
|
39
41
|
raise ValueError("model is null")
|
40
42
|
return model.estimator
|
41
43
|
|
44
|
+
@property
|
45
|
+
def supports_importances(self) -> bool:
|
46
|
+
model = self._model
|
47
|
+
if model is None:
|
48
|
+
raise ValueError("model is null")
|
49
|
+
return model.supports_importances
|
50
|
+
|
42
51
|
def pre_fit(
|
43
52
|
self,
|
44
53
|
df: pd.DataFrame,
|
45
54
|
y: pd.Series | pd.DataFrame | None,
|
46
55
|
eval_x: pd.DataFrame | None = None,
|
47
56
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
57
|
+
w: pd.Series | None = None,
|
48
58
|
) -> dict[str, Any]:
|
49
59
|
model = self._model
|
50
60
|
if model is None:
|
51
61
|
raise ValueError("model is null")
|
52
|
-
return model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y)
|
62
|
+
return model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y, w=w)
|
53
63
|
|
54
64
|
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
self._model.set_options(trial)
|
65
|
+
model = _MODELS[trial.suggest_categorical("model", list(_MODELS.keys()))]()
|
66
|
+
model.set_options(trial)
|
67
|
+
self._model = model
|
59
68
|
|
60
69
|
def load(self, folder: str) -> None:
|
61
70
|
with open(os.path.join(folder, _MODEL_ROUTER_FILE), encoding="utf8") as handle:
|
@@ -0,0 +1,145 @@
|
|
1
|
+
"""A model that wraps tabpfn."""
|
2
|
+
# pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments
|
3
|
+
|
4
|
+
import json
|
5
|
+
import os
|
6
|
+
import pickle
|
7
|
+
from typing import Any, Self
|
8
|
+
|
9
|
+
import optuna
|
10
|
+
import pandas as pd
|
11
|
+
import pytest_is_running
|
12
|
+
import torch
|
13
|
+
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import ( # type: ignore
|
14
|
+
AutoTabPFNClassifier, AutoTabPFNRegressor)
|
15
|
+
|
16
|
+
from ..model_type import ModelType, determine_model_type
|
17
|
+
from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
18
|
+
|
19
|
+
_MODEL_FILENAME = "model.pkl"
|
20
|
+
_MODEL_PARAMS_FILENAME = "model_params.json"
|
21
|
+
_MODEL_TYPE_KEY = "model_type"
|
22
|
+
|
23
|
+
|
24
|
+
class TabPFNModel(Model):
|
25
|
+
"""A class that uses TabPFN as a model."""
|
26
|
+
|
27
|
+
_tabpfn: AutoTabPFNClassifier | AutoTabPFNRegressor | None
|
28
|
+
_model_type: None | ModelType
|
29
|
+
|
30
|
+
@classmethod
|
31
|
+
def name(cls) -> str:
|
32
|
+
return "tabpfn"
|
33
|
+
|
34
|
+
def __init__(self) -> None:
|
35
|
+
super().__init__()
|
36
|
+
self._tabpfn = None
|
37
|
+
self._model_type = None
|
38
|
+
|
39
|
+
@property
|
40
|
+
def estimator(self) -> Any:
|
41
|
+
return self._provide_tabpfn()
|
42
|
+
|
43
|
+
@property
|
44
|
+
def supports_importances(self) -> bool:
|
45
|
+
return False
|
46
|
+
|
47
|
+
def pre_fit(
|
48
|
+
self,
|
49
|
+
df: pd.DataFrame,
|
50
|
+
y: pd.Series | pd.DataFrame | None,
|
51
|
+
eval_x: pd.DataFrame | None = None,
|
52
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
53
|
+
w: pd.Series | None = None,
|
54
|
+
):
|
55
|
+
if y is None:
|
56
|
+
raise ValueError("y is null.")
|
57
|
+
self._model_type = determine_model_type(y)
|
58
|
+
return {}
|
59
|
+
|
60
|
+
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
61
|
+
pass
|
62
|
+
|
63
|
+
def load(self, folder: str) -> None:
|
64
|
+
with open(os.path.join(folder, _MODEL_FILENAME), "rb") as f:
|
65
|
+
self._tabpfn = pickle.load(f)
|
66
|
+
with open(
|
67
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
|
68
|
+
) as handle:
|
69
|
+
params = json.load(handle)
|
70
|
+
self._model_type = ModelType(params[_MODEL_TYPE_KEY])
|
71
|
+
|
72
|
+
def save(self, folder: str) -> None:
|
73
|
+
with open(os.path.join(folder, _MODEL_FILENAME), "wb") as f:
|
74
|
+
pickle.dump(self._tabpfn, f)
|
75
|
+
with open(
|
76
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
|
77
|
+
) as handle:
|
78
|
+
json.dump(
|
79
|
+
{
|
80
|
+
_MODEL_TYPE_KEY: str(self._model_type),
|
81
|
+
},
|
82
|
+
handle,
|
83
|
+
)
|
84
|
+
|
85
|
+
def fit(
|
86
|
+
self,
|
87
|
+
df: pd.DataFrame,
|
88
|
+
y: pd.Series | pd.DataFrame | None = None,
|
89
|
+
w: pd.Series | None = None,
|
90
|
+
eval_x: pd.DataFrame | None = None,
|
91
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
92
|
+
) -> Self:
|
93
|
+
if y is None:
|
94
|
+
raise ValueError("y is null.")
|
95
|
+
self._model_type = determine_model_type(y)
|
96
|
+
tabpfn = self._provide_tabpfn()
|
97
|
+
tabpfn.fit(df, y)
|
98
|
+
return self
|
99
|
+
|
100
|
+
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
101
|
+
tabpfn = self._provide_tabpfn()
|
102
|
+
if tabpfn is None:
|
103
|
+
raise ValueError("tabpfn is null")
|
104
|
+
pred = tabpfn.predict(df)
|
105
|
+
new_df = pd.DataFrame(
|
106
|
+
index=df.index,
|
107
|
+
data={
|
108
|
+
PREDICTION_COLUMN: pred.flatten(),
|
109
|
+
},
|
110
|
+
)
|
111
|
+
if isinstance(tabpfn, AutoTabPFNClassifier):
|
112
|
+
proba = tabpfn.predict_proba(df)
|
113
|
+
for i in range(proba.shape[1]):
|
114
|
+
new_df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
|
115
|
+
return new_df
|
116
|
+
|
117
|
+
def _provide_tabpfn(self) -> AutoTabPFNClassifier | AutoTabPFNRegressor:
|
118
|
+
tabpfn = self._tabpfn
|
119
|
+
if tabpfn is None:
|
120
|
+
max_time = 1 if pytest_is_running.is_running() else 120
|
121
|
+
match self._model_type:
|
122
|
+
case ModelType.BINARY:
|
123
|
+
tabpfn = AutoTabPFNClassifier(
|
124
|
+
max_time=max_time,
|
125
|
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
126
|
+
)
|
127
|
+
case ModelType.REGRESSION:
|
128
|
+
tabpfn = AutoTabPFNRegressor(
|
129
|
+
max_time=max_time,
|
130
|
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
131
|
+
)
|
132
|
+
case ModelType.BINNED_BINARY:
|
133
|
+
tabpfn = AutoTabPFNClassifier(
|
134
|
+
max_time=max_time,
|
135
|
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
136
|
+
)
|
137
|
+
case ModelType.MULTI_CLASSIFICATION:
|
138
|
+
tabpfn = AutoTabPFNClassifier(
|
139
|
+
max_time=max_time,
|
140
|
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
141
|
+
)
|
142
|
+
self._tabpfn = tabpfn
|
143
|
+
if tabpfn is None:
|
144
|
+
raise ValueError("tabpfn is null")
|
145
|
+
return tabpfn
|
@@ -12,6 +12,7 @@ from .constant_reducer import ConstantReducer
|
|
12
12
|
from .correlation_reducer import CorrelationReducer
|
13
13
|
from .duplicate_reducer import DuplicateReducer
|
14
14
|
from .nonnumeric_reducer import NonNumericReducer
|
15
|
+
from .pca_reducer import PCAReducer
|
15
16
|
from .reducer import Reducer
|
16
17
|
from .unseen_reducer import UnseenReducer
|
17
18
|
|
@@ -24,14 +25,16 @@ class CombinedReducer(Reducer):
|
|
24
25
|
|
25
26
|
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
26
27
|
|
27
|
-
def __init__(self):
|
28
|
+
def __init__(self, max_features: int | None):
|
28
29
|
super().__init__()
|
30
|
+
self._max_features = max_features
|
29
31
|
self._reducers = [
|
30
32
|
UnseenReducer(),
|
31
33
|
NonNumericReducer(),
|
32
34
|
ConstantReducer(),
|
33
35
|
DuplicateReducer(),
|
34
36
|
CorrelationReducer(),
|
37
|
+
PCAReducer(max_features),
|
35
38
|
]
|
36
39
|
|
37
40
|
@classmethod
|
@@ -59,6 +62,8 @@ class CombinedReducer(Reducer):
|
|
59
62
|
self._reducers.append(NonNumericReducer())
|
60
63
|
elif reducer_name == UnseenReducer.name():
|
61
64
|
self._reducers.append(UnseenReducer())
|
65
|
+
elif reducer_name == PCAReducer.name():
|
66
|
+
self._reducers.append(PCAReducer(self._max_features))
|
62
67
|
for reducer in self._reducers:
|
63
68
|
reducer.load(folder)
|
64
69
|
|
@@ -0,0 +1,77 @@
|
|
1
|
+
"""A reducer that removes low variance columns."""
|
2
|
+
|
3
|
+
import os
|
4
|
+
from typing import Self
|
5
|
+
|
6
|
+
import joblib # type: ignore
|
7
|
+
import optuna
|
8
|
+
import pandas as pd
|
9
|
+
from sklearn.decomposition import PCA # type: ignore
|
10
|
+
from sklearn.preprocessing import StandardScaler # type: ignore
|
11
|
+
|
12
|
+
from .reducer import Reducer
|
13
|
+
|
14
|
+
_PCA_FILE = "pca.joblib"
|
15
|
+
_PCA_SCALER_FILE = "pca_scaler.joblib"
|
16
|
+
|
17
|
+
|
18
|
+
class PCAReducer(Reducer):
|
19
|
+
"""A class that removes low variance columns from a dataframe."""
|
20
|
+
|
21
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
22
|
+
|
23
|
+
def __init__(self, max_features: int | None):
|
24
|
+
super().__init__()
|
25
|
+
self._max_features = max_features
|
26
|
+
if max_features is not None:
|
27
|
+
self._scaler = StandardScaler()
|
28
|
+
self._pca = PCA(n_components=max_features)
|
29
|
+
else:
|
30
|
+
self._scaler = None
|
31
|
+
self._pca = None
|
32
|
+
|
33
|
+
@classmethod
|
34
|
+
def name(cls) -> str:
|
35
|
+
return "pca"
|
36
|
+
|
37
|
+
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
38
|
+
pass
|
39
|
+
|
40
|
+
def load(self, folder: str) -> None:
|
41
|
+
pca_scaler_file = os.path.join(folder, _PCA_SCALER_FILE)
|
42
|
+
pca_file = os.path.join(folder, _PCA_FILE)
|
43
|
+
if os.path.exists(pca_scaler_file):
|
44
|
+
self._scaler = joblib.load(pca_scaler_file)
|
45
|
+
if os.path.exists(pca_file):
|
46
|
+
self._pca = joblib.load(pca_file)
|
47
|
+
|
48
|
+
def save(self, folder: str) -> None:
|
49
|
+
if self._scaler is not None:
|
50
|
+
joblib.dump(self._scaler, os.path.join(folder, _PCA_SCALER_FILE))
|
51
|
+
if self._pca is not None:
|
52
|
+
joblib.dump(self._pca, os.path.join(folder, _PCA_FILE))
|
53
|
+
|
54
|
+
def fit(
|
55
|
+
self,
|
56
|
+
df: pd.DataFrame,
|
57
|
+
y: pd.Series | pd.DataFrame | None = None,
|
58
|
+
w: pd.Series | None = None,
|
59
|
+
eval_x: pd.DataFrame | None = None,
|
60
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
61
|
+
) -> Self:
|
62
|
+
pca = self._pca
|
63
|
+
scaler = self._scaler
|
64
|
+
if pca is None or scaler is None:
|
65
|
+
return self
|
66
|
+
if len(df.columns.values) < pca.n_components: # type: ignore
|
67
|
+
return self
|
68
|
+
x_scaled = scaler.fit_transform(df)
|
69
|
+
pca.fit(x_scaled)
|
70
|
+
return self
|
71
|
+
|
72
|
+
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
73
|
+
if self._pca is None:
|
74
|
+
return df
|
75
|
+
if len(df.columns.values) < self._pca.n_components: # type: ignore
|
76
|
+
return df
|
77
|
+
return self._pca.transform(df)
|
@@ -49,8 +49,10 @@ class Selector(Params, Fit):
|
|
49
49
|
eval_x: pd.DataFrame | None = None,
|
50
50
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
51
51
|
) -> Self:
|
52
|
+
if not self._model.supports_importances:
|
53
|
+
return self
|
52
54
|
sklearn.set_config(enable_metadata_routing=False)
|
53
|
-
model_kwargs = self._model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y)
|
55
|
+
model_kwargs = self._model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y, w=w)
|
54
56
|
if not isinstance(y, pd.Series):
|
55
57
|
raise ValueError("y is not a series.")
|
56
58
|
if len(df.columns) <= 1:
|
@@ -65,7 +67,7 @@ class Selector(Params, Fit):
|
|
65
67
|
),
|
66
68
|
)
|
67
69
|
try:
|
68
|
-
self._selector.fit(df, y=y,
|
70
|
+
self._selector.fit(df, y=y, **model_kwargs)
|
69
71
|
except ValueError as exc:
|
70
72
|
# Catch issues with 1 feature as a reduction target.
|
71
73
|
logging.warning(str(exc))
|
@@ -76,7 +78,8 @@ class Selector(Params, Fit):
|
|
76
78
|
return df
|
77
79
|
selector = self._selector
|
78
80
|
if selector is None:
|
79
|
-
|
81
|
+
logging.warning("selector is null")
|
82
|
+
return df
|
80
83
|
try:
|
81
84
|
return df[selector.get_feature_names_out()]
|
82
85
|
except AttributeError as exc:
|
@@ -36,6 +36,7 @@ _TEST_SIZE_KEY = "test_size"
|
|
36
36
|
_VALIDATION_SIZE_KEY = "validation_size"
|
37
37
|
_IDX_USR_ATTR_KEY = "idx"
|
38
38
|
_DT_COLUMN_KEY = "dt_column"
|
39
|
+
_MAX_FEATURES_KEY = "max_features"
|
39
40
|
|
40
41
|
|
41
42
|
class Trainer(Fit):
|
@@ -53,6 +54,7 @@ class Trainer(Fit):
|
|
53
54
|
dt_column: str | None = None,
|
54
55
|
max_train_timeout: datetime.timedelta | None = None,
|
55
56
|
cutoff_dt: datetime.datetime | None = None,
|
57
|
+
max_features: int | None = None,
|
56
58
|
):
|
57
59
|
tqdm.tqdm.pandas()
|
58
60
|
|
@@ -103,6 +105,7 @@ class Trainer(Fit):
|
|
103
105
|
)
|
104
106
|
if dt_column is None:
|
105
107
|
dt_column = params[_DT_COLUMN_KEY]
|
108
|
+
max_features = params.get(_MAX_FEATURES_KEY)
|
106
109
|
else:
|
107
110
|
with open(params_file, "w", encoding="utf8") as handle:
|
108
111
|
validation_size_value = None
|
@@ -133,6 +136,7 @@ class Trainer(Fit):
|
|
133
136
|
_TEST_SIZE_KEY: test_size_value,
|
134
137
|
_VALIDATION_SIZE_KEY: validation_size_value,
|
135
138
|
_DT_COLUMN_KEY: dt_column,
|
139
|
+
_MAX_FEATURES_KEY: max_features,
|
136
140
|
},
|
137
141
|
handle,
|
138
142
|
)
|
@@ -143,6 +147,7 @@ class Trainer(Fit):
|
|
143
147
|
self._dt_column = dt_column
|
144
148
|
self._max_train_timeout = max_train_timeout
|
145
149
|
self._cutoff_dt = cutoff_dt
|
150
|
+
self._max_features = max_features
|
146
151
|
|
147
152
|
def _provide_study(self, column: str) -> optuna.Study:
|
148
153
|
storage_name = f"sqlite:///{self._folder}/{column}/{_STUDYDB_FILENAME}"
|
@@ -216,7 +221,7 @@ class Trainer(Fit):
|
|
216
221
|
return -1.0
|
217
222
|
|
218
223
|
# Perform common reductions
|
219
|
-
reducer = CombinedReducer()
|
224
|
+
reducer = CombinedReducer(self._max_features)
|
220
225
|
reducer.set_options(trial)
|
221
226
|
x_train = reducer.fit_transform(x_train)
|
222
227
|
x_test = reducer.transform(x_test)
|
@@ -266,7 +271,6 @@ class Trainer(Fit):
|
|
266
271
|
return float(r2_score(y_test, y_pred[[PREDICTION_COLUMN]]))
|
267
272
|
return float(f1_score(y_test, y_pred[[PREDICTION_COLUMN]]))
|
268
273
|
except WavetrainException as exc:
|
269
|
-
logging.warning("WE DID NOT END UP TRAINING ANYTHING!!!!!")
|
270
274
|
logging.warning(str(exc))
|
271
275
|
return -1.0
|
272
276
|
|
@@ -343,6 +347,8 @@ class Trainer(Fit):
|
|
343
347
|
|
344
348
|
test_df = df.iloc[: train_len + count + test_len]
|
345
349
|
test_series = y_series.iloc[: train_len + count + test_len]
|
350
|
+
if len(test_df) <= 2:
|
351
|
+
continue
|
346
352
|
|
347
353
|
if test_idx < start_validation_index:
|
348
354
|
|
@@ -424,7 +430,7 @@ class Trainer(Fit):
|
|
424
430
|
date_str = dates[-1].isoformat()
|
425
431
|
folder = os.path.join(column_path, date_str)
|
426
432
|
|
427
|
-
reducer = CombinedReducer()
|
433
|
+
reducer = CombinedReducer(self._max_features)
|
428
434
|
reducer.load(folder)
|
429
435
|
|
430
436
|
model = ModelRouter()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.24
|
3
|
+
Version: 0.0.26
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -23,6 +23,11 @@ Requires-Dist: venn-abers>=1.4.6
|
|
23
23
|
Requires-Dist: mapie>=0.9.2
|
24
24
|
Requires-Dist: pytz>=2025.1
|
25
25
|
Requires-Dist: torch>=2.6.0
|
26
|
+
Requires-Dist: tabpfn>=2.0.6
|
27
|
+
Requires-Dist: tabpfn-extensions>=0.0.4
|
28
|
+
Requires-Dist: shap>=0.47.2
|
29
|
+
Requires-Dist: hyperopt>=0.2.7
|
30
|
+
Requires-Dist: pytest-is-running>=1.5.1
|
26
31
|
|
27
32
|
# wavetrainer
|
28
33
|
|
@@ -52,6 +57,11 @@ Python 3.11.6:
|
|
52
57
|
- [mapie](https://mapie.readthedocs.io/en/stable/)
|
53
58
|
- [pytz](https://pythonhosted.org/pytz/)
|
54
59
|
- [torch](https://pytorch.org/)
|
60
|
+
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
61
|
+
- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
|
62
|
+
- [shap](https://shap.readthedocs.io/en/latest/)
|
63
|
+
- [hyperopt](https://hyperopt.github.io/hyperopt/)
|
64
|
+
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
55
65
|
|
56
66
|
## Raison D'être :thought_balloon:
|
57
67
|
|
@@ -32,6 +32,7 @@ wavetrainer/model/catboost_model.py
|
|
32
32
|
wavetrainer/model/catboost_regressor_wrap.py
|
33
33
|
wavetrainer/model/model.py
|
34
34
|
wavetrainer/model/model_router.py
|
35
|
+
wavetrainer/model/tabpfn_model.py
|
35
36
|
wavetrainer/reducer/__init__.py
|
36
37
|
wavetrainer/reducer/base_selector_reducer.py
|
37
38
|
wavetrainer/reducer/combined_reducer.py
|
@@ -39,6 +40,7 @@ wavetrainer/reducer/constant_reducer.py
|
|
39
40
|
wavetrainer/reducer/correlation_reducer.py
|
40
41
|
wavetrainer/reducer/duplicate_reducer.py
|
41
42
|
wavetrainer/reducer/nonnumeric_reducer.py
|
43
|
+
wavetrainer/reducer/pca_reducer.py
|
42
44
|
wavetrainer/reducer/reducer.py
|
43
45
|
wavetrainer/reducer/unseen_reducer.py
|
44
46
|
wavetrainer/selector/__init__.py
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|