wavetrainer 0.0.24__tar.gz → 0.0.25__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.24/wavetrainer.egg-info → wavetrainer-0.0.25}/PKG-INFO +11 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/README.md +5 -0
- wavetrainer-0.0.24/wavetrainer.egg-info/requires.txt → wavetrainer-0.0.25/requirements.txt +5 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/setup.py +1 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/tests/trainer_test.py +1 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/model/catboost_model.py +6 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/model/model.py +9 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/model/model_router.py +14 -5
- wavetrainer-0.0.25/wavetrainer/model/tabpfn_model.py +145 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/reducer/combined_reducer.py +4 -0
- wavetrainer-0.0.25/wavetrainer/reducer/pca_reducer.py +60 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/selector/selector.py +6 -3
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/trainer.py +2 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.25/wavetrainer.egg-info}/PKG-INFO +11 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer.egg-info/SOURCES.txt +2 -0
- wavetrainer-0.0.24/requirements.txt → wavetrainer-0.0.25/wavetrainer.egg-info/requires.txt +6 -1
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/LICENSE +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/MANIFEST.in +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/setup.cfg +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/tests/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/model/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/model/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/model/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/reducer/correlation_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.24 → wavetrainer-0.0.25}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.24
|
3
|
+
Version: 0.0.25
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -23,6 +23,11 @@ Requires-Dist: venn-abers>=1.4.6
|
|
23
23
|
Requires-Dist: mapie>=0.9.2
|
24
24
|
Requires-Dist: pytz>=2025.1
|
25
25
|
Requires-Dist: torch>=2.6.0
|
26
|
+
Requires-Dist: tabpfn>=2.0.6
|
27
|
+
Requires-Dist: tabpfn-extensions>=0.0.4
|
28
|
+
Requires-Dist: shap>=0.47.2
|
29
|
+
Requires-Dist: hyperopt>=0.2.7
|
30
|
+
Requires-Dist: pytest-is-running>=1.5.1
|
26
31
|
|
27
32
|
# wavetrainer
|
28
33
|
|
@@ -52,6 +57,11 @@ Python 3.11.6:
|
|
52
57
|
- [mapie](https://mapie.readthedocs.io/en/stable/)
|
53
58
|
- [pytz](https://pythonhosted.org/pytz/)
|
54
59
|
- [torch](https://pytorch.org/)
|
60
|
+
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
61
|
+
- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
|
62
|
+
- [shap](https://shap.readthedocs.io/en/latest/)
|
63
|
+
- [hyperopt](https://hyperopt.github.io/hyperopt/)
|
64
|
+
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
55
65
|
|
56
66
|
## Raison D'être :thought_balloon:
|
57
67
|
|
@@ -26,6 +26,11 @@ Python 3.11.6:
|
|
26
26
|
- [mapie](https://mapie.readthedocs.io/en/stable/)
|
27
27
|
- [pytz](https://pythonhosted.org/pytz/)
|
28
28
|
- [torch](https://pytorch.org/)
|
29
|
+
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
30
|
+
- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
|
31
|
+
- [shap](https://shap.readthedocs.io/en/latest/)
|
32
|
+
- [hyperopt](https://hyperopt.github.io/hyperopt/)
|
33
|
+
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
29
34
|
|
30
35
|
## Raison D'être :thought_balloon:
|
31
36
|
|
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
|
|
23
23
|
|
24
24
|
setup(
|
25
25
|
name='wavetrainer',
|
26
|
-
version='0.0.24',
|
26
|
+
version='0.0.25',
|
27
27
|
description='A library for automatically finding the optimal model within feature and hyperparameter space.',
|
28
28
|
long_description=long_description,
|
29
29
|
long_description_content_type='text/markdown',
|
@@ -14,7 +14,7 @@ class TestTrainer(unittest.TestCase):
|
|
14
14
|
def test_trainer(self):
|
15
15
|
with tempfile.TemporaryDirectory() as tmpdir:
|
16
16
|
trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=1)
|
17
|
-
x_data = [i for i in range(
|
17
|
+
x_data = [i for i in range(101)]
|
18
18
|
x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
|
19
19
|
df = pd.DataFrame(
|
20
20
|
data={
|
@@ -61,12 +61,17 @@ class CatboostModel(Model):
|
|
61
61
|
def estimator(self) -> Any:
|
62
62
|
return self._provide_catboost()
|
63
63
|
|
64
|
+
@property
|
65
|
+
def supports_importances(self) -> bool:
|
66
|
+
return True
|
67
|
+
|
64
68
|
def pre_fit(
|
65
69
|
self,
|
66
70
|
df: pd.DataFrame,
|
67
71
|
y: pd.Series | pd.DataFrame | None,
|
68
72
|
eval_x: pd.DataFrame | None = None,
|
69
73
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
74
|
+
w: pd.Series | None = None,
|
70
75
|
):
|
71
76
|
if y is None:
|
72
77
|
raise ValueError("y is null.")
|
@@ -75,6 +80,7 @@ class CatboostModel(Model):
|
|
75
80
|
EVAL_SET_ARG_KEY: (eval_x, eval_y),
|
76
81
|
CAT_FEATURES_ARG_KEY: df.select_dtypes(include="category").columns.tolist(),
|
77
82
|
ORIGINAL_X_ARG_KEY: df,
|
83
|
+
"sample_weight": w,
|
78
84
|
}
|
79
85
|
|
80
86
|
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""The prototype model class."""
|
2
2
|
|
3
|
+
# pylint: disable=too-many-arguments,too-many-positional-arguments
|
3
4
|
from typing import Any
|
4
5
|
|
5
6
|
import pandas as pd
|
@@ -24,12 +25,20 @@ class Model(Params, Fit):
|
|
24
25
|
"""The estimator backing the model."""
|
25
26
|
raise NotImplementedError("estimator not implemented in parent class.")
|
26
27
|
|
28
|
+
@property
|
29
|
+
def supports_importances(self) -> bool:
|
30
|
+
"""Whether this model supports feature importances."""
|
31
|
+
raise NotImplementedError(
|
32
|
+
"supports_importances not implemented in parent class."
|
33
|
+
)
|
34
|
+
|
27
35
|
def pre_fit(
|
28
36
|
self,
|
29
37
|
df: pd.DataFrame,
|
30
38
|
y: pd.Series | pd.DataFrame | None,
|
31
39
|
eval_x: pd.DataFrame | None = None,
|
32
40
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
41
|
+
w: pd.Series | None = None,
|
33
42
|
) -> dict[str, Any]:
|
34
43
|
"""A call to make sure the model is prepared for the target type."""
|
35
44
|
raise NotImplementedError("pre_fit not implemented in parent class.")
|
@@ -9,11 +9,13 @@ import pandas as pd
|
|
9
9
|
|
10
10
|
from .catboost_model import CatboostModel
|
11
11
|
from .model import Model
|
12
|
+
from .tabpfn_model import TabPFNModel
|
12
13
|
|
13
14
|
_MODEL_ROUTER_FILE = "model_router.json"
|
14
15
|
_MODEL_KEY = "model"
|
15
16
|
_MODELS = {
|
16
17
|
CatboostModel.name(): CatboostModel,
|
18
|
+
TabPFNModel.name(): TabPFNModel,
|
17
19
|
}
|
18
20
|
|
19
21
|
|
@@ -39,23 +41,30 @@ class ModelRouter(Model):
|
|
39
41
|
raise ValueError("model is null")
|
40
42
|
return model.estimator
|
41
43
|
|
44
|
+
@property
|
45
|
+
def supports_importances(self) -> bool:
|
46
|
+
model = self._model
|
47
|
+
if model is None:
|
48
|
+
raise ValueError("model is null")
|
49
|
+
return model.supports_importances
|
50
|
+
|
42
51
|
def pre_fit(
|
43
52
|
self,
|
44
53
|
df: pd.DataFrame,
|
45
54
|
y: pd.Series | pd.DataFrame | None,
|
46
55
|
eval_x: pd.DataFrame | None = None,
|
47
56
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
57
|
+
w: pd.Series | None = None,
|
48
58
|
) -> dict[str, Any]:
|
49
59
|
model = self._model
|
50
60
|
if model is None:
|
51
61
|
raise ValueError("model is null")
|
52
|
-
return model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y)
|
62
|
+
return model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y, w=w)
|
53
63
|
|
54
64
|
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
self._model.set_options(trial)
|
65
|
+
model = _MODELS[trial.suggest_categorical("model", list(_MODELS.keys()))]()
|
66
|
+
model.set_options(trial)
|
67
|
+
self._model = model
|
59
68
|
|
60
69
|
def load(self, folder: str) -> None:
|
61
70
|
with open(os.path.join(folder, _MODEL_ROUTER_FILE), encoding="utf8") as handle:
|
@@ -0,0 +1,145 @@
|
|
1
|
+
"""A model that wraps tabpfn."""
|
2
|
+
# pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments
|
3
|
+
|
4
|
+
import json
|
5
|
+
import os
|
6
|
+
import pickle
|
7
|
+
from typing import Any, Self
|
8
|
+
|
9
|
+
import optuna
|
10
|
+
import pandas as pd
|
11
|
+
import pytest_is_running
|
12
|
+
import torch
|
13
|
+
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import ( # type: ignore
|
14
|
+
AutoTabPFNClassifier, AutoTabPFNRegressor)
|
15
|
+
|
16
|
+
from ..model_type import ModelType, determine_model_type
|
17
|
+
from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
18
|
+
|
19
|
+
_MODEL_FILENAME = "model.pkl"
|
20
|
+
_MODEL_PARAMS_FILENAME = "model_params.json"
|
21
|
+
_MODEL_TYPE_KEY = "model_type"
|
22
|
+
|
23
|
+
|
24
|
+
class TabPFNModel(Model):
|
25
|
+
"""A class that uses TabPFN as a model."""
|
26
|
+
|
27
|
+
_tabpfn: AutoTabPFNClassifier | AutoTabPFNRegressor | None
|
28
|
+
_model_type: None | ModelType
|
29
|
+
|
30
|
+
@classmethod
|
31
|
+
def name(cls) -> str:
|
32
|
+
return "tabpfn"
|
33
|
+
|
34
|
+
def __init__(self) -> None:
|
35
|
+
super().__init__()
|
36
|
+
self._tabpfn = None
|
37
|
+
self._model_type = None
|
38
|
+
|
39
|
+
@property
|
40
|
+
def estimator(self) -> Any:
|
41
|
+
return self._provide_tabpfn()
|
42
|
+
|
43
|
+
@property
|
44
|
+
def supports_importances(self) -> bool:
|
45
|
+
return False
|
46
|
+
|
47
|
+
def pre_fit(
|
48
|
+
self,
|
49
|
+
df: pd.DataFrame,
|
50
|
+
y: pd.Series | pd.DataFrame | None,
|
51
|
+
eval_x: pd.DataFrame | None = None,
|
52
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
53
|
+
w: pd.Series | None = None,
|
54
|
+
):
|
55
|
+
if y is None:
|
56
|
+
raise ValueError("y is null.")
|
57
|
+
self._model_type = determine_model_type(y)
|
58
|
+
return {}
|
59
|
+
|
60
|
+
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
61
|
+
pass
|
62
|
+
|
63
|
+
def load(self, folder: str) -> None:
|
64
|
+
with open(os.path.join(folder, _MODEL_FILENAME), "rb") as f:
|
65
|
+
self._tabpfn = pickle.load(f)
|
66
|
+
with open(
|
67
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
|
68
|
+
) as handle:
|
69
|
+
params = json.load(handle)
|
70
|
+
self._model_type = ModelType(params[_MODEL_TYPE_KEY])
|
71
|
+
|
72
|
+
def save(self, folder: str) -> None:
|
73
|
+
with open(os.path.join(folder, _MODEL_FILENAME), "wb") as f:
|
74
|
+
pickle.dump(self._tabpfn, f)
|
75
|
+
with open(
|
76
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
|
77
|
+
) as handle:
|
78
|
+
json.dump(
|
79
|
+
{
|
80
|
+
_MODEL_TYPE_KEY: str(self._model_type),
|
81
|
+
},
|
82
|
+
handle,
|
83
|
+
)
|
84
|
+
|
85
|
+
def fit(
|
86
|
+
self,
|
87
|
+
df: pd.DataFrame,
|
88
|
+
y: pd.Series | pd.DataFrame | None = None,
|
89
|
+
w: pd.Series | None = None,
|
90
|
+
eval_x: pd.DataFrame | None = None,
|
91
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
92
|
+
) -> Self:
|
93
|
+
if y is None:
|
94
|
+
raise ValueError("y is null.")
|
95
|
+
self._model_type = determine_model_type(y)
|
96
|
+
tabpfn = self._provide_tabpfn()
|
97
|
+
tabpfn.fit(df, y)
|
98
|
+
return self
|
99
|
+
|
100
|
+
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
101
|
+
tabpfn = self._provide_tabpfn()
|
102
|
+
if tabpfn is None:
|
103
|
+
raise ValueError("tabpfn is null")
|
104
|
+
pred = tabpfn.predict(df)
|
105
|
+
new_df = pd.DataFrame(
|
106
|
+
index=df.index,
|
107
|
+
data={
|
108
|
+
PREDICTION_COLUMN: pred.flatten(),
|
109
|
+
},
|
110
|
+
)
|
111
|
+
if isinstance(tabpfn, AutoTabPFNClassifier):
|
112
|
+
proba = tabpfn.predict_proba(df)
|
113
|
+
for i in range(proba.shape[1]):
|
114
|
+
new_df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
|
115
|
+
return new_df
|
116
|
+
|
117
|
+
def _provide_tabpfn(self) -> AutoTabPFNClassifier | AutoTabPFNRegressor:
|
118
|
+
tabpfn = self._tabpfn
|
119
|
+
if tabpfn is None:
|
120
|
+
max_time = 1 if pytest_is_running.is_running() else 120
|
121
|
+
match self._model_type:
|
122
|
+
case ModelType.BINARY:
|
123
|
+
tabpfn = AutoTabPFNClassifier(
|
124
|
+
max_time=max_time,
|
125
|
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
126
|
+
)
|
127
|
+
case ModelType.REGRESSION:
|
128
|
+
tabpfn = AutoTabPFNRegressor(
|
129
|
+
max_time=max_time,
|
130
|
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
131
|
+
)
|
132
|
+
case ModelType.BINNED_BINARY:
|
133
|
+
tabpfn = AutoTabPFNClassifier(
|
134
|
+
max_time=max_time,
|
135
|
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
136
|
+
)
|
137
|
+
case ModelType.MULTI_CLASSIFICATION:
|
138
|
+
tabpfn = AutoTabPFNClassifier(
|
139
|
+
max_time=max_time,
|
140
|
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
141
|
+
)
|
142
|
+
self._tabpfn = tabpfn
|
143
|
+
if tabpfn is None:
|
144
|
+
raise ValueError("tabpfn is null")
|
145
|
+
return tabpfn
|
@@ -12,6 +12,7 @@ from .constant_reducer import ConstantReducer
|
|
12
12
|
from .correlation_reducer import CorrelationReducer
|
13
13
|
from .duplicate_reducer import DuplicateReducer
|
14
14
|
from .nonnumeric_reducer import NonNumericReducer
|
15
|
+
from .pca_reducer import PCAReducer
|
15
16
|
from .reducer import Reducer
|
16
17
|
from .unseen_reducer import UnseenReducer
|
17
18
|
|
@@ -32,6 +33,7 @@ class CombinedReducer(Reducer):
|
|
32
33
|
ConstantReducer(),
|
33
34
|
DuplicateReducer(),
|
34
35
|
CorrelationReducer(),
|
36
|
+
PCAReducer(),
|
35
37
|
]
|
36
38
|
|
37
39
|
@classmethod
|
@@ -59,6 +61,8 @@ class CombinedReducer(Reducer):
|
|
59
61
|
self._reducers.append(NonNumericReducer())
|
60
62
|
elif reducer_name == UnseenReducer.name():
|
61
63
|
self._reducers.append(UnseenReducer())
|
64
|
+
elif reducer_name == PCAReducer.name():
|
65
|
+
self._reducers.append(PCAReducer())
|
62
66
|
for reducer in self._reducers:
|
63
67
|
reducer.load(folder)
|
64
68
|
|
@@ -0,0 +1,60 @@
|
|
1
|
+
"""A reducer that removes low variance columns."""
|
2
|
+
|
3
|
+
import os
|
4
|
+
from typing import Self
|
5
|
+
|
6
|
+
import joblib # type: ignore
|
7
|
+
import optuna
|
8
|
+
import pandas as pd
|
9
|
+
from sklearn.decomposition import PCA # type: ignore
|
10
|
+
from sklearn.preprocessing import StandardScaler # type: ignore
|
11
|
+
|
12
|
+
from .reducer import Reducer
|
13
|
+
|
14
|
+
_PCA_FILE = "pca.joblib"
|
15
|
+
_PCA_SCALER_FILE = "pca_scaler.joblib"
|
16
|
+
|
17
|
+
|
18
|
+
class PCAReducer(Reducer):
|
19
|
+
"""A class that removes low variance columns from a dataframe."""
|
20
|
+
|
21
|
+
# pylint: disable=too-many-positional-arguments,too-many-arguments
|
22
|
+
|
23
|
+
def __init__(self):
|
24
|
+
super().__init__()
|
25
|
+
self._scaler = StandardScaler()
|
26
|
+
self._pca = PCA(n_components=300)
|
27
|
+
|
28
|
+
@classmethod
|
29
|
+
def name(cls) -> str:
|
30
|
+
return "pca"
|
31
|
+
|
32
|
+
def set_options(self, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
33
|
+
pass
|
34
|
+
|
35
|
+
def load(self, folder: str) -> None:
|
36
|
+
self._scaler = joblib.load(os.path.join(folder, _PCA_SCALER_FILE))
|
37
|
+
self._pca = joblib.load(os.path.join(folder, _PCA_FILE))
|
38
|
+
|
39
|
+
def save(self, folder: str) -> None:
|
40
|
+
joblib.dump(self._scaler, os.path.join(folder, _PCA_SCALER_FILE))
|
41
|
+
joblib.dump(self._pca, os.path.join(folder, _PCA_FILE))
|
42
|
+
|
43
|
+
def fit(
|
44
|
+
self,
|
45
|
+
df: pd.DataFrame,
|
46
|
+
y: pd.Series | pd.DataFrame | None = None,
|
47
|
+
w: pd.Series | None = None,
|
48
|
+
eval_x: pd.DataFrame | None = None,
|
49
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
50
|
+
) -> Self:
|
51
|
+
if len(df.columns.values) < self._pca.n_components: # type: ignore
|
52
|
+
return self
|
53
|
+
x_scaled = self._scaler.fit_transform(df)
|
54
|
+
self._pca.fit(x_scaled)
|
55
|
+
return self
|
56
|
+
|
57
|
+
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
58
|
+
if len(df.columns.values) < self._pca.n_components: # type: ignore
|
59
|
+
return df
|
60
|
+
return self._pca.transform(df)
|
@@ -49,8 +49,10 @@ class Selector(Params, Fit):
|
|
49
49
|
eval_x: pd.DataFrame | None = None,
|
50
50
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
51
51
|
) -> Self:
|
52
|
+
if not self._model.supports_importances:
|
53
|
+
return self
|
52
54
|
sklearn.set_config(enable_metadata_routing=False)
|
53
|
-
model_kwargs = self._model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y)
|
55
|
+
model_kwargs = self._model.pre_fit(df, y=y, eval_x=eval_x, eval_y=eval_y, w=w)
|
54
56
|
if not isinstance(y, pd.Series):
|
55
57
|
raise ValueError("y is not a series.")
|
56
58
|
if len(df.columns) <= 1:
|
@@ -65,7 +67,7 @@ class Selector(Params, Fit):
|
|
65
67
|
),
|
66
68
|
)
|
67
69
|
try:
|
68
|
-
self._selector.fit(df, y=y,
|
70
|
+
self._selector.fit(df, y=y, **model_kwargs)
|
69
71
|
except ValueError as exc:
|
70
72
|
# Catch issues with 1 feature as a reduction target.
|
71
73
|
logging.warning(str(exc))
|
@@ -76,7 +78,8 @@ class Selector(Params, Fit):
|
|
76
78
|
return df
|
77
79
|
selector = self._selector
|
78
80
|
if selector is None:
|
79
|
-
|
81
|
+
logging.warning("selector is null")
|
82
|
+
return df
|
80
83
|
try:
|
81
84
|
return df[selector.get_feature_names_out()]
|
82
85
|
except AttributeError as exc:
|
@@ -266,7 +266,6 @@ class Trainer(Fit):
|
|
266
266
|
return float(r2_score(y_test, y_pred[[PREDICTION_COLUMN]]))
|
267
267
|
return float(f1_score(y_test, y_pred[[PREDICTION_COLUMN]]))
|
268
268
|
except WavetrainException as exc:
|
269
|
-
logging.warning("WE DID NOT END UP TRAINING ANYTHING!!!!!")
|
270
269
|
logging.warning(str(exc))
|
271
270
|
return -1.0
|
272
271
|
|
@@ -343,6 +342,8 @@ class Trainer(Fit):
|
|
343
342
|
|
344
343
|
test_df = df.iloc[: train_len + count + test_len]
|
345
344
|
test_series = y_series.iloc[: train_len + count + test_len]
|
345
|
+
if len(test_df) <= 2:
|
346
|
+
continue
|
346
347
|
|
347
348
|
if test_idx < start_validation_index:
|
348
349
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.24
|
3
|
+
Version: 0.0.25
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -23,6 +23,11 @@ Requires-Dist: venn-abers>=1.4.6
|
|
23
23
|
Requires-Dist: mapie>=0.9.2
|
24
24
|
Requires-Dist: pytz>=2025.1
|
25
25
|
Requires-Dist: torch>=2.6.0
|
26
|
+
Requires-Dist: tabpfn>=2.0.6
|
27
|
+
Requires-Dist: tabpfn-extensions>=0.0.4
|
28
|
+
Requires-Dist: shap>=0.47.2
|
29
|
+
Requires-Dist: hyperopt>=0.2.7
|
30
|
+
Requires-Dist: pytest-is-running>=1.5.1
|
26
31
|
|
27
32
|
# wavetrainer
|
28
33
|
|
@@ -52,6 +57,11 @@ Python 3.11.6:
|
|
52
57
|
- [mapie](https://mapie.readthedocs.io/en/stable/)
|
53
58
|
- [pytz](https://pythonhosted.org/pytz/)
|
54
59
|
- [torch](https://pytorch.org/)
|
60
|
+
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
61
|
+
- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
|
62
|
+
- [shap](https://shap.readthedocs.io/en/latest/)
|
63
|
+
- [hyperopt](https://hyperopt.github.io/hyperopt/)
|
64
|
+
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
55
65
|
|
56
66
|
## Raison D'être :thought_balloon:
|
57
67
|
|
@@ -32,6 +32,7 @@ wavetrainer/model/catboost_model.py
|
|
32
32
|
wavetrainer/model/catboost_regressor_wrap.py
|
33
33
|
wavetrainer/model/model.py
|
34
34
|
wavetrainer/model/model_router.py
|
35
|
+
wavetrainer/model/tabpfn_model.py
|
35
36
|
wavetrainer/reducer/__init__.py
|
36
37
|
wavetrainer/reducer/base_selector_reducer.py
|
37
38
|
wavetrainer/reducer/combined_reducer.py
|
@@ -39,6 +40,7 @@ wavetrainer/reducer/constant_reducer.py
|
|
39
40
|
wavetrainer/reducer/correlation_reducer.py
|
40
41
|
wavetrainer/reducer/duplicate_reducer.py
|
41
42
|
wavetrainer/reducer/nonnumeric_reducer.py
|
43
|
+
wavetrainer/reducer/pca_reducer.py
|
42
44
|
wavetrainer/reducer/reducer.py
|
43
45
|
wavetrainer/reducer/unseen_reducer.py
|
44
46
|
wavetrainer/selector/__init__.py
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|