wavetrainer 0.0.38__tar.gz → 0.0.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.38/wavetrainer.egg-info → wavetrainer-0.0.40}/PKG-INFO +3 -7
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/README.md +1 -3
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/requirements.txt +2 -4
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/setup.py +1 -1
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/tests/model/catboost_kwargs_test.py +1 -1
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/tests/trainer_test.py +1 -1
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/calibrator_router.py +3 -1
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/vennabers_calibrator.py +6 -1
- wavetrainer-0.0.40/wavetrainer/model/catboost/__init__.py +1 -0
- {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_model.py +5 -4
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/model/model_router.py +4 -2
- wavetrainer-0.0.40/wavetrainer/model/tabpfn/__init__.py +1 -0
- {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/tabpfn}/tabpfn_model.py +3 -3
- wavetrainer-0.0.40/wavetrainer/model/xgboost/__init__.py +1 -0
- wavetrainer-0.0.40/wavetrainer/model/xgboost/early_stopper.py +16 -0
- wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_logger.py +23 -0
- wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_model.py +277 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/combined_reducer.py +6 -1
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/nonnumeric_reducer.py +2 -1
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +6 -3
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/trainer.py +17 -3
- {wavetrainer-0.0.38 → wavetrainer-0.0.40/wavetrainer.egg-info}/PKG-INFO +3 -7
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/SOURCES.txt +11 -5
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/requires.txt +1 -3
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/LICENSE +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/MANIFEST.in +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/setup.cfg +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/tests/__init__.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/model/model.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/correlation_reducer.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/selector/selector.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/top_level.txt +0 -0
{wavetrainer-0.0.38/wavetrainer.egg-info → wavetrainer-0.0.40}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.0.38
+Version: 0.0.40
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
@@ -24,10 +24,8 @@ Requires-Dist: mapie>=0.9.2
 Requires-Dist: pytz>=2025.1
 Requires-Dist: torch>=2.6.0
 Requires-Dist: tabpfn>=2.0.6
-Requires-Dist: tabpfn-extensions>=0.0.4
-Requires-Dist: shap>=0.47.2
-Requires-Dist: hyperopt>=0.2.7
 Requires-Dist: pytest-is-running>=1.5.1
+Requires-Dist: xgboost>=3.0.0
 
 # wavetrainer
 
@@ -58,10 +56,8 @@ Python 3.11.6:
 - [pytz](https://pythonhosted.org/pytz/)
 - [torch](https://pytorch.org/)
 - [tabpfn](https://github.com/PriorLabs/TabPFN)
-- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
-- [shap](https://shap.readthedocs.io/en/latest/)
-- [hyperopt](https://hyperopt.github.io/hyperopt/)
 - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
+- [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
 
 ## Raison D'être :thought_balloon:
 
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/README.md
@@ -27,10 +27,8 @@ Python 3.11.6:
 - [pytz](https://pythonhosted.org/pytz/)
 - [torch](https://pytorch.org/)
 - [tabpfn](https://github.com/PriorLabs/TabPFN)
-- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
-- [shap](https://shap.readthedocs.io/en/latest/)
-- [hyperopt](https://hyperopt.github.io/hyperopt/)
 - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
+- [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
 
 ## Raison D'être :thought_balloon:
 
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/setup.py
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
 setup(
     name='wavetrainer',
-    version='0.0.38',
+    version='0.0.40',
     description='A library for automatically finding the optimal model within feature and hyperparameter space.',
     long_description=long_description,
     long_description_content_type='text/markdown',
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/tests/trainer_test.py
@@ -13,7 +13,7 @@ class TestTrainer(unittest.TestCase):
 
     def test_trainer(self):
        with tempfile.TemporaryDirectory() as tmpdir:
-            trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=
+            trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=5)
             x_data = [i for i in range(101)]
             x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
             df = pd.DataFrame(
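For orientation, a minimal sketch of how the updated test drives the `Trainer`; the import path and the DataFrame shape here are assumptions based on the surrounding test code, not part of this diff.

```python
# A minimal sketch following the updated test above; the import path and the
# data layout are assumptions, not part of the diff.
import datetime
import tempfile

import pandas as pd

from wavetrainer.trainer import Trainer  # assumed import path

with tempfile.TemporaryDirectory() as tmpdir:
    trainer = Trainer(
        tmpdir,
        walkforward_timedelta=datetime.timedelta(days=7),
        trials=5,  # the smaller trial budget the test now uses
    )
    # The test builds a date-indexed DataFrame with 101 daily rows to fit on.
    index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(101)]
    df = pd.DataFrame(index=index, data={"x": range(101)})
```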
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/calibrator_router.py
@@ -78,7 +78,9 @@ class CalibratorRouter(Calibrator):
     ) -> Self:
         # pylint: disable=no-else-return
         calibrator: Calibrator | None = None
-        if
+        if y is None:
+            raise ValueError("y is null")
+        if determine_model_type(y) == ModelType.REGRESSION:
             calibrator = MAPIECalibrator(self._model)
         else:
             calibrator = VennabersCalibrator(self._model)
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/vennabers_calibrator.py
@@ -1,5 +1,6 @@
 """A calibrator that implements venn abers."""
 
+import logging
 import os
 from typing import Self
 
@@ -54,7 +55,11 @@ class VennabersCalibrator(Calibrator):
         prob_columns = [
             x for x in df.columns.values if x.startswith(PROBABILITY_COLUMN_PREFIX)
         ]
-
+        try:
+            vennabers.fit(df[prob_columns].to_numpy(), y.to_numpy())
+        except IndexError:
+            logging.error(df)
+            raise
         return self
 
     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
wavetrainer-0.0.40/wavetrainer/model/catboost/__init__.py
@@ -0,0 +1 @@
+"""The wavetrain catboost model module."""
{wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_model.py
RENAMED
@@ -10,12 +10,12 @@ import pandas as pd
 import torch
 from catboost import CatBoost, Pool  # type: ignore
 
-from
+from ...model_type import ModelType, determine_model_type
+from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
 from .catboost_classifier_wrap import CatBoostClassifierWrapper
 from .catboost_kwargs import (CAT_FEATURES_ARG_KEY, EVAL_SET_ARG_KEY,
                               ORIGINAL_X_ARG_KEY)
 from .catboost_regressor_wrap import CatBoostRegressorWrapper
-from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
 
 _MODEL_FILENAME = "model.cbm"
 _MODEL_PARAMS_FILENAME = "model_params.json"
@@ -148,7 +148,7 @@ class CatboostModel(Model):
         )
         catboost = self._provide_catboost()
         catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
-        trial.
+        trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
 
     def fit(
         self,
@@ -219,9 +219,10 @@ class CatboostModel(Model):
             best_iteration if best_iteration is not None else self._iterations
         )
         logging.info(
-            "Creating catboost model with depth %d, boosting type %s",
+            "Creating catboost model with depth %d, boosting type %s, best iteration %d",
             self._depth,
             self._boosting_type,
+            -1 if best_iteration is None else best_iteration,
         )
         match self._model_type:
             case ModelType.BINARY:
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/model/model_router.py
@@ -7,15 +7,17 @@ from typing import Any, Self
 import optuna
 import pandas as pd
 
-from .catboost_model import CatboostModel
+from .catboost.catboost_model import CatboostModel
 from .model import Model
-from .tabpfn_model import TabPFNModel
+from .tabpfn.tabpfn_model import TabPFNModel
+from .xgboost.xgboost_model import XGBoostModel
 
 _MODEL_ROUTER_FILE = "model_router.json"
 _MODEL_KEY = "model"
 _MODELS = {
     CatboostModel.name(): CatboostModel,
     TabPFNModel.name(): TabPFNModel,
+    XGBoostModel.name(): XGBoostModel,
 }
 
 
wavetrainer-0.0.40/wavetrainer/model/tabpfn/__init__.py
@@ -0,0 +1 @@
+"""The wavetrain tabpfn model module."""
{wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/tabpfn}/tabpfn_model.py
RENAMED
@@ -14,9 +14,9 @@ import torch
 from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import (  # type: ignore
     AutoTabPFNClassifier, AutoTabPFNRegressor)
 
-from
-from
-from
+from ...exceptions import WavetrainException
+from ...model_type import ModelType, determine_model_type
+from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
 
 _MODEL_FILENAME = "model.pkl"
 _MODEL_PARAMS_FILENAME = "model_params.json"
wavetrainer-0.0.40/wavetrainer/model/xgboost/__init__.py
@@ -0,0 +1 @@
+"""The wavetrain xgboost model module."""
wavetrainer-0.0.40/wavetrainer/model/xgboost/early_stopper.py
@@ -0,0 +1,16 @@
+"""A callback function for early stopping."""
+
+from typing import Any
+
+from xgboost.callback import EarlyStopping, TrainingCallback
+
+
+class XGBoostEarlyStoppingCallback(EarlyStopping):
+    """A callback for early stopping in XGBoost models."""
+
+    def after_iteration(
+        self, model: Any, epoch: int, evals_log: TrainingCallback.EvalsLog
+    ) -> bool:
+        if len(evals_log.keys()) < 1:
+            return False
+        return super().after_iteration(model, epoch, evals_log)
wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_logger.py
@@ -0,0 +1,23 @@
+"""An XGBoost callback class for logging epochs."""
+
+from typing import Any
+
+from xgboost.callback import TrainingCallback
+
+
+class XGBoostEpochsLogger(TrainingCallback):
+    """Log the epochs in XGBoost."""
+
+    def after_iteration(
+        self, model: Any, epoch: int, evals_log: TrainingCallback.EvalsLog
+    ) -> bool:
+        if epoch % 100 != 0:
+            return False
+        log_items = []
+        for dataset, metrics in evals_log.items():
+            for metric_name, values in metrics.items():
+                current_val = values[-1]
+                log_items.append(f"{dataset}-{metric_name}: {current_val:.5f}")
+
+        print(f"XGBoost: [{epoch}] " + " | ".join(log_items))
+        return False
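Taken together, the two callbacks above slot into xgboost's standard callback API. A minimal sketch of how they could be wired into a classifier, assuming the new modules are importable at the paths introduced by this release; the hyperparameter values here are illustrative only, not the values the model actually tunes.

```python
# A minimal sketch, assuming the two callback classes above are importable;
# the constructor arguments are illustrative, not part of the diff.
from xgboost import XGBClassifier

from wavetrainer.model.xgboost.early_stopper import XGBoostEarlyStoppingCallback
from wavetrainer.model.xgboost.xgboost_logger import XGBoostEpochsLogger

clf = XGBClassifier(
    n_estimators=1000,
    eval_metric=["logloss", "error"],
    callbacks=[
        XGBoostEpochsLogger(),                    # prints eval metrics every 100 rounds
        XGBoostEarlyStoppingCallback(rounds=50),  # stops once the eval metric stalls
    ],
)
# clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
```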
wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_model.py
@@ -0,0 +1,277 @@
+"""A model that wraps xgboost."""
+# pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments,too-many-instance-attributes
+
+import json
+import os
+from typing import Any, Self
+
+import optuna
+import pandas as pd
+import torch
+from xgboost import XGBClassifier, XGBRegressor
+from xgboost.callback import TrainingCallback
+
+from ...model_type import ModelType, determine_model_type
+from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
+from .early_stopper import XGBoostEarlyStoppingCallback
+from .xgboost_logger import XGBoostEpochsLogger
+
+_MODEL_FILENAME = "xgboost_model.json"
+_MODEL_PARAMS_FILENAME = "xgboost_model_params.json"
+_MODEL_TYPE_KEY = "model_type"
+_BEST_ITERATION_KEY = "best_iteration"
+
+
+def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
+    output_df = input_df.copy()
+    for col in input_df.select_dtypes(include=["category"]).columns:
+        output_df[col] = output_df[col].cat.codes
+    return output_df
+
+
+class XGBoostModel(Model):
+    """A class that uses XGBoost as a model."""
+
+    _xgboost: XGBRegressor | XGBClassifier | None
+    _model_type: None | ModelType
+    _booster: str | None
+    _lambda: float | None
+    _alpha: float | None
+    _subsample: float | None
+    _colsample_bytree: float | None
+    _max_depth: int | None
+    _min_child_weight: int | None
+    _eta: float | None
+    _gamma: float | None
+    _grow_policy: str | None
+    _sample_type: str | None
+    _normalize_type: str | None
+    _rate_drop: float | None
+    _skip_drop: float | None
+    _num_boost_rounds: int | None
+    _early_stopping_rounds: int | None
+    _best_iteration: int | None
+
+    @classmethod
+    def name(cls) -> str:
+        return "xgboost"
+
+    @classmethod
+    def supports_x(cls, df: pd.DataFrame) -> bool:
+        return True
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._xgboost = None
+        self._model_type = None
+        self._booster = None
+        self._lambda = None
+        self._alpha = None
+        self._subsample = None
+        self._colsample_bytree = None
+        self._max_depth = None
+        self._min_child_weight = None
+        self._eta = None
+        self._gamma = None
+        self._grow_policy = None
+        self._sample_type = None
+        self._normalize_type = None
+        self._rate_drop = None
+        self._skip_drop = None
+        self._num_boost_rounds = None
+        self._early_stopping_rounds = None
+        self._best_iteration = None
+
+    @property
+    def estimator(self) -> Any:
+        return self._provide_xgboost()
+
+    @property
+    def supports_importances(self) -> bool:
+        return True
+
+    @property
+    def feature_importances(self) -> dict[str, float]:
+        bst = self._provide_xgboost()
+        return bst.get_score(importance_type="weight")  # type: ignore
+
+    def pre_fit(
+        self,
+        df: pd.DataFrame,
+        y: pd.Series | pd.DataFrame | None,
+        eval_x: pd.DataFrame | None = None,
+        eval_y: pd.Series | pd.DataFrame | None = None,
+        w: pd.Series | None = None,
+    ):
+        if y is None:
+            raise ValueError("y is null.")
+        self._model_type = determine_model_type(y)
+        return {
+            "eval_set": (eval_x, eval_y),
+            "sample_weight": w,
+        }
+
+    def set_options(
+        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
+    ) -> None:
+        self._booster = trial.suggest_categorical(
+            "booster", ["gbtree", "gblinear", "dart"]
+        )
+        self._lambda = trial.suggest_float("lambda", 1e-8, 1.0, log=True)
+        self._alpha = trial.suggest_float("alpha", 1e-8, 1.0, log=True)
+        self._subsample = trial.suggest_float("subsample", 0.2, 1.0)
+        self._colsample_bytree = trial.suggest_float("colsample_bytree", 0.2, 1.0)
+        if self._booster in ["gbtree", "dart"]:
+            self._max_depth = trial.suggest_int("max_depth", 3, 9, step=2)
+            self._min_child_weight = trial.suggest_int(
+                "min_child_weight", 2, 10, log=True
+            )
+            self._eta = trial.suggest_float("eta", 1e-8, 1.0, log=True)
+            self._gamma = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
+            self._grow_policy = trial.suggest_categorical(
+                "grow_policy", ["depthwise", "lossguide"]
+            )
+        else:
+            self._sample_type = trial.suggest_categorical(
+                "sample_type", ["uniform", "weighted"]
+            )
+            self._normalize_type = trial.suggest_categorical(
+                "normalize_type", ["tree", "forest"]
+            )
+            self._rate_drop = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
+            self._skip_drop = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
+        self._num_boost_rounds = trial.suggest_int("num_boost_rounds", 100, 10000)
+        self._early_stopping_rounds = trial.suggest_int(
+            "early_stopping_rounds", 50, 500
+        )
+        self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
+
+    def load(self, folder: str) -> None:
+        with open(
+            os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
+        ) as handle:
+            params = json.load(handle)
+            self._model_type = ModelType(params[_MODEL_TYPE_KEY])
+            self._best_iteration = params.get(_BEST_ITERATION_KEY)
+        bst = self._provide_xgboost()
+        bst.load_model(os.path.join(folder, _MODEL_FILENAME))
+
+    def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
+        bst = self._provide_xgboost()
+        bst.save_model(os.path.join(folder, _MODEL_FILENAME))
+        with open(
+            os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
+        ) as handle:
+            json.dump(
+                {
+                    _MODEL_TYPE_KEY: str(self._model_type),
+                    _BEST_ITERATION_KEY: self._best_iteration,
+                },
+                handle,
+            )
+        trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
+
+    def fit(
+        self,
+        df: pd.DataFrame,
+        y: pd.Series | pd.DataFrame | None = None,
+        w: pd.Series | None = None,
+        eval_x: pd.DataFrame | None = None,
+        eval_y: pd.Series | pd.DataFrame | None = None,
+    ) -> Self:
+        if y is None:
+            raise ValueError("y is null.")
+        self._model_type = determine_model_type(y)
+        xgboost = self._provide_xgboost()
+        df = _convert_categoricals(df)
+        evals = [(df, y)]
+        if eval_x is not None and eval_y is not None and self._best_iteration is None:
+            eval_x = _convert_categoricals(eval_x)
+            evals.append((eval_x, eval_y))
+        xgboost.fit(  # type: ignore
+            df,
+            y,
+            eval_set=evals,
+            sample_weight=w,
+            verbose=False,
+        )
+        return self
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        x_df = _convert_categoricals(df)
+        xgboost = self._provide_xgboost()
+        pred = xgboost.predict(x_df)
+        df = pd.DataFrame(
+            index=df.index,
+            data={
+                PREDICTION_COLUMN: pred.flatten(),
+            },
+        )
+        if self._model_type != ModelType.REGRESSION:
+            proba = xgboost.predict_proba(x_df)  # type: ignore
+            for i in range(proba.shape[1]):
+                df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
+        return df
+
+    def _provide_xgboost(self) -> XGBClassifier | XGBRegressor:
+        xgboost = self._xgboost
+        if xgboost is None:
+            callbacks: list[TrainingCallback] = [
+                XGBoostEpochsLogger(),
+            ]
+            if self._best_iteration is not None:
+                callbacks.append(
+                    XGBoostEarlyStoppingCallback(rounds=self._early_stopping_rounds)
+                )
+            param = {
+                "objective": "binary:logistic",
+                "tree_method": "gpu_hist" if torch.cuda.is_available() else "exact",
+                # defines booster, gblinear for linear functions.
+                "booster": self._booster,
+                # L2 regularization weight.
+                "reg_lambda": self._lambda,
+                # L1 regularization weight.
+                "alpha": self._alpha,
+                # sampling ratio for training data.
+                "subsample": self._subsample,
+                # sampling according to each tree.
+                "colsample_bytree": self._colsample_bytree,
+                "n_estimators": self._best_iteration
+                if self._best_iteration is not None
+                else self._num_boost_rounds,
+                "base_score": 0.5,
+                "verbosity": 0,
+                "verbose": False,
+                "callbacks": callbacks,
+                "eval_metric": ["logloss", "error"],
+            }
+            if param["booster"] in ["gbtree", "dart"]:
+                # maximum depth of the tree, signifies complexity of the tree.
+                param["max_depth"] = self._max_depth
+                # minimum child weight, larger the term more conservative the tree.
+                param["min_child_weight"] = self._min_child_weight
+                param["eta"] = self._eta
+                # defines how selective algorithm is.
+                param["gamma"] = self._gamma
+                param["grow_policy"] = self._grow_policy
+
+            if param["booster"] == "dart":
+                param["sample_type"] = self._sample_type
+                param["normalize_type"] = self._normalize_type
+                param["rate_drop"] = self._rate_drop
+                param["skip_drop"] = self._skip_drop
+            match self._model_type:
+                case ModelType.BINARY:
+                    xgboost = XGBClassifier(**param)
+                case ModelType.REGRESSION:
+                    param["objective"] = "reg:squarederror"
+                    param["eval_metric"] = ["rmse", "mae"]
+                    xgboost = XGBRegressor(**param)
+                case ModelType.BINNED_BINARY:
+                    xgboost = XGBClassifier(**param)
+                case ModelType.MULTI_CLASSIFICATION:
+                    xgboost = XGBClassifier(**param)
+            self._xgboost = xgboost
+        if xgboost is None:
+            raise ValueError("xgboost is null")
+        return xgboost
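The `set_options` method above samples the model's whole search space from an Optuna trial. For reference, a standalone sketch of that pattern reduced to a handful of the same parameters; the objective body and dataset are placeholders, not part of the diff.

```python
# A standalone sketch of the Optuna sampling pattern used by set_options above;
# the fit step and the returned score are placeholders.
import optuna
from xgboost import XGBClassifier


def objective(trial: optuna.Trial) -> float:
    params = {
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "reg_lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }
    model = XGBClassifier(**params)
    # Fit on training data and return a validation metric here.
    return 0.0  # placeholder score


study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100)
```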
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/combined_reducer.py
@@ -3,6 +3,7 @@
 import json
 import logging
 import os
+import time
 from typing import Self
 
 import optuna
@@ -37,7 +38,7 @@ class CombinedReducer(Reducer):
             DuplicateReducer(),
             CorrelationReducer(),
             SmartCorrelationReducer(),
-            SelectBySingleFeaturePerformanceReducer(),
+            # SelectBySingleFeaturePerformanceReducer(),
         ]
         self._folder = None
 
@@ -99,12 +100,16 @@ class CombinedReducer(Reducer):
     ) -> Self:
         removed_columns_dict = {}
         for reducer in self._reducers:
+            start_reducer = time.time()
             before_columns = set(df.columns.values)
             df = reducer.fit_transform(df, y=y)
             after_columns = set(df.columns.values)
             removed_columns = before_columns.difference(after_columns)
             if removed_columns:
                 removed_columns_dict[reducer.name()] = list(removed_columns)
+            logging.info(
+                "%s reducer took %f", reducer.name(), time.time() - start_reducer
+            )
         if self._folder is not None:
             with open(
                 os.path.join(self._folder, _REMOVED_COLUMNS_FILE), encoding="utf8"
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/nonnumeric_reducer.py
@@ -41,5 +41,6 @@ class NonNumericReducer(Reducer):
     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
         categorical_cols = df.select_dtypes(include="category").columns.tolist()
         numeric_cols = df.select_dtypes(include="number").columns.tolist()
-
+        boolean_cols = df.select_dtypes(include="bool").columns.tolist()
+        keep_cols = categorical_cols + numeric_cols + boolean_cols
         return df[keep_cols]
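The reducer's keep-list is plain pandas `select_dtypes` filtering; a quick illustration with made-up data showing that boolean columns now survive alongside numeric and categorical ones:

```python
# A quick illustration (made-up data) of the dtype-based filtering above.
import pandas as pd

df = pd.DataFrame(
    {
        "num": [1.0, 2.0],
        "cat": pd.Categorical(["a", "b"]),
        "flag": [True, False],
        "text": ["x", "y"],  # object dtype: dropped by the reducer
    }
)
keep = (
    df.select_dtypes(include="category").columns.tolist()
    + df.select_dtypes(include="number").columns.tolist()
    + df.select_dtypes(include="bool").columns.tolist()
)
print(df[keep].columns.tolist())  # ['cat', 'num', 'flag']
```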
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py
@@ -1,5 +1,6 @@
 """A reducer that removes features by their single performance via further heuristics."""
 
+# pylint: disable=too-many-arguments,too-many-positional-arguments
 from typing import Self
 
 import optuna
@@ -7,8 +8,8 @@ import pandas as pd
 from feature_engine.selection import SelectBySingleFeaturePerformance
 from sklearn.ensemble import RandomForestClassifier  # type: ignore
 
-from .base_selector_reducer import BaseSelectorReducer
 from ..model_type import ModelType, determine_model_type
+from .base_selector_reducer import BaseSelectorReducer
 
 _SINGLE_FEATURE_PERFORMANCE_REDUCER_FILENAME = (
     "single_feature_performance_reducer.joblib"
@@ -23,7 +24,7 @@ class SelectBySingleFeaturePerformanceReducer(BaseSelectorReducer):
 
     def __init__(self) -> None:
         self._singlefeatureperformance_selector = SelectBySingleFeaturePerformance(
-            RandomForestClassifier(random_state=42), scoring="accuracy"
+            RandomForestClassifier(random_state=42, n_jobs=-1), scoring="accuracy", cv=1
         )
         super().__init__(
             self._singlefeatureperformance_selector,
@@ -53,5 +54,7 @@ class SelectBySingleFeaturePerformanceReducer(BaseSelectorReducer):
         eval_x: pd.DataFrame | None = None,
         eval_y: pd.Series | pd.DataFrame | None = None,
     ) -> Self:
-        self._singlefeatureperformance_selector.scoring =
+        self._singlefeatureperformance_selector.scoring = (
+            "r2" if determine_model_type(y) == ModelType.REGRESSION else "accuracy"  # type: ignore
+        )
         return super().fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
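For reference, a minimal sketch of the selector configuration above using feature_engine's public API; the data and the commented fit call are stand-ins, and the `cv=1` simply mirrors the diff's choice of trading cross-validation robustness for speed.

```python
# A minimal sketch of the selector configuration above; X and y are stand-ins.
from feature_engine.selection import SelectBySingleFeaturePerformance
from sklearn.ensemble import RandomForestClassifier

selector = SelectBySingleFeaturePerformance(
    RandomForestClassifier(random_state=42, n_jobs=-1),  # parallelise the per-feature forests
    scoring="accuracy",  # swapped to "r2" at fit time for regression targets
    cv=1,  # single split, as in the diff
)
# X = pd.DataFrame(...); y = pd.Series(...)
# selector.fit(X, y); X_reduced = selector.transform(X)
```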
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/trainer.py
@@ -6,6 +6,7 @@ import json
 import logging
 import os
 import pickle
+import time
 from typing import Self
 
 import optuna
@@ -231,6 +232,7 @@ class Trainer(Fit):
 
         try:
             # Window the data
+            start_windower = time.time()
             windower = Windower(self._dt_column)
             windower.set_options(trial, x)
             x_train = windower.fit_transform(x_train)
@@ -240,25 +242,31 @@ class Trainer(Fit):
                 os.removedirs(folder)
                 logging.warning("Y train only contains 1 unique datapoint.")
                 return _BAD_OUTPUT
+            logging.info("Windowing took %f", time.time() - start_windower)
 
             # Perform common reductions
+            start_reducer = time.time()
             reducer = CombinedReducer()
             reducer.set_options(trial, x)
             x_train = reducer.fit_transform(x_train, y=y_train)
             x_test = reducer.transform(x_test)
+            logging.info("Reducing took %f", time.time() - start_reducer)
 
             # Calculate the row weights
+            start_row_weights = time.time()
             weights = CombinedWeights()
             weights.set_options(trial, x)
             w = weights.fit(x_train, y=y_train).transform(y_train.to_frame())[
                 WEIGHTS_COLUMN
             ]
+            logging.info("Row weights took %f", time.time() - start_row_weights)
 
             # Create model
             model = ModelRouter()
             model.set_options(trial, x)
 
             # Train
+            start_train = time.time()
             selector = Selector(model)
             selector.set_options(trial, x)
             selector.fit(x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test)
@@ -267,11 +275,14 @@ class Trainer(Fit):
             x_pred = model.fit_transform(
                 x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test
             )
+            logging.info("Training took %f", time.time() - start_train)
 
             # Calibrate
+            start_calibrate = time.time()
             calibrator = CalibratorRouter(model)
             calibrator.set_options(trial, x)
             calibrator.fit(x_pred, y=y_train)
+            logging.info("Calibrating took %f", time.time() - start_calibrate)
 
             # Output
             y_pred = model.transform(x_test)
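The timing instrumentation added throughout `fit()` reduces to one pattern, repeated per stage:

```python
# The per-stage timing pattern used above, in isolation.
import logging
import time

start = time.time()
# ... run one pipeline stage (windowing, reducing, weighting, training, calibrating) ...
logging.info("Stage took %f", time.time() - start)
```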
@@ -521,8 +532,11 @@ class Trainer(Fit):
             date_path = os.path.join(column_path, date_str)
             if not os.path.isdir(date_path):
                 continue
-
-
-
+            try:
+                model = ModelRouter()
+                model.load(date_path)
+                feature_importances[date_str] = model.feature_importances
+            except FileNotFoundError as exc:
+                logging.warning(str(exc))
 
         return feature_importances
{wavetrainer-0.0.38 → wavetrainer-0.0.40/wavetrainer.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.0.38
+Version: 0.0.40
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
@@ -24,10 +24,8 @@ Requires-Dist: mapie>=0.9.2
 Requires-Dist: pytz>=2025.1
 Requires-Dist: torch>=2.6.0
 Requires-Dist: tabpfn>=2.0.6
-Requires-Dist: tabpfn-extensions>=0.0.4
-Requires-Dist: shap>=0.47.2
-Requires-Dist: hyperopt>=0.2.7
 Requires-Dist: pytest-is-running>=1.5.1
+Requires-Dist: xgboost>=3.0.0
 
 # wavetrainer
 
@@ -58,10 +56,8 @@ Python 3.11.6:
 - [pytz](https://pythonhosted.org/pytz/)
 - [torch](https://pytorch.org/)
 - [tabpfn](https://github.com/PriorLabs/TabPFN)
-- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
-- [shap](https://shap.readthedocs.io/en/latest/)
-- [hyperopt](https://hyperopt.github.io/hyperopt/)
 - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
+- [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
 
 ## Raison D'être :thought_balloon:
 
{wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/SOURCES.txt
@@ -26,13 +26,19 @@ wavetrainer/calibrator/calibrator_router.py
 wavetrainer/calibrator/mapie_calibrator.py
 wavetrainer/calibrator/vennabers_calibrator.py
 wavetrainer/model/__init__.py
-wavetrainer/model/catboost_classifier_wrap.py
-wavetrainer/model/catboost_kwargs.py
-wavetrainer/model/catboost_model.py
-wavetrainer/model/catboost_regressor_wrap.py
 wavetrainer/model/model.py
 wavetrainer/model/model_router.py
-wavetrainer/model/
+wavetrainer/model/catboost/__init__.py
+wavetrainer/model/catboost/catboost_classifier_wrap.py
+wavetrainer/model/catboost/catboost_kwargs.py
+wavetrainer/model/catboost/catboost_model.py
+wavetrainer/model/catboost/catboost_regressor_wrap.py
+wavetrainer/model/tabpfn/__init__.py
+wavetrainer/model/tabpfn/tabpfn_model.py
+wavetrainer/model/xgboost/__init__.py
+wavetrainer/model/xgboost/early_stopper.py
+wavetrainer/model/xgboost/xgboost_logger.py
+wavetrainer/model/xgboost/xgboost_model.py
 wavetrainer/reducer/__init__.py
 wavetrainer/reducer/base_selector_reducer.py
 wavetrainer/reducer/combined_reducer.py