wavetrainer 0.0.39__tar.gz → 0.0.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.39/wavetrainer.egg-info → wavetrainer-0.0.40}/PKG-INFO +3 -7
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/README.md +1 -3
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/requirements.txt +2 -4
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/setup.py +1 -1
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/tests/model/catboost_kwargs_test.py +1 -1
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/tests/trainer_test.py +1 -1
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/calibrator_router.py +3 -1
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/vennabers_calibrator.py +6 -1
- wavetrainer-0.0.40/wavetrainer/model/catboost/__init__.py +1 -0
- {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_model.py +3 -3
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/model/model_router.py +4 -2
- wavetrainer-0.0.40/wavetrainer/model/tabpfn/__init__.py +1 -0
- {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/tabpfn}/tabpfn_model.py +3 -3
- wavetrainer-0.0.40/wavetrainer/model/xgboost/__init__.py +1 -0
- wavetrainer-0.0.40/wavetrainer/model/xgboost/early_stopper.py +16 -0
- wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_logger.py +23 -0
- wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_model.py +277 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/nonnumeric_reducer.py +2 -1
- {wavetrainer-0.0.39 → wavetrainer-0.0.40/wavetrainer.egg-info}/PKG-INFO +3 -7
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/SOURCES.txt +11 -5
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/requires.txt +1 -3
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/LICENSE +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/MANIFEST.in +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/setup.cfg +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/tests/__init__.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/model/model.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/combined_reducer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/correlation_reducer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/selector/selector.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/trainer.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.39
|
3
|
+
Version: 0.0.40
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -24,10 +24,8 @@ Requires-Dist: mapie>=0.9.2
|
|
24
24
|
Requires-Dist: pytz>=2025.1
|
25
25
|
Requires-Dist: torch>=2.6.0
|
26
26
|
Requires-Dist: tabpfn>=2.0.6
|
27
|
-
Requires-Dist: tabpfn-extensions>=0.0.4
|
28
|
-
Requires-Dist: shap>=0.47.2
|
29
|
-
Requires-Dist: hyperopt>=0.2.7
|
30
27
|
Requires-Dist: pytest-is-running>=1.5.1
|
28
|
+
Requires-Dist: xgboost>=3.0.0
|
31
29
|
|
32
30
|
# wavetrainer
|
33
31
|
|
@@ -58,10 +56,8 @@ Python 3.11.6:
|
|
58
56
|
- [pytz](https://pythonhosted.org/pytz/)
|
59
57
|
- [torch](https://pytorch.org/)
|
60
58
|
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
61
|
-
- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
|
62
|
-
- [shap](https://shap.readthedocs.io/en/latest/)
|
63
|
-
- [hyperopt](https://hyperopt.github.io/hyperopt/)
|
64
59
|
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
60
|
+
- [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
|
65
61
|
|
66
62
|
## Raison D'être :thought_balloon:
|
67
63
|
|
@@ -27,10 +27,8 @@ Python 3.11.6:
|
|
27
27
|
- [pytz](https://pythonhosted.org/pytz/)
|
28
28
|
- [torch](https://pytorch.org/)
|
29
29
|
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
30
|
-
- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
|
31
|
-
- [shap](https://shap.readthedocs.io/en/latest/)
|
32
|
-
- [hyperopt](https://hyperopt.github.io/hyperopt/)
|
33
30
|
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
31
|
+
- [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
|
34
32
|
|
35
33
|
## Raison D'être :thought_balloon:
|
36
34
|
|
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
|
|
23
23
|
|
24
24
|
setup(
|
25
25
|
name='wavetrainer',
|
26
|
-
version='0.0.39',
|
26
|
+
version='0.0.40',
|
27
27
|
description='A library for automatically finding the optimal model within feature and hyperparameter space.',
|
28
28
|
long_description=long_description,
|
29
29
|
long_description_content_type='text/markdown',
|
@@ -13,7 +13,7 @@ class TestTrainer(unittest.TestCase):
|
|
13
13
|
|
14
14
|
def test_trainer(self):
|
15
15
|
with tempfile.TemporaryDirectory() as tmpdir:
|
16
|
-
trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=
|
16
|
+
trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=5)
|
17
17
|
x_data = [i for i in range(101)]
|
18
18
|
x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
|
19
19
|
df = pd.DataFrame(
|
@@ -78,7 +78,9 @@ class CalibratorRouter(Calibrator):
|
|
78
78
|
) -> Self:
|
79
79
|
# pylint: disable=no-else-return
|
80
80
|
calibrator: Calibrator | None = None
|
81
|
-
if
|
81
|
+
if y is None:
|
82
|
+
raise ValueError("y is null")
|
83
|
+
if determine_model_type(y) == ModelType.REGRESSION:
|
82
84
|
calibrator = MAPIECalibrator(self._model)
|
83
85
|
else:
|
84
86
|
calibrator = VennabersCalibrator(self._model)
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""A calibrator that implements venn abers."""
|
2
2
|
|
3
|
+
import logging
|
3
4
|
import os
|
4
5
|
from typing import Self
|
5
6
|
|
@@ -54,7 +55,11 @@ class VennabersCalibrator(Calibrator):
|
|
54
55
|
prob_columns = [
|
55
56
|
x for x in df.columns.values if x.startswith(PROBABILITY_COLUMN_PREFIX)
|
56
57
|
]
|
57
|
-
|
58
|
+
try:
|
59
|
+
vennabers.fit(df[prob_columns].to_numpy(), y.to_numpy())
|
60
|
+
except IndexError:
|
61
|
+
logging.error(df)
|
62
|
+
raise
|
58
63
|
return self
|
59
64
|
|
60
65
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
@@ -0,0 +1 @@
|
|
1
|
+
"""The wavetrain catboost model module."""
|
@@ -10,12 +10,12 @@ import pandas as pd
|
|
10
10
|
import torch
|
11
11
|
from catboost import CatBoost, Pool # type: ignore
|
12
12
|
|
13
|
-
from ..model_type import ModelType, determine_model_type
|
13
|
+
from ...model_type import ModelType, determine_model_type
|
14
|
+
from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
14
15
|
from .catboost_classifier_wrap import CatBoostClassifierWrapper
|
15
16
|
from .catboost_kwargs import (CAT_FEATURES_ARG_KEY, EVAL_SET_ARG_KEY,
|
16
17
|
ORIGINAL_X_ARG_KEY)
|
17
18
|
from .catboost_regressor_wrap import CatBoostRegressorWrapper
|
18
|
-
from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
19
19
|
|
20
20
|
_MODEL_FILENAME = "model.cbm"
|
21
21
|
_MODEL_PARAMS_FILENAME = "model_params.json"
|
@@ -222,7 +222,7 @@ class CatboostModel(Model):
|
|
222
222
|
"Creating catboost model with depth %d, boosting type %s, best iteration %d",
|
223
223
|
self._depth,
|
224
224
|
self._boosting_type,
|
225
|
-
best_iteration,
|
225
|
+
-1 if best_iteration is None else best_iteration,
|
226
226
|
)
|
227
227
|
match self._model_type:
|
228
228
|
case ModelType.BINARY:
|
@@ -7,15 +7,17 @@ from typing import Any, Self
|
|
7
7
|
import optuna
|
8
8
|
import pandas as pd
|
9
9
|
|
10
|
-
from .catboost_model import CatboostModel
|
10
|
+
from .catboost.catboost_model import CatboostModel
|
11
11
|
from .model import Model
|
12
|
-
from .tabpfn_model import TabPFNModel
|
12
|
+
from .tabpfn.tabpfn_model import TabPFNModel
|
13
|
+
from .xgboost.xgboost_model import XGBoostModel
|
13
14
|
|
14
15
|
_MODEL_ROUTER_FILE = "model_router.json"
|
15
16
|
_MODEL_KEY = "model"
|
16
17
|
_MODELS = {
|
17
18
|
CatboostModel.name(): CatboostModel,
|
18
19
|
TabPFNModel.name(): TabPFNModel,
|
20
|
+
XGBoostModel.name(): XGBoostModel,
|
19
21
|
}
|
20
22
|
|
21
23
|
|
@@ -0,0 +1 @@
|
|
1
|
+
"""The wavetrain tabpfn model module."""
|
{wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/tabpfn}/tabpfn_model.py
RENAMED
@@ -14,9 +14,9 @@ import torch
|
|
14
14
|
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import ( # type: ignore
|
15
15
|
AutoTabPFNClassifier, AutoTabPFNRegressor)
|
16
16
|
|
17
|
-
from ..exceptions import WavetrainException
|
18
|
-
from ..model_type import ModelType, determine_model_type
|
19
|
-
from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
17
|
+
from ...exceptions import WavetrainException
|
18
|
+
from ...model_type import ModelType, determine_model_type
|
19
|
+
from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
20
20
|
|
21
21
|
_MODEL_FILENAME = "model.pkl"
|
22
22
|
_MODEL_PARAMS_FILENAME = "model_params.json"
|
@@ -0,0 +1 @@
|
|
1
|
+
"""The wavetrain xgboost model module."""
|
@@ -0,0 +1,16 @@
|
|
1
|
+
"""A callback function for early stopping."""
|
2
|
+
|
3
|
+
from typing import Any
|
4
|
+
|
5
|
+
from xgboost.callback import EarlyStopping, TrainingCallback
|
6
|
+
|
7
|
+
|
8
|
+
class XGBoostEarlyStoppingCallback(EarlyStopping):
|
9
|
+
"""A callback for early stopping in XGBoost models."""
|
10
|
+
|
11
|
+
def after_iteration(
|
12
|
+
self, model: Any, epoch: int, evals_log: TrainingCallback.EvalsLog
|
13
|
+
) -> bool:
|
14
|
+
if len(evals_log.keys()) < 1:
|
15
|
+
return False
|
16
|
+
return super().after_iteration(model, epoch, evals_log)
|
@@ -0,0 +1,23 @@
|
|
1
|
+
"""An XGBoost callback class for logging epochs."""
|
2
|
+
|
3
|
+
from typing import Any
|
4
|
+
|
5
|
+
from xgboost.callback import TrainingCallback
|
6
|
+
|
7
|
+
|
8
|
+
class XGBoostEpochsLogger(TrainingCallback):
|
9
|
+
"""Log the epochs in XGBoost."""
|
10
|
+
|
11
|
+
def after_iteration(
|
12
|
+
self, model: Any, epoch: int, evals_log: TrainingCallback.EvalsLog
|
13
|
+
) -> bool:
|
14
|
+
if epoch % 100 != 0:
|
15
|
+
return False
|
16
|
+
log_items = []
|
17
|
+
for dataset, metrics in evals_log.items():
|
18
|
+
for metric_name, values in metrics.items():
|
19
|
+
current_val = values[-1]
|
20
|
+
log_items.append(f"{dataset}-{metric_name}: {current_val:.5f}")
|
21
|
+
|
22
|
+
print(f"XGBoost: [{epoch}] " + " | ".join(log_items))
|
23
|
+
return False
|
@@ -0,0 +1,277 @@
|
|
1
|
+
"""A model that wraps xgboost."""
|
2
|
+
# pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments,too-many-instance-attributes
|
3
|
+
|
4
|
+
import json
|
5
|
+
import os
|
6
|
+
from typing import Any, Self
|
7
|
+
|
8
|
+
import optuna
|
9
|
+
import pandas as pd
|
10
|
+
import torch
|
11
|
+
from xgboost import XGBClassifier, XGBRegressor
|
12
|
+
from xgboost.callback import TrainingCallback
|
13
|
+
|
14
|
+
from ...model_type import ModelType, determine_model_type
|
15
|
+
from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
|
16
|
+
from .early_stopper import XGBoostEarlyStoppingCallback
|
17
|
+
from .xgboost_logger import XGBoostEpochsLogger
|
18
|
+
|
19
|
+
_MODEL_FILENAME = "xgboost_model.json"
|
20
|
+
_MODEL_PARAMS_FILENAME = "xgboost_model_params.json"
|
21
|
+
_MODEL_TYPE_KEY = "model_type"
|
22
|
+
_BEST_ITERATION_KEY = "best_iteration"
|
23
|
+
|
24
|
+
|
25
|
+
def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
|
26
|
+
output_df = input_df.copy()
|
27
|
+
for col in input_df.select_dtypes(include=["category"]).columns:
|
28
|
+
output_df[col] = output_df[col].cat.codes
|
29
|
+
return output_df
|
30
|
+
|
31
|
+
|
32
|
+
class XGBoostModel(Model):
|
33
|
+
"""A class that uses XGBoost as a model."""
|
34
|
+
|
35
|
+
_xgboost: XGBRegressor | XGBClassifier | None
|
36
|
+
_model_type: None | ModelType
|
37
|
+
_booster: str | None
|
38
|
+
_lambda: float | None
|
39
|
+
_alpha: float | None
|
40
|
+
_subsample: float | None
|
41
|
+
_colsample_bytree: float | None
|
42
|
+
_max_depth: int | None
|
43
|
+
_min_child_weight: int | None
|
44
|
+
_eta: float | None
|
45
|
+
_gamma: float | None
|
46
|
+
_grow_policy: str | None
|
47
|
+
_sample_type: str | None
|
48
|
+
_normalize_type: str | None
|
49
|
+
_rate_drop: float | None
|
50
|
+
_skip_drop: float | None
|
51
|
+
_num_boost_rounds: int | None
|
52
|
+
_early_stopping_rounds: int | None
|
53
|
+
_best_iteration: int | None
|
54
|
+
|
55
|
+
@classmethod
|
56
|
+
def name(cls) -> str:
|
57
|
+
return "xgboost"
|
58
|
+
|
59
|
+
@classmethod
|
60
|
+
def supports_x(cls, df: pd.DataFrame) -> bool:
|
61
|
+
return True
|
62
|
+
|
63
|
+
def __init__(self) -> None:
|
64
|
+
super().__init__()
|
65
|
+
self._xgboost = None
|
66
|
+
self._model_type = None
|
67
|
+
self._booster = None
|
68
|
+
self._lambda = None
|
69
|
+
self._alpha = None
|
70
|
+
self._subsample = None
|
71
|
+
self._colsample_bytree = None
|
72
|
+
self._max_depth = None
|
73
|
+
self._min_child_weight = None
|
74
|
+
self._eta = None
|
75
|
+
self._gamma = None
|
76
|
+
self._grow_policy = None
|
77
|
+
self._sample_type = None
|
78
|
+
self._normalize_type = None
|
79
|
+
self._rate_drop = None
|
80
|
+
self._skip_drop = None
|
81
|
+
self._num_boost_rounds = None
|
82
|
+
self._early_stopping_rounds = None
|
83
|
+
self._best_iteration = None
|
84
|
+
|
85
|
+
@property
|
86
|
+
def estimator(self) -> Any:
|
87
|
+
return self._provide_xgboost()
|
88
|
+
|
89
|
+
@property
|
90
|
+
def supports_importances(self) -> bool:
|
91
|
+
return True
|
92
|
+
|
93
|
+
@property
|
94
|
+
def feature_importances(self) -> dict[str, float]:
|
95
|
+
bst = self._provide_xgboost()
|
96
|
+
return bst.get_score(importance_type="weight") # type: ignore
|
97
|
+
|
98
|
+
def pre_fit(
|
99
|
+
self,
|
100
|
+
df: pd.DataFrame,
|
101
|
+
y: pd.Series | pd.DataFrame | None,
|
102
|
+
eval_x: pd.DataFrame | None = None,
|
103
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
104
|
+
w: pd.Series | None = None,
|
105
|
+
):
|
106
|
+
if y is None:
|
107
|
+
raise ValueError("y is null.")
|
108
|
+
self._model_type = determine_model_type(y)
|
109
|
+
return {
|
110
|
+
"eval_set": (eval_x, eval_y),
|
111
|
+
"sample_weight": w,
|
112
|
+
}
|
113
|
+
|
114
|
+
def set_options(
|
115
|
+
self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
|
116
|
+
) -> None:
|
117
|
+
self._booster = trial.suggest_categorical(
|
118
|
+
"booster", ["gbtree", "gblinear", "dart"]
|
119
|
+
)
|
120
|
+
self._lambda = trial.suggest_float("lambda", 1e-8, 1.0, log=True)
|
121
|
+
self._alpha = trial.suggest_float("alpha", 1e-8, 1.0, log=True)
|
122
|
+
self._subsample = trial.suggest_float("subsample", 0.2, 1.0)
|
123
|
+
self._colsample_bytree = trial.suggest_float("colsample_bytree", 0.2, 1.0)
|
124
|
+
if self._booster in ["gbtree", "dart"]:
|
125
|
+
self._max_depth = trial.suggest_int("max_depth", 3, 9, step=2)
|
126
|
+
self._min_child_weight = trial.suggest_int(
|
127
|
+
"min_child_weight", 2, 10, log=True
|
128
|
+
)
|
129
|
+
self._eta = trial.suggest_float("eta", 1e-8, 1.0, log=True)
|
130
|
+
self._gamma = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
|
131
|
+
self._grow_policy = trial.suggest_categorical(
|
132
|
+
"grow_policy", ["depthwise", "lossguide"]
|
133
|
+
)
|
134
|
+
else:
|
135
|
+
self._sample_type = trial.suggest_categorical(
|
136
|
+
"sample_type", ["uniform", "weighted"]
|
137
|
+
)
|
138
|
+
self._normalize_type = trial.suggest_categorical(
|
139
|
+
"normalize_type", ["tree", "forest"]
|
140
|
+
)
|
141
|
+
self._rate_drop = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
|
142
|
+
self._skip_drop = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
|
143
|
+
self._num_boost_rounds = trial.suggest_int("num_boost_rounds", 100, 10000)
|
144
|
+
self._early_stopping_rounds = trial.suggest_int(
|
145
|
+
"early_stopping_rounds", 50, 500
|
146
|
+
)
|
147
|
+
self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
|
148
|
+
|
149
|
+
def load(self, folder: str) -> None:
|
150
|
+
with open(
|
151
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
|
152
|
+
) as handle:
|
153
|
+
params = json.load(handle)
|
154
|
+
self._model_type = ModelType(params[_MODEL_TYPE_KEY])
|
155
|
+
self._best_iteration = params.get(_BEST_ITERATION_KEY)
|
156
|
+
bst = self._provide_xgboost()
|
157
|
+
bst.load_model(os.path.join(folder, _MODEL_FILENAME))
|
158
|
+
|
159
|
+
def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
|
160
|
+
bst = self._provide_xgboost()
|
161
|
+
bst.save_model(os.path.join(folder, _MODEL_FILENAME))
|
162
|
+
with open(
|
163
|
+
os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
|
164
|
+
) as handle:
|
165
|
+
json.dump(
|
166
|
+
{
|
167
|
+
_MODEL_TYPE_KEY: str(self._model_type),
|
168
|
+
_BEST_ITERATION_KEY: self._best_iteration,
|
169
|
+
},
|
170
|
+
handle,
|
171
|
+
)
|
172
|
+
trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
|
173
|
+
|
174
|
+
def fit(
|
175
|
+
self,
|
176
|
+
df: pd.DataFrame,
|
177
|
+
y: pd.Series | pd.DataFrame | None = None,
|
178
|
+
w: pd.Series | None = None,
|
179
|
+
eval_x: pd.DataFrame | None = None,
|
180
|
+
eval_y: pd.Series | pd.DataFrame | None = None,
|
181
|
+
) -> Self:
|
182
|
+
if y is None:
|
183
|
+
raise ValueError("y is null.")
|
184
|
+
self._model_type = determine_model_type(y)
|
185
|
+
xgboost = self._provide_xgboost()
|
186
|
+
df = _convert_categoricals(df)
|
187
|
+
evals = [(df, y)]
|
188
|
+
if eval_x is not None and eval_y is not None and self._best_iteration is None:
|
189
|
+
eval_x = _convert_categoricals(eval_x)
|
190
|
+
evals.append((eval_x, eval_y))
|
191
|
+
xgboost.fit( # type: ignore
|
192
|
+
df,
|
193
|
+
y,
|
194
|
+
eval_set=evals,
|
195
|
+
sample_weight=w,
|
196
|
+
verbose=False,
|
197
|
+
)
|
198
|
+
return self
|
199
|
+
|
200
|
+
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
201
|
+
x_df = _convert_categoricals(df)
|
202
|
+
xgboost = self._provide_xgboost()
|
203
|
+
pred = xgboost.predict(x_df)
|
204
|
+
df = pd.DataFrame(
|
205
|
+
index=df.index,
|
206
|
+
data={
|
207
|
+
PREDICTION_COLUMN: pred.flatten(),
|
208
|
+
},
|
209
|
+
)
|
210
|
+
if self._model_type != ModelType.REGRESSION:
|
211
|
+
proba = xgboost.predict_proba(x_df) # type: ignore
|
212
|
+
for i in range(proba.shape[1]):
|
213
|
+
df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
|
214
|
+
return df
|
215
|
+
|
216
|
+
def _provide_xgboost(self) -> XGBClassifier | XGBRegressor:
|
217
|
+
xgboost = self._xgboost
|
218
|
+
if xgboost is None:
|
219
|
+
callbacks: list[TrainingCallback] = [
|
220
|
+
XGBoostEpochsLogger(),
|
221
|
+
]
|
222
|
+
if self._best_iteration is not None:
|
223
|
+
callbacks.append(
|
224
|
+
XGBoostEarlyStoppingCallback(rounds=self._early_stopping_rounds)
|
225
|
+
)
|
226
|
+
param = {
|
227
|
+
"objective": "binary:logistic",
|
228
|
+
"tree_method": "gpu_hist" if torch.cuda.is_available() else "exact",
|
229
|
+
# defines booster, gblinear for linear functions.
|
230
|
+
"booster": self._booster,
|
231
|
+
# L2 regularization weight.
|
232
|
+
"reg_lambda": self._lambda,
|
233
|
+
# L1 regularization weight.
|
234
|
+
"alpha": self._alpha,
|
235
|
+
# sampling ratio for training data.
|
236
|
+
"subsample": self._subsample,
|
237
|
+
# sampling according to each tree.
|
238
|
+
"colsample_bytree": self._colsample_bytree,
|
239
|
+
"n_estimators": self._best_iteration
|
240
|
+
if self._best_iteration is not None
|
241
|
+
else self._num_boost_rounds,
|
242
|
+
"base_score": 0.5,
|
243
|
+
"verbosity": 0,
|
244
|
+
"verbose": False,
|
245
|
+
"callbacks": callbacks,
|
246
|
+
"eval_metric": ["logloss", "error"],
|
247
|
+
}
|
248
|
+
if param["booster"] in ["gbtree", "dart"]:
|
249
|
+
# maximum depth of the tree, signifies complexity of the tree.
|
250
|
+
param["max_depth"] = self._max_depth
|
251
|
+
# minimum child weight, larger the term more conservative the tree.
|
252
|
+
param["min_child_weight"] = self._min_child_weight
|
253
|
+
param["eta"] = self._eta
|
254
|
+
# defines how selective algorithm is.
|
255
|
+
param["gamma"] = self._gamma
|
256
|
+
param["grow_policy"] = self._grow_policy
|
257
|
+
|
258
|
+
if param["booster"] == "dart":
|
259
|
+
param["sample_type"] = self._sample_type
|
260
|
+
param["normalize_type"] = self._normalize_type
|
261
|
+
param["rate_drop"] = self._rate_drop
|
262
|
+
param["skip_drop"] = self._skip_drop
|
263
|
+
match self._model_type:
|
264
|
+
case ModelType.BINARY:
|
265
|
+
xgboost = XGBClassifier(**param)
|
266
|
+
case ModelType.REGRESSION:
|
267
|
+
param["objective"] = "reg:squarederror"
|
268
|
+
param["eval_metric"] = ["rmse", "mae"]
|
269
|
+
xgboost = XGBRegressor(**param)
|
270
|
+
case ModelType.BINNED_BINARY:
|
271
|
+
xgboost = XGBClassifier(**param)
|
272
|
+
case ModelType.MULTI_CLASSIFICATION:
|
273
|
+
xgboost = XGBClassifier(**param)
|
274
|
+
self._xgboost = xgboost
|
275
|
+
if xgboost is None:
|
276
|
+
raise ValueError("xgboost is null")
|
277
|
+
return xgboost
|
@@ -41,5 +41,6 @@ class NonNumericReducer(Reducer):
|
|
41
41
|
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
42
42
|
categorical_cols = df.select_dtypes(include="category").columns.tolist()
|
43
43
|
numeric_cols = df.select_dtypes(include="number").columns.tolist()
|
44
|
-
|
44
|
+
boolean_cols = df.select_dtypes(include="bool").columns.tolist()
|
45
|
+
keep_cols = categorical_cols + numeric_cols + boolean_cols
|
45
46
|
return df[keep_cols]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: wavetrainer
|
3
|
-
Version: 0.0.39
|
3
|
+
Version: 0.0.40
|
4
4
|
Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
|
5
5
|
Home-page: https://github.com/8W9aG/wavetrainer
|
6
6
|
Author: Will Sackfield
|
@@ -24,10 +24,8 @@ Requires-Dist: mapie>=0.9.2
|
|
24
24
|
Requires-Dist: pytz>=2025.1
|
25
25
|
Requires-Dist: torch>=2.6.0
|
26
26
|
Requires-Dist: tabpfn>=2.0.6
|
27
|
-
Requires-Dist: tabpfn-extensions>=0.0.4
|
28
|
-
Requires-Dist: shap>=0.47.2
|
29
|
-
Requires-Dist: hyperopt>=0.2.7
|
30
27
|
Requires-Dist: pytest-is-running>=1.5.1
|
28
|
+
Requires-Dist: xgboost>=3.0.0
|
31
29
|
|
32
30
|
# wavetrainer
|
33
31
|
|
@@ -58,10 +56,8 @@ Python 3.11.6:
|
|
58
56
|
- [pytz](https://pythonhosted.org/pytz/)
|
59
57
|
- [torch](https://pytorch.org/)
|
60
58
|
- [tabpfn](https://github.com/PriorLabs/TabPFN)
|
61
|
-
- [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
|
62
|
-
- [shap](https://shap.readthedocs.io/en/latest/)
|
63
|
-
- [hyperopt](https://hyperopt.github.io/hyperopt/)
|
64
59
|
- [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
|
60
|
+
- [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
|
65
61
|
|
66
62
|
## Raison D'être :thought_balloon:
|
67
63
|
|
@@ -26,13 +26,19 @@ wavetrainer/calibrator/calibrator_router.py
|
|
26
26
|
wavetrainer/calibrator/mapie_calibrator.py
|
27
27
|
wavetrainer/calibrator/vennabers_calibrator.py
|
28
28
|
wavetrainer/model/__init__.py
|
29
|
-
wavetrainer/model/catboost_classifier_wrap.py
|
30
|
-
wavetrainer/model/catboost_kwargs.py
|
31
|
-
wavetrainer/model/catboost_model.py
|
32
|
-
wavetrainer/model/catboost_regressor_wrap.py
|
33
29
|
wavetrainer/model/model.py
|
34
30
|
wavetrainer/model/model_router.py
|
35
|
-
wavetrainer/model/tabpfn_model.py
|
31
|
+
wavetrainer/model/catboost/__init__.py
|
32
|
+
wavetrainer/model/catboost/catboost_classifier_wrap.py
|
33
|
+
wavetrainer/model/catboost/catboost_kwargs.py
|
34
|
+
wavetrainer/model/catboost/catboost_model.py
|
35
|
+
wavetrainer/model/catboost/catboost_regressor_wrap.py
|
36
|
+
wavetrainer/model/tabpfn/__init__.py
|
37
|
+
wavetrainer/model/tabpfn/tabpfn_model.py
|
38
|
+
wavetrainer/model/xgboost/__init__.py
|
39
|
+
wavetrainer/model/xgboost/early_stopper.py
|
40
|
+
wavetrainer/model/xgboost/xgboost_logger.py
|
41
|
+
wavetrainer/model/xgboost/xgboost_model.py
|
36
42
|
wavetrainer/reducer/__init__.py
|
37
43
|
wavetrainer/reducer/base_selector_reducer.py
|
38
44
|
wavetrainer/reducer/combined_reducer.py
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/non_categorical_numeric_columns.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|