wavetrainer 0.0.39__tar.gz → 0.0.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {wavetrainer-0.0.39/wavetrainer.egg-info → wavetrainer-0.0.40}/PKG-INFO +3 -7
  2. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/README.md +1 -3
  3. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/requirements.txt +2 -4
  4. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/setup.py +1 -1
  5. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/tests/model/catboost_kwargs_test.py +1 -1
  6. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/tests/trainer_test.py +1 -1
  7. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/__init__.py +1 -1
  8. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/calibrator_router.py +3 -1
  9. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/vennabers_calibrator.py +6 -1
  10. wavetrainer-0.0.40/wavetrainer/model/catboost/__init__.py +1 -0
  11. {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_model.py +3 -3
  12. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/model/model_router.py +4 -2
  13. wavetrainer-0.0.40/wavetrainer/model/tabpfn/__init__.py +1 -0
  14. {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/tabpfn}/tabpfn_model.py +3 -3
  15. wavetrainer-0.0.40/wavetrainer/model/xgboost/__init__.py +1 -0
  16. wavetrainer-0.0.40/wavetrainer/model/xgboost/early_stopper.py +16 -0
  17. wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_logger.py +23 -0
  18. wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_model.py +277 -0
  19. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/nonnumeric_reducer.py +2 -1
  20. {wavetrainer-0.0.39 → wavetrainer-0.0.40/wavetrainer.egg-info}/PKG-INFO +3 -7
  21. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/SOURCES.txt +11 -5
  22. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/requires.txt +1 -3
  23. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/LICENSE +0 -0
  24. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/MANIFEST.in +0 -0
  25. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/setup.cfg +0 -0
  26. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/tests/__init__.py +0 -0
  27. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/tests/model/__init__.py +0 -0
  28. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/__init__.py +0 -0
  29. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/calibrator.py +0 -0
  30. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
  31. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/create.py +0 -0
  32. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/exceptions.py +0 -0
  33. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/fit.py +0 -0
  34. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/model/__init__.py +0 -0
  35. {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_classifier_wrap.py +0 -0
  36. {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_kwargs.py +0 -0
  37. {wavetrainer-0.0.39/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_regressor_wrap.py +0 -0
  38. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/model/model.py +0 -0
  39. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/model_type.py +0 -0
  40. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/params.py +0 -0
  41. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/__init__.py +0 -0
  42. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  43. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/combined_reducer.py +0 -0
  44. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/constant_reducer.py +0 -0
  45. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/correlation_reducer.py +0 -0
  46. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  47. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  48. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/reducer.py +0 -0
  49. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
  50. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  51. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/reducer/unseen_reducer.py +0 -0
  52. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/selector/__init__.py +0 -0
  53. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/selector/selector.py +0 -0
  54. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/trainer.py +0 -0
  55. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/__init__.py +0 -0
  56. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/class_weights.py +0 -0
  57. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/combined_weights.py +0 -0
  58. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/exponential_weights.py +0 -0
  59. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/linear_weights.py +0 -0
  60. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/noop_weights.py +0 -0
  61. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/sigmoid_weights.py +0 -0
  62. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/weights.py +0 -0
  63. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/weights/weights_router.py +0 -0
  64. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/windower/__init__.py +0 -0
  65. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer/windower/windower.py +0 -0
  66. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/dependency_links.txt +0 -0
  67. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/not-zip-safe +0 -0
  68. {wavetrainer-0.0.39 → wavetrainer-0.0.40}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.39
3
+ Version: 0.0.40
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -24,10 +24,8 @@ Requires-Dist: mapie>=0.9.2
24
24
  Requires-Dist: pytz>=2025.1
25
25
  Requires-Dist: torch>=2.6.0
26
26
  Requires-Dist: tabpfn>=2.0.6
27
- Requires-Dist: tabpfn-extensions>=0.0.4
28
- Requires-Dist: shap>=0.47.2
29
- Requires-Dist: hyperopt>=0.2.7
30
27
  Requires-Dist: pytest-is-running>=1.5.1
28
+ Requires-Dist: xgboost>=3.0.0
31
29
 
32
30
  # wavetrainer
33
31
 
@@ -58,10 +56,8 @@ Python 3.11.6:
58
56
  - [pytz](https://pythonhosted.org/pytz/)
59
57
  - [torch](https://pytorch.org/)
60
58
  - [tabpfn](https://github.com/PriorLabs/TabPFN)
61
- - [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
62
- - [shap](https://shap.readthedocs.io/en/latest/)
63
- - [hyperopt](https://hyperopt.github.io/hyperopt/)
64
59
  - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
60
+ - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
65
61
 
66
62
  ## Raison D'être :thought_balloon:
67
63
 
@@ -27,10 +27,8 @@ Python 3.11.6:
27
27
  - [pytz](https://pythonhosted.org/pytz/)
28
28
  - [torch](https://pytorch.org/)
29
29
  - [tabpfn](https://github.com/PriorLabs/TabPFN)
30
- - [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
31
- - [shap](https://shap.readthedocs.io/en/latest/)
32
- - [hyperopt](https://hyperopt.github.io/hyperopt/)
33
30
  - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
31
+ - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
34
32
 
35
33
  ## Raison D'être :thought_balloon:
36
34
 
@@ -11,7 +11,5 @@ mapie>=0.9.2
11
11
  pytz>=2025.1
12
12
  torch>=2.6.0
13
13
  tabpfn>=2.0.6
14
- tabpfn-extensions>=0.0.4
15
- shap>=0.47.2
16
- hyperopt>=0.2.7
17
- pytest-is-running>=1.5.1
14
+ pytest-is-running>=1.5.1
15
+ xgboost>=3.0.0
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
23
23
 
24
24
  setup(
25
25
  name='wavetrainer',
26
- version='0.0.39',
26
+ version='0.0.40',
27
27
  description='A library for automatically finding the optimal model within feature and hyperparameter space.',
28
28
  long_description=long_description,
29
29
  long_description_content_type='text/markdown',
@@ -3,7 +3,7 @@ import unittest
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from wavetrainer.model.catboost_kwargs import handle_fit_kwargs
6
+ from wavetrainer.model.catboost.catboost_kwargs import handle_fit_kwargs
7
7
 
8
8
 
9
9
  class TestCatboostKwargs(unittest.TestCase):
@@ -13,7 +13,7 @@ class TestTrainer(unittest.TestCase):
13
13
 
14
14
  def test_trainer(self):
15
15
  with tempfile.TemporaryDirectory() as tmpdir:
16
- trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=1)
16
+ trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=5)
17
17
  x_data = [i for i in range(101)]
18
18
  x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
19
19
  df = pd.DataFrame(
@@ -2,5 +2,5 @@
2
2
 
3
3
  from .create import create
4
4
 
5
- __VERSION__ = "0.0.39"
5
+ __VERSION__ = "0.0.40"
6
6
  __all__ = ("create",)
@@ -78,7 +78,9 @@ class CalibratorRouter(Calibrator):
78
78
  ) -> Self:
79
79
  # pylint: disable=no-else-return
80
80
  calibrator: Calibrator | None = None
81
- if determine_model_type(df) == ModelType.REGRESSION:
81
+ if y is None:
82
+ raise ValueError("y is null")
83
+ if determine_model_type(y) == ModelType.REGRESSION:
82
84
  calibrator = MAPIECalibrator(self._model)
83
85
  else:
84
86
  calibrator = VennabersCalibrator(self._model)
@@ -1,5 +1,6 @@
1
1
  """A calibrator that implements venn abers."""
2
2
 
3
+ import logging
3
4
  import os
4
5
  from typing import Self
5
6
 
@@ -54,7 +55,11 @@ class VennabersCalibrator(Calibrator):
54
55
  prob_columns = [
55
56
  x for x in df.columns.values if x.startswith(PROBABILITY_COLUMN_PREFIX)
56
57
  ]
57
- vennabers.fit(df[prob_columns].to_numpy(), y.to_numpy())
58
+ try:
59
+ vennabers.fit(df[prob_columns].to_numpy(), y.to_numpy())
60
+ except IndexError:
61
+ logging.error(df)
62
+ raise
58
63
  return self
59
64
 
60
65
  def transform(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -0,0 +1 @@
1
+ """The wavetrain catboost model module."""
@@ -10,12 +10,12 @@ import pandas as pd
10
10
  import torch
11
11
  from catboost import CatBoost, Pool # type: ignore
12
12
 
13
- from ..model_type import ModelType, determine_model_type
13
+ from ...model_type import ModelType, determine_model_type
14
+ from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
14
15
  from .catboost_classifier_wrap import CatBoostClassifierWrapper
15
16
  from .catboost_kwargs import (CAT_FEATURES_ARG_KEY, EVAL_SET_ARG_KEY,
16
17
  ORIGINAL_X_ARG_KEY)
17
18
  from .catboost_regressor_wrap import CatBoostRegressorWrapper
18
- from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
19
19
 
20
20
  _MODEL_FILENAME = "model.cbm"
21
21
  _MODEL_PARAMS_FILENAME = "model_params.json"
@@ -222,7 +222,7 @@ class CatboostModel(Model):
222
222
  "Creating catboost model with depth %d, boosting type %s, best iteration %d",
223
223
  self._depth,
224
224
  self._boosting_type,
225
- best_iteration,
225
+ -1 if best_iteration is None else best_iteration,
226
226
  )
227
227
  match self._model_type:
228
228
  case ModelType.BINARY:
@@ -7,15 +7,17 @@ from typing import Any, Self
7
7
  import optuna
8
8
  import pandas as pd
9
9
 
10
- from .catboost_model import CatboostModel
10
+ from .catboost.catboost_model import CatboostModel
11
11
  from .model import Model
12
- from .tabpfn_model import TabPFNModel
12
+ from .tabpfn.tabpfn_model import TabPFNModel
13
+ from .xgboost.xgboost_model import XGBoostModel
13
14
 
14
15
  _MODEL_ROUTER_FILE = "model_router.json"
15
16
  _MODEL_KEY = "model"
16
17
  _MODELS = {
17
18
  CatboostModel.name(): CatboostModel,
18
19
  TabPFNModel.name(): TabPFNModel,
20
+ XGBoostModel.name(): XGBoostModel,
19
21
  }
20
22
 
21
23
 
@@ -0,0 +1 @@
1
+ """The wavetrain tabpfn model module."""
@@ -14,9 +14,9 @@ import torch
14
14
  from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import ( # type: ignore
15
15
  AutoTabPFNClassifier, AutoTabPFNRegressor)
16
16
 
17
- from ..exceptions import WavetrainException
18
- from ..model_type import ModelType, determine_model_type
19
- from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
17
+ from ...exceptions import WavetrainException
18
+ from ...model_type import ModelType, determine_model_type
19
+ from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
20
20
 
21
21
  _MODEL_FILENAME = "model.pkl"
22
22
  _MODEL_PARAMS_FILENAME = "model_params.json"
@@ -0,0 +1 @@
1
+ """The wavetrain xgboost model module."""
@@ -0,0 +1,16 @@
1
+ """A callback function for early stopping."""
2
+
3
+ from typing import Any
4
+
5
+ from xgboost.callback import EarlyStopping, TrainingCallback
6
+
7
+
8
class XGBoostEarlyStoppingCallback(EarlyStopping):
    """An ``EarlyStopping`` callback that tolerates an empty evaluation log."""

    def after_iteration(
        self, model: Any, epoch: int, evals_log: TrainingCallback.EvalsLog
    ) -> bool:
        """Defer to ``EarlyStopping`` unless ``evals_log`` has no entries.

        Returns False when ``evals_log`` is empty; otherwise returns whatever
        the parent implementation returns for the same arguments.
        """
        if not evals_log:
            # Nothing has been evaluated, so the parent is not consulted.
            return False
        return super().after_iteration(model, epoch, evals_log)
@@ -0,0 +1,23 @@
1
+ """An XGBoost callback class for logging epochs."""
2
+
3
+ from typing import Any
4
+
5
+ from xgboost.callback import TrainingCallback
6
+
7
+
8
class XGBoostEpochsLogger(TrainingCallback):
    """Print the latest evaluation metrics on every 100th epoch."""

    def after_iteration(
        self, model: Any, epoch: int, evals_log: TrainingCallback.EvalsLog
    ) -> bool:
        """Print one summary line when ``epoch`` is a multiple of 100.

        The line holds the most recent value of every metric of every dataset
        found in ``evals_log``. This method always returns False.
        """
        if epoch % 100 == 0:
            summary = " | ".join(
                f"{dataset}-{metric_name}: {values[-1]:.5f}"
                for dataset, metrics in evals_log.items()
                for metric_name, values in metrics.items()
            )
            print(f"XGBoost: [{epoch}] " + summary)
        return False
@@ -0,0 +1,277 @@
1
+ """A model that wraps xgboost."""
2
+ # pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments,too-many-instance-attributes
3
+
4
+ import json
5
+ import os
6
+ from typing import Any, Self
7
+
8
+ import optuna
9
+ import pandas as pd
10
+ import torch
11
+ from xgboost import XGBClassifier, XGBRegressor
12
+ from xgboost.callback import TrainingCallback
13
+
14
+ from ...model_type import ModelType, determine_model_type
15
+ from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
16
+ from .early_stopper import XGBoostEarlyStoppingCallback
17
+ from .xgboost_logger import XGBoostEpochsLogger
18
+
19
+ _MODEL_FILENAME = "xgboost_model.json"
20
+ _MODEL_PARAMS_FILENAME = "xgboost_model_params.json"
21
+ _MODEL_TYPE_KEY = "model_type"
22
+ _BEST_ITERATION_KEY = "best_iteration"
23
+
24
+
25
+ def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
26
+ output_df = input_df.copy()
27
+ for col in input_df.select_dtypes(include=["category"]).columns:
28
+ output_df[col] = output_df[col].cat.codes
29
+ return output_df
30
+
31
+
32
+ class XGBoostModel(Model):
33
+ """A class that uses XGBoost as a model."""
34
+
35
+ _xgboost: XGBRegressor | XGBClassifier | None
36
+ _model_type: None | ModelType
37
+ _booster: str | None
38
+ _lambda: float | None
39
+ _alpha: float | None
40
+ _subsample: float | None
41
+ _colsample_bytree: float | None
42
+ _max_depth: int | None
43
+ _min_child_weight: int | None
44
+ _eta: float | None
45
+ _gamma: float | None
46
+ _grow_policy: str | None
47
+ _sample_type: str | None
48
+ _normalize_type: str | None
49
+ _rate_drop: float | None
50
+ _skip_drop: float | None
51
+ _num_boost_rounds: int | None
52
+ _early_stopping_rounds: int | None
53
+ _best_iteration: int | None
54
+
55
+ @classmethod
56
+ def name(cls) -> str:
57
+ return "xgboost"
58
+
59
+ @classmethod
60
+ def supports_x(cls, df: pd.DataFrame) -> bool:
61
+ return True
62
+
63
+ def __init__(self) -> None:
64
+ super().__init__()
65
+ self._xgboost = None
66
+ self._model_type = None
67
+ self._booster = None
68
+ self._lambda = None
69
+ self._alpha = None
70
+ self._subsample = None
71
+ self._colsample_bytree = None
72
+ self._max_depth = None
73
+ self._min_child_weight = None
74
+ self._eta = None
75
+ self._gamma = None
76
+ self._grow_policy = None
77
+ self._sample_type = None
78
+ self._normalize_type = None
79
+ self._rate_drop = None
80
+ self._skip_drop = None
81
+ self._num_boost_rounds = None
82
+ self._early_stopping_rounds = None
83
+ self._best_iteration = None
84
+
85
+ @property
86
+ def estimator(self) -> Any:
87
+ return self._provide_xgboost()
88
+
89
+ @property
90
+ def supports_importances(self) -> bool:
91
+ return True
92
+
93
+ @property
94
+ def feature_importances(self) -> dict[str, float]:
95
+ bst = self._provide_xgboost()
96
+ return bst.get_score(importance_type="weight") # type: ignore
97
+
98
+ def pre_fit(
99
+ self,
100
+ df: pd.DataFrame,
101
+ y: pd.Series | pd.DataFrame | None,
102
+ eval_x: pd.DataFrame | None = None,
103
+ eval_y: pd.Series | pd.DataFrame | None = None,
104
+ w: pd.Series | None = None,
105
+ ):
106
+ if y is None:
107
+ raise ValueError("y is null.")
108
+ self._model_type = determine_model_type(y)
109
+ return {
110
+ "eval_set": (eval_x, eval_y),
111
+ "sample_weight": w,
112
+ }
113
+
114
+ def set_options(
115
+ self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
116
+ ) -> None:
117
+ self._booster = trial.suggest_categorical(
118
+ "booster", ["gbtree", "gblinear", "dart"]
119
+ )
120
+ self._lambda = trial.suggest_float("lambda", 1e-8, 1.0, log=True)
121
+ self._alpha = trial.suggest_float("alpha", 1e-8, 1.0, log=True)
122
+ self._subsample = trial.suggest_float("subsample", 0.2, 1.0)
123
+ self._colsample_bytree = trial.suggest_float("colsample_bytree", 0.2, 1.0)
124
+ if self._booster in ["gbtree", "dart"]:
125
+ self._max_depth = trial.suggest_int("max_depth", 3, 9, step=2)
126
+ self._min_child_weight = trial.suggest_int(
127
+ "min_child_weight", 2, 10, log=True
128
+ )
129
+ self._eta = trial.suggest_float("eta", 1e-8, 1.0, log=True)
130
+ self._gamma = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
131
+ self._grow_policy = trial.suggest_categorical(
132
+ "grow_policy", ["depthwise", "lossguide"]
133
+ )
134
+ else:
135
+ self._sample_type = trial.suggest_categorical(
136
+ "sample_type", ["uniform", "weighted"]
137
+ )
138
+ self._normalize_type = trial.suggest_categorical(
139
+ "normalize_type", ["tree", "forest"]
140
+ )
141
+ self._rate_drop = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
142
+ self._skip_drop = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
143
+ self._num_boost_rounds = trial.suggest_int("num_boost_rounds", 100, 10000)
144
+ self._early_stopping_rounds = trial.suggest_int(
145
+ "early_stopping_rounds", 50, 500
146
+ )
147
+ self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
148
+
149
+ def load(self, folder: str) -> None:
150
+ with open(
151
+ os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
152
+ ) as handle:
153
+ params = json.load(handle)
154
+ self._model_type = ModelType(params[_MODEL_TYPE_KEY])
155
+ self._best_iteration = params.get(_BEST_ITERATION_KEY)
156
+ bst = self._provide_xgboost()
157
+ bst.load_model(os.path.join(folder, _MODEL_FILENAME))
158
+
159
+ def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
160
+ bst = self._provide_xgboost()
161
+ bst.save_model(os.path.join(folder, _MODEL_FILENAME))
162
+ with open(
163
+ os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
164
+ ) as handle:
165
+ json.dump(
166
+ {
167
+ _MODEL_TYPE_KEY: str(self._model_type),
168
+ _BEST_ITERATION_KEY: self._best_iteration,
169
+ },
170
+ handle,
171
+ )
172
+ trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
173
+
174
+ def fit(
175
+ self,
176
+ df: pd.DataFrame,
177
+ y: pd.Series | pd.DataFrame | None = None,
178
+ w: pd.Series | None = None,
179
+ eval_x: pd.DataFrame | None = None,
180
+ eval_y: pd.Series | pd.DataFrame | None = None,
181
+ ) -> Self:
182
+ if y is None:
183
+ raise ValueError("y is null.")
184
+ self._model_type = determine_model_type(y)
185
+ xgboost = self._provide_xgboost()
186
+ df = _convert_categoricals(df)
187
+ evals = [(df, y)]
188
+ if eval_x is not None and eval_y is not None and self._best_iteration is None:
189
+ eval_x = _convert_categoricals(eval_x)
190
+ evals.append((eval_x, eval_y))
191
+ xgboost.fit( # type: ignore
192
+ df,
193
+ y,
194
+ eval_set=evals,
195
+ sample_weight=w,
196
+ verbose=False,
197
+ )
198
+ return self
199
+
200
+ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
201
+ x_df = _convert_categoricals(df)
202
+ xgboost = self._provide_xgboost()
203
+ pred = xgboost.predict(x_df)
204
+ df = pd.DataFrame(
205
+ index=df.index,
206
+ data={
207
+ PREDICTION_COLUMN: pred.flatten(),
208
+ },
209
+ )
210
+ if self._model_type != ModelType.REGRESSION:
211
+ proba = xgboost.predict_proba(x_df) # type: ignore
212
+ for i in range(proba.shape[1]):
213
+ df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
214
+ return df
215
+
216
+ def _provide_xgboost(self) -> XGBClassifier | XGBRegressor:
217
+ xgboost = self._xgboost
218
+ if xgboost is None:
219
+ callbacks: list[TrainingCallback] = [
220
+ XGBoostEpochsLogger(),
221
+ ]
222
+ if self._best_iteration is not None:
223
+ callbacks.append(
224
+ XGBoostEarlyStoppingCallback(rounds=self._early_stopping_rounds)
225
+ )
226
+ param = {
227
+ "objective": "binary:logistic",
228
+ "tree_method": "gpu_hist" if torch.cuda.is_available() else "exact",
229
+ # defines booster, gblinear for linear functions.
230
+ "booster": self._booster,
231
+ # L2 regularization weight.
232
+ "reg_lambda": self._lambda,
233
+ # L1 regularization weight.
234
+ "alpha": self._alpha,
235
+ # sampling ratio for training data.
236
+ "subsample": self._subsample,
237
+ # sampling according to each tree.
238
+ "colsample_bytree": self._colsample_bytree,
239
+ "n_estimators": self._best_iteration
240
+ if self._best_iteration is not None
241
+ else self._num_boost_rounds,
242
+ "base_score": 0.5,
243
+ "verbosity": 0,
244
+ "verbose": False,
245
+ "callbacks": callbacks,
246
+ "eval_metric": ["logloss", "error"],
247
+ }
248
+ if param["booster"] in ["gbtree", "dart"]:
249
+ # maximum depth of the tree, signifies complexity of the tree.
250
+ param["max_depth"] = self._max_depth
251
+ # minimum child weight, larger the term more conservative the tree.
252
+ param["min_child_weight"] = self._min_child_weight
253
+ param["eta"] = self._eta
254
+ # defines how selective algorithm is.
255
+ param["gamma"] = self._gamma
256
+ param["grow_policy"] = self._grow_policy
257
+
258
+ if param["booster"] == "dart":
259
+ param["sample_type"] = self._sample_type
260
+ param["normalize_type"] = self._normalize_type
261
+ param["rate_drop"] = self._rate_drop
262
+ param["skip_drop"] = self._skip_drop
263
+ match self._model_type:
264
+ case ModelType.BINARY:
265
+ xgboost = XGBClassifier(**param)
266
+ case ModelType.REGRESSION:
267
+ param["objective"] = "reg:squarederror"
268
+ param["eval_metric"] = ["rmse", "mae"]
269
+ xgboost = XGBRegressor(**param)
270
+ case ModelType.BINNED_BINARY:
271
+ xgboost = XGBClassifier(**param)
272
+ case ModelType.MULTI_CLASSIFICATION:
273
+ xgboost = XGBClassifier(**param)
274
+ self._xgboost = xgboost
275
+ if xgboost is None:
276
+ raise ValueError("xgboost is null")
277
+ return xgboost
@@ -41,5 +41,6 @@ class NonNumericReducer(Reducer):
41
41
  def transform(self, df: pd.DataFrame) -> pd.DataFrame:
42
42
  categorical_cols = df.select_dtypes(include="category").columns.tolist()
43
43
  numeric_cols = df.select_dtypes(include="number").columns.tolist()
44
- keep_cols = categorical_cols + numeric_cols
44
+ boolean_cols = df.select_dtypes(include="bool").columns.tolist()
45
+ keep_cols = categorical_cols + numeric_cols + boolean_cols
45
46
  return df[keep_cols]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.39
3
+ Version: 0.0.40
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -24,10 +24,8 @@ Requires-Dist: mapie>=0.9.2
24
24
  Requires-Dist: pytz>=2025.1
25
25
  Requires-Dist: torch>=2.6.0
26
26
  Requires-Dist: tabpfn>=2.0.6
27
- Requires-Dist: tabpfn-extensions>=0.0.4
28
- Requires-Dist: shap>=0.47.2
29
- Requires-Dist: hyperopt>=0.2.7
30
27
  Requires-Dist: pytest-is-running>=1.5.1
28
+ Requires-Dist: xgboost>=3.0.0
31
29
 
32
30
  # wavetrainer
33
31
 
@@ -58,10 +56,8 @@ Python 3.11.6:
58
56
  - [pytz](https://pythonhosted.org/pytz/)
59
57
  - [torch](https://pytorch.org/)
60
58
  - [tabpfn](https://github.com/PriorLabs/TabPFN)
61
- - [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
62
- - [shap](https://shap.readthedocs.io/en/latest/)
63
- - [hyperopt](https://hyperopt.github.io/hyperopt/)
64
59
  - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
60
+ - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
65
61
 
66
62
  ## Raison D'être :thought_balloon:
67
63
 
@@ -26,13 +26,19 @@ wavetrainer/calibrator/calibrator_router.py
26
26
  wavetrainer/calibrator/mapie_calibrator.py
27
27
  wavetrainer/calibrator/vennabers_calibrator.py
28
28
  wavetrainer/model/__init__.py
29
- wavetrainer/model/catboost_classifier_wrap.py
30
- wavetrainer/model/catboost_kwargs.py
31
- wavetrainer/model/catboost_model.py
32
- wavetrainer/model/catboost_regressor_wrap.py
33
29
  wavetrainer/model/model.py
34
30
  wavetrainer/model/model_router.py
35
- wavetrainer/model/tabpfn_model.py
31
+ wavetrainer/model/catboost/__init__.py
32
+ wavetrainer/model/catboost/catboost_classifier_wrap.py
33
+ wavetrainer/model/catboost/catboost_kwargs.py
34
+ wavetrainer/model/catboost/catboost_model.py
35
+ wavetrainer/model/catboost/catboost_regressor_wrap.py
36
+ wavetrainer/model/tabpfn/__init__.py
37
+ wavetrainer/model/tabpfn/tabpfn_model.py
38
+ wavetrainer/model/xgboost/__init__.py
39
+ wavetrainer/model/xgboost/early_stopper.py
40
+ wavetrainer/model/xgboost/xgboost_logger.py
41
+ wavetrainer/model/xgboost/xgboost_model.py
36
42
  wavetrainer/reducer/__init__.py
37
43
  wavetrainer/reducer/base_selector_reducer.py
38
44
  wavetrainer/reducer/combined_reducer.py
@@ -11,7 +11,5 @@ mapie>=0.9.2
11
11
  pytz>=2025.1
12
12
  torch>=2.6.0
13
13
  tabpfn>=2.0.6
14
- tabpfn-extensions>=0.0.4
15
- shap>=0.47.2
16
- hyperopt>=0.2.7
17
14
  pytest-is-running>=1.5.1
15
+ xgboost>=3.0.0
File without changes
File without changes
File without changes