wavetrainer 0.0.38__tar.gz → 0.0.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {wavetrainer-0.0.38/wavetrainer.egg-info → wavetrainer-0.0.40}/PKG-INFO +3 -7
  2. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/README.md +1 -3
  3. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/requirements.txt +2 -4
  4. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/setup.py +1 -1
  5. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/tests/model/catboost_kwargs_test.py +1 -1
  6. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/tests/trainer_test.py +1 -1
  7. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/__init__.py +1 -1
  8. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/calibrator_router.py +3 -1
  9. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/vennabers_calibrator.py +6 -1
  10. wavetrainer-0.0.40/wavetrainer/model/catboost/__init__.py +1 -0
  11. {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_model.py +5 -4
  12. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/model/model_router.py +4 -2
  13. wavetrainer-0.0.40/wavetrainer/model/tabpfn/__init__.py +1 -0
  14. {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/tabpfn}/tabpfn_model.py +3 -3
  15. wavetrainer-0.0.40/wavetrainer/model/xgboost/__init__.py +1 -0
  16. wavetrainer-0.0.40/wavetrainer/model/xgboost/early_stopper.py +16 -0
  17. wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_logger.py +23 -0
  18. wavetrainer-0.0.40/wavetrainer/model/xgboost/xgboost_model.py +277 -0
  19. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/combined_reducer.py +6 -1
  20. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/nonnumeric_reducer.py +2 -1
  21. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +6 -3
  22. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/trainer.py +17 -3
  23. {wavetrainer-0.0.38 → wavetrainer-0.0.40/wavetrainer.egg-info}/PKG-INFO +3 -7
  24. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/SOURCES.txt +11 -5
  25. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/requires.txt +1 -3
  26. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/LICENSE +0 -0
  27. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/MANIFEST.in +0 -0
  28. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/setup.cfg +0 -0
  29. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/tests/__init__.py +0 -0
  30. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/tests/model/__init__.py +0 -0
  31. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/__init__.py +0 -0
  32. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/calibrator.py +0 -0
  33. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
  34. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/create.py +0 -0
  35. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/exceptions.py +0 -0
  36. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/fit.py +0 -0
  37. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/model/__init__.py +0 -0
  38. {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_classifier_wrap.py +0 -0
  39. {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_kwargs.py +0 -0
  40. {wavetrainer-0.0.38/wavetrainer/model → wavetrainer-0.0.40/wavetrainer/model/catboost}/catboost_regressor_wrap.py +0 -0
  41. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/model/model.py +0 -0
  42. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/model_type.py +0 -0
  43. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/params.py +0 -0
  44. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/__init__.py +0 -0
  45. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  46. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/constant_reducer.py +0 -0
  47. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/correlation_reducer.py +0 -0
  48. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  49. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  50. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/reducer.py +0 -0
  51. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  52. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/reducer/unseen_reducer.py +0 -0
  53. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/selector/__init__.py +0 -0
  54. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/selector/selector.py +0 -0
  55. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/__init__.py +0 -0
  56. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/class_weights.py +0 -0
  57. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/combined_weights.py +0 -0
  58. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/exponential_weights.py +0 -0
  59. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/linear_weights.py +0 -0
  60. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/noop_weights.py +0 -0
  61. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/sigmoid_weights.py +0 -0
  62. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/weights.py +0 -0
  63. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/weights/weights_router.py +0 -0
  64. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/windower/__init__.py +0 -0
  65. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer/windower/windower.py +0 -0
  66. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/dependency_links.txt +0 -0
  67. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/not-zip-safe +0 -0
  68. {wavetrainer-0.0.38 → wavetrainer-0.0.40}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: wavetrainer
- Version: 0.0.38
+ Version: 0.0.40
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
  Home-page: https://github.com/8W9aG/wavetrainer
  Author: Will Sackfield
@@ -24,10 +24,8 @@ Requires-Dist: mapie>=0.9.2
  Requires-Dist: pytz>=2025.1
  Requires-Dist: torch>=2.6.0
  Requires-Dist: tabpfn>=2.0.6
- Requires-Dist: tabpfn-extensions>=0.0.4
- Requires-Dist: shap>=0.47.2
- Requires-Dist: hyperopt>=0.2.7
  Requires-Dist: pytest-is-running>=1.5.1
+ Requires-Dist: xgboost>=3.0.0

  # wavetrainer

@@ -58,10 +56,8 @@ Python 3.11.6:
  - [pytz](https://pythonhosted.org/pytz/)
  - [torch](https://pytorch.org/)
  - [tabpfn](https://github.com/PriorLabs/TabPFN)
- - [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
- - [shap](https://shap.readthedocs.io/en/latest/)
- - [hyperopt](https://hyperopt.github.io/hyperopt/)
  - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
+ - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)

  ## Raison D'être :thought_balloon:

@@ -27,10 +27,8 @@ Python 3.11.6:
  - [pytz](https://pythonhosted.org/pytz/)
  - [torch](https://pytorch.org/)
  - [tabpfn](https://github.com/PriorLabs/TabPFN)
- - [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
- - [shap](https://shap.readthedocs.io/en/latest/)
- - [hyperopt](https://hyperopt.github.io/hyperopt/)
  - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
+ - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)

  ## Raison D'être :thought_balloon:

@@ -11,7 +11,5 @@ mapie>=0.9.2
  pytz>=2025.1
  torch>=2.6.0
  tabpfn>=2.0.6
- tabpfn-extensions>=0.0.4
- shap>=0.47.2
- hyperopt>=0.2.7
- pytest-is-running>=1.5.1
+ pytest-is-running>=1.5.1
+ xgboost>=3.0.0
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:

  setup(
      name='wavetrainer',
-     version='0.0.38',
+     version='0.0.40',
      description='A library for automatically finding the optimal model within feature and hyperparameter space.',
      long_description=long_description,
      long_description_content_type='text/markdown',
@@ -3,7 +3,7 @@ import unittest

  import pandas as pd

- from wavetrainer.model.catboost_kwargs import handle_fit_kwargs
+ from wavetrainer.model.catboost.catboost_kwargs import handle_fit_kwargs


  class TestCatboostKwargs(unittest.TestCase):
@@ -13,7 +13,7 @@ class TestTrainer(unittest.TestCase):

      def test_trainer(self):
          with tempfile.TemporaryDirectory() as tmpdir:
-             trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=1)
+             trainer = Trainer(tmpdir, walkforward_timedelta=datetime.timedelta(days=7), trials=5)
              x_data = [i for i in range(101)]
              x_index = [datetime.datetime(2022, 1, 1) + datetime.timedelta(days=i) for i in range(len(x_data))]
              df = pd.DataFrame(
@@ -2,5 +2,5 @@

  from .create import create

- __VERSION__ = "0.0.38"
+ __VERSION__ = "0.0.40"
  __all__ = ("create",)
@@ -78,7 +78,9 @@ class CalibratorRouter(Calibrator):
      ) -> Self:
          # pylint: disable=no-else-return
          calibrator: Calibrator | None = None
-         if determine_model_type(df) == ModelType.REGRESSION:
+         if y is None:
+             raise ValueError("y is null")
+         if determine_model_type(y) == ModelType.REGRESSION:
              calibrator = MAPIECalibrator(self._model)
          else:
              calibrator = VennabersCalibrator(self._model)
@@ -1,5 +1,6 @@
  """A calibrator that implements venn abers."""

+ import logging
  import os
  from typing import Self

@@ -54,7 +55,11 @@ class VennabersCalibrator(Calibrator):
          prob_columns = [
              x for x in df.columns.values if x.startswith(PROBABILITY_COLUMN_PREFIX)
          ]
-         vennabers.fit(df[prob_columns].to_numpy(), y.to_numpy())
+         try:
+             vennabers.fit(df[prob_columns].to_numpy(), y.to_numpy())
+         except IndexError:
+             logging.error(df)
+             raise
          return self

      def transform(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -0,0 +1 @@
+ """The wavetrain catboost model module."""
@@ -10,12 +10,12 @@ import pandas as pd
  import torch
  from catboost import CatBoost, Pool  # type: ignore

- from ..model_type import ModelType, determine_model_type
+ from ...model_type import ModelType, determine_model_type
+ from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
  from .catboost_classifier_wrap import CatBoostClassifierWrapper
  from .catboost_kwargs import (CAT_FEATURES_ARG_KEY, EVAL_SET_ARG_KEY,
                                ORIGINAL_X_ARG_KEY)
  from .catboost_regressor_wrap import CatBoostRegressorWrapper
- from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model

  _MODEL_FILENAME = "model.cbm"
  _MODEL_PARAMS_FILENAME = "model_params.json"
@@ -148,7 +148,7 @@ class CatboostModel(Model):
          )
          catboost = self._provide_catboost()
          catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
-         trial.user_attrs[_BEST_ITERATION_KEY] = self._best_iteration
+         trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)

      def fit(
          self,
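
Note on the change above: in Optuna, `Trial.user_attrs` is a read-only view assembled from storage, so item assignment on it is not persisted; `set_user_attr` is the supported write path. A minimal sketch of the corrected pattern (standalone, with an illustrative objective that is not part of this package):

    import optuna

    def objective(trial: optuna.Trial) -> float:
        # Persisted to study storage, unlike `trial.user_attrs[...] = ...`.
        trial.set_user_attr("best_iteration", 123)
        return 0.0

    study = optuna.create_study()
    study.optimize(objective, n_trials=1)
    print(study.trials[0].user_attrs["best_iteration"])  # 123
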
@@ -219,9 +219,10 @@ class CatboostModel(Model):
              best_iteration if best_iteration is not None else self._iterations
          )
          logging.info(
-             "Creating catboost model with depth %d, boosting type %s",
+             "Creating catboost model with depth %d, boosting type %s, best iteration %d",
              self._depth,
              self._boosting_type,
+             -1 if best_iteration is None else best_iteration,
          )
          match self._model_type:
              case ModelType.BINARY:
@@ -7,15 +7,17 @@ from typing import Any, Self
  import optuna
  import pandas as pd

- from .catboost_model import CatboostModel
+ from .catboost.catboost_model import CatboostModel
  from .model import Model
- from .tabpfn_model import TabPFNModel
+ from .tabpfn.tabpfn_model import TabPFNModel
+ from .xgboost.xgboost_model import XGBoostModel

  _MODEL_ROUTER_FILE = "model_router.json"
  _MODEL_KEY = "model"
  _MODELS = {
      CatboostModel.name(): CatboostModel,
      TabPFNModel.name(): TabPFNModel,
+     XGBoostModel.name(): XGBoostModel,
  }

@@ -0,0 +1 @@
+ """The wavetrain tabpfn model module."""
@@ -14,9 +14,9 @@ import torch
  from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import (  # type: ignore
      AutoTabPFNClassifier, AutoTabPFNRegressor)

- from ..exceptions import WavetrainException
- from ..model_type import ModelType, determine_model_type
- from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
+ from ...exceptions import WavetrainException
+ from ...model_type import ModelType, determine_model_type
+ from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model

  _MODEL_FILENAME = "model.pkl"
  _MODEL_PARAMS_FILENAME = "model_params.json"
@@ -0,0 +1 @@
+ """The wavetrain xgboost model module."""
@@ -0,0 +1,16 @@
+ """A callback function for early stopping."""
+
+ from typing import Any
+
+ from xgboost.callback import EarlyStopping, TrainingCallback
+
+
+ class XGBoostEarlyStoppingCallback(EarlyStopping):
+     """A callback for early stopping in XGBoost models."""
+
+     def after_iteration(
+         self, model: Any, epoch: int, evals_log: TrainingCallback.EvalsLog
+     ) -> bool:
+         if len(evals_log.keys()) < 1:
+             return False
+         return super().after_iteration(model, epoch, evals_log)
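
The override above guards XGBoost's stock `EarlyStopping` callback, whose `after_iteration` raises when `evals_log` is empty, i.e. when no validation set was supplied to `fit`. A minimal usage sketch, assuming xgboost>=3.0.0 and wavetrainer 0.0.40 are installed; the data is synthetic and illustrative:

    import numpy as np
    from xgboost import XGBClassifier

    from wavetrainer.model.xgboost.early_stopper import XGBoostEarlyStoppingCallback

    X = np.random.rand(200, 4)
    y = (X[:, 0] > 0.5).astype(int)

    # No eval_set is passed, so evals_log stays empty; the guard returns False
    # ("keep training") instead of letting EarlyStopping raise.
    clf = XGBClassifier(
        n_estimators=20,
        callbacks=[XGBoostEarlyStoppingCallback(rounds=5)],
    )
    clf.fit(X, y)
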
@@ -0,0 +1,23 @@
+ """An XGBoost callback class for logging epochs."""
+
+ from typing import Any
+
+ from xgboost.callback import TrainingCallback
+
+
+ class XGBoostEpochsLogger(TrainingCallback):
+     """Log the epochs in XGBoost."""
+
+     def after_iteration(
+         self, model: Any, epoch: int, evals_log: TrainingCallback.EvalsLog
+     ) -> bool:
+         if epoch % 100 != 0:
+             return False
+         log_items = []
+         for dataset, metrics in evals_log.items():
+             for metric_name, values in metrics.items():
+                 current_val = values[-1]
+                 log_items.append(f"{dataset}-{metric_name}: {current_val:.5f}")
+
+         print(f"XGBoost: [{epoch}] " + " | ".join(log_items))
+         return False
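
A `TrainingCallback.after_iteration` that returns `True` stops training, so this logger, which always returns `False`, only prints every 100th round and never interrupts the run. A hedged sketch combining both new callbacks (synthetic data, illustrative names, not package API):

    import numpy as np
    from xgboost import XGBRegressor

    from wavetrainer.model.xgboost.early_stopper import XGBoostEarlyStoppingCallback
    from wavetrainer.model.xgboost.xgboost_logger import XGBoostEpochsLogger

    X = np.random.rand(300, 5)
    y = X @ np.array([0.5, -0.2, 0.1, 0.0, 0.3])

    reg = XGBRegressor(
        n_estimators=500,
        eval_metric=["rmse"],
        callbacks=[XGBoostEpochsLogger(), XGBoostEarlyStoppingCallback(rounds=50)],
    )
    # eval_set populates evals_log, so the logger prints and early stopping engages.
    reg.fit(X[:250], y[:250], eval_set=[(X[250:], y[250:])], verbose=False)
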
@@ -0,0 +1,277 @@
+ """A model that wraps xgboost."""
+ # pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments,too-many-instance-attributes
+
+ import json
+ import os
+ from typing import Any, Self
+
+ import optuna
+ import pandas as pd
+ import torch
+ from xgboost import XGBClassifier, XGBRegressor
+ from xgboost.callback import TrainingCallback
+
+ from ...model_type import ModelType, determine_model_type
+ from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
+ from .early_stopper import XGBoostEarlyStoppingCallback
+ from .xgboost_logger import XGBoostEpochsLogger
+
+ _MODEL_FILENAME = "xgboost_model.json"
+ _MODEL_PARAMS_FILENAME = "xgboost_model_params.json"
+ _MODEL_TYPE_KEY = "model_type"
+ _BEST_ITERATION_KEY = "best_iteration"
+
+
+ def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
+     output_df = input_df.copy()
+     for col in input_df.select_dtypes(include=["category"]).columns:
+         output_df[col] = output_df[col].cat.codes
+     return output_df
+
+
+ class XGBoostModel(Model):
+     """A class that uses XGBoost as a model."""
+
+     _xgboost: XGBRegressor | XGBClassifier | None
+     _model_type: None | ModelType
+     _booster: str | None
+     _lambda: float | None
+     _alpha: float | None
+     _subsample: float | None
+     _colsample_bytree: float | None
+     _max_depth: int | None
+     _min_child_weight: int | None
+     _eta: float | None
+     _gamma: float | None
+     _grow_policy: str | None
+     _sample_type: str | None
+     _normalize_type: str | None
+     _rate_drop: float | None
+     _skip_drop: float | None
+     _num_boost_rounds: int | None
+     _early_stopping_rounds: int | None
+     _best_iteration: int | None
+
+     @classmethod
+     def name(cls) -> str:
+         return "xgboost"
+
+     @classmethod
+     def supports_x(cls, df: pd.DataFrame) -> bool:
+         return True
+
+     def __init__(self) -> None:
+         super().__init__()
+         self._xgboost = None
+         self._model_type = None
+         self._booster = None
+         self._lambda = None
+         self._alpha = None
+         self._subsample = None
+         self._colsample_bytree = None
+         self._max_depth = None
+         self._min_child_weight = None
+         self._eta = None
+         self._gamma = None
+         self._grow_policy = None
+         self._sample_type = None
+         self._normalize_type = None
+         self._rate_drop = None
+         self._skip_drop = None
+         self._num_boost_rounds = None
+         self._early_stopping_rounds = None
+         self._best_iteration = None
+
+     @property
+     def estimator(self) -> Any:
+         return self._provide_xgboost()
+
+     @property
+     def supports_importances(self) -> bool:
+         return True
+
+     @property
+     def feature_importances(self) -> dict[str, float]:
+         bst = self._provide_xgboost()
+         return bst.get_score(importance_type="weight")  # type: ignore
+
+     def pre_fit(
+         self,
+         df: pd.DataFrame,
+         y: pd.Series | pd.DataFrame | None,
+         eval_x: pd.DataFrame | None = None,
+         eval_y: pd.Series | pd.DataFrame | None = None,
+         w: pd.Series | None = None,
+     ):
+         if y is None:
+             raise ValueError("y is null.")
+         self._model_type = determine_model_type(y)
+         return {
+             "eval_set": (eval_x, eval_y),
+             "sample_weight": w,
+         }
+
+     def set_options(
+         self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
+     ) -> None:
+         self._booster = trial.suggest_categorical(
+             "booster", ["gbtree", "gblinear", "dart"]
+         )
+         self._lambda = trial.suggest_float("lambda", 1e-8, 1.0, log=True)
+         self._alpha = trial.suggest_float("alpha", 1e-8, 1.0, log=True)
+         self._subsample = trial.suggest_float("subsample", 0.2, 1.0)
+         self._colsample_bytree = trial.suggest_float("colsample_bytree", 0.2, 1.0)
+         if self._booster in ["gbtree", "dart"]:
+             self._max_depth = trial.suggest_int("max_depth", 3, 9, step=2)
+             self._min_child_weight = trial.suggest_int(
+                 "min_child_weight", 2, 10, log=True
+             )
+             self._eta = trial.suggest_float("eta", 1e-8, 1.0, log=True)
+             self._gamma = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
+             self._grow_policy = trial.suggest_categorical(
+                 "grow_policy", ["depthwise", "lossguide"]
+             )
+         else:
+             self._sample_type = trial.suggest_categorical(
+                 "sample_type", ["uniform", "weighted"]
+             )
+             self._normalize_type = trial.suggest_categorical(
+                 "normalize_type", ["tree", "forest"]
+             )
+             self._rate_drop = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
+             self._skip_drop = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
+         self._num_boost_rounds = trial.suggest_int("num_boost_rounds", 100, 10000)
+         self._early_stopping_rounds = trial.suggest_int(
+             "early_stopping_rounds", 50, 500
+         )
+         self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
+
+     def load(self, folder: str) -> None:
+         with open(
+             os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
+         ) as handle:
+             params = json.load(handle)
+             self._model_type = ModelType(params[_MODEL_TYPE_KEY])
+             self._best_iteration = params.get(_BEST_ITERATION_KEY)
+         bst = self._provide_xgboost()
+         bst.load_model(os.path.join(folder, _MODEL_FILENAME))
+
+     def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
+         bst = self._provide_xgboost()
+         bst.save_model(os.path.join(folder, _MODEL_FILENAME))
+         with open(
+             os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
+         ) as handle:
+             json.dump(
+                 {
+                     _MODEL_TYPE_KEY: str(self._model_type),
+                     _BEST_ITERATION_KEY: self._best_iteration,
+                 },
+                 handle,
+             )
+         trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
+
+     def fit(
+         self,
+         df: pd.DataFrame,
+         y: pd.Series | pd.DataFrame | None = None,
+         w: pd.Series | None = None,
+         eval_x: pd.DataFrame | None = None,
+         eval_y: pd.Series | pd.DataFrame | None = None,
+     ) -> Self:
+         if y is None:
+             raise ValueError("y is null.")
+         self._model_type = determine_model_type(y)
+         xgboost = self._provide_xgboost()
+         df = _convert_categoricals(df)
+         evals = [(df, y)]
+         if eval_x is not None and eval_y is not None and self._best_iteration is None:
+             eval_x = _convert_categoricals(eval_x)
+             evals.append((eval_x, eval_y))
+         xgboost.fit(  # type: ignore
+             df,
+             y,
+             eval_set=evals,
+             sample_weight=w,
+             verbose=False,
+         )
+         return self
+
+     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+         x_df = _convert_categoricals(df)
+         xgboost = self._provide_xgboost()
+         pred = xgboost.predict(x_df)
+         df = pd.DataFrame(
+             index=df.index,
+             data={
+                 PREDICTION_COLUMN: pred.flatten(),
+             },
+         )
+         if self._model_type != ModelType.REGRESSION:
+             proba = xgboost.predict_proba(x_df)  # type: ignore
+             for i in range(proba.shape[1]):
+                 df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
+         return df
+
+     def _provide_xgboost(self) -> XGBClassifier | XGBRegressor:
+         xgboost = self._xgboost
+         if xgboost is None:
+             callbacks: list[TrainingCallback] = [
+                 XGBoostEpochsLogger(),
+             ]
+             if self._best_iteration is not None:
+                 callbacks.append(
+                     XGBoostEarlyStoppingCallback(rounds=self._early_stopping_rounds)
+                 )
+             param = {
+                 "objective": "binary:logistic",
+                 "tree_method": "gpu_hist" if torch.cuda.is_available() else "exact",
+                 # defines booster, gblinear for linear functions.
+                 "booster": self._booster,
+                 # L2 regularization weight.
+                 "reg_lambda": self._lambda,
+                 # L1 regularization weight.
+                 "alpha": self._alpha,
+                 # sampling ratio for training data.
+                 "subsample": self._subsample,
+                 # sampling according to each tree.
+                 "colsample_bytree": self._colsample_bytree,
+                 "n_estimators": self._best_iteration
+                 if self._best_iteration is not None
+                 else self._num_boost_rounds,
+                 "base_score": 0.5,
+                 "verbosity": 0,
+                 "verbose": False,
+                 "callbacks": callbacks,
+                 "eval_metric": ["logloss", "error"],
+             }
+             if param["booster"] in ["gbtree", "dart"]:
+                 # maximum depth of the tree, signifies complexity of the tree.
+                 param["max_depth"] = self._max_depth
+                 # minimum child weight, larger the term more conservative the tree.
+                 param["min_child_weight"] = self._min_child_weight
+                 param["eta"] = self._eta
+                 # defines how selective algorithm is.
+                 param["gamma"] = self._gamma
+                 param["grow_policy"] = self._grow_policy
+
+             if param["booster"] == "dart":
+                 param["sample_type"] = self._sample_type
+                 param["normalize_type"] = self._normalize_type
+                 param["rate_drop"] = self._rate_drop
+                 param["skip_drop"] = self._skip_drop
+             match self._model_type:
+                 case ModelType.BINARY:
+                     xgboost = XGBClassifier(**param)
+                 case ModelType.REGRESSION:
+                     param["objective"] = "reg:squarederror"
+                     param["eval_metric"] = ["rmse", "mae"]
+                     xgboost = XGBRegressor(**param)
+                 case ModelType.BINNED_BINARY:
+                     xgboost = XGBClassifier(**param)
+                 case ModelType.MULTI_CLASSIFICATION:
+                     xgboost = XGBClassifier(**param)
+             self._xgboost = xgboost
+         if xgboost is None:
+             raise ValueError("xgboost is null")
+         return xgboost
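
`set_options` above follows the conditional search-space pattern familiar from Optuna's XGBoost examples: the booster is sampled first, and tree-specific parameters are suggested only when a tree booster is in play, so `gblinear` trials do not carry dead dimensions. A standalone sketch of the same shape (the placeholder score and all names here are illustrative, not package API):

    import optuna

    def objective(trial: optuna.Trial) -> float:
        booster = trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"])
        params = {
            "booster": booster,
            "reg_lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        }
        if booster in ("gbtree", "dart"):
            # Only meaningful for tree boosters, so only suggested here.
            params["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
            params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # ...train an XGBoost model with `params` and return its validation score...
        return 0.0  # placeholder score for the sketch

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=5)
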
@@ -3,6 +3,7 @@
  import json
  import logging
  import os
+ import time
  from typing import Self

  import optuna
@@ -37,7 +38,7 @@ class CombinedReducer(Reducer):
              DuplicateReducer(),
              CorrelationReducer(),
              SmartCorrelationReducer(),
-             SelectBySingleFeaturePerformanceReducer(),
+             # SelectBySingleFeaturePerformanceReducer(),
          ]
          self._folder = None

@@ -99,12 +100,16 @@ class CombinedReducer(Reducer):
      ) -> Self:
          removed_columns_dict = {}
          for reducer in self._reducers:
+             start_reducer = time.time()
              before_columns = set(df.columns.values)
              df = reducer.fit_transform(df, y=y)
              after_columns = set(df.columns.values)
              removed_columns = before_columns.difference(after_columns)
              if removed_columns:
                  removed_columns_dict[reducer.name()] = list(removed_columns)
+             logging.info(
+                 "%s reducer took %f", reducer.name(), time.time() - start_reducer
+             )
          if self._folder is not None:
              with open(
                  os.path.join(self._folder, _REMOVED_COLUMNS_FILE), encoding="utf8"
@@ -41,5 +41,6 @@ class NonNumericReducer(Reducer):
      def transform(self, df: pd.DataFrame) -> pd.DataFrame:
          categorical_cols = df.select_dtypes(include="category").columns.tolist()
          numeric_cols = df.select_dtypes(include="number").columns.tolist()
-         keep_cols = categorical_cols + numeric_cols
+         boolean_cols = df.select_dtypes(include="bool").columns.tolist()
+         keep_cols = categorical_cols + numeric_cols + boolean_cols
          return df[keep_cols]
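
The fix above matters because pandas' `select_dtypes(include="number")` does not match boolean columns, so before this change the reducer silently dropped them. A quick demonstration of the dtype behaviour it relies on:

    import pandas as pd

    df = pd.DataFrame({
        "a": [1.0, 2.0],                  # numeric
        "b": [True, False],               # bool: not matched by include="number"
        "c": pd.Categorical(["x", "y"]),  # category
        "d": ["u", "v"],                  # object: still dropped by the reducer
    })
    print(df.select_dtypes(include="number").columns.tolist())  # ['a']
    print(df.select_dtypes(include="bool").columns.tolist())    # ['b']
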
@@ -1,5 +1,6 @@
  """A reducer that removes features by their single performance via further heuristics."""

+ # pylint: disable=too-many-arguments,too-many-positional-arguments
  from typing import Self

  import optuna
@@ -7,8 +8,8 @@ import pandas as pd
  from feature_engine.selection import SelectBySingleFeaturePerformance
  from sklearn.ensemble import RandomForestClassifier  # type: ignore

- from .base_selector_reducer import BaseSelectorReducer
  from ..model_type import ModelType, determine_model_type
+ from .base_selector_reducer import BaseSelectorReducer

  _SINGLE_FEATURE_PERFORMANCE_REDUCER_FILENAME = (
      "single_feature_performance_reducer.joblib"
@@ -23,7 +24,7 @@ class SelectBySingleFeaturePerformanceReducer(BaseSelectorReducer):

      def __init__(self) -> None:
          self._singlefeatureperformance_selector = SelectBySingleFeaturePerformance(
-             RandomForestClassifier(random_state=42), scoring="accuracy"
+             RandomForestClassifier(random_state=42, n_jobs=-1), scoring="accuracy", cv=1
          )
          super().__init__(
              self._singlefeatureperformance_selector,
@@ -53,5 +54,7 @@ class SelectBySingleFeaturePerformanceReducer(BaseSelectorReducer):
          eval_x: pd.DataFrame | None = None,
          eval_y: pd.Series | pd.DataFrame | None = None,
      ) -> Self:
-         self._singlefeatureperformance_selector.scoring = "r2" if determine_model_type(y) == ModelType.REGRESSION else "accuracy"
+         self._singlefeatureperformance_selector.scoring = (
+             "r2" if determine_model_type(y) == ModelType.REGRESSION else "accuracy"  # type: ignore
+         )
          return super().fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
@@ -6,6 +6,7 @@ import json
  import logging
  import os
  import pickle
+ import time
  from typing import Self

  import optuna
@@ -231,6 +232,7 @@

          try:
              # Window the data
+             start_windower = time.time()
              windower = Windower(self._dt_column)
              windower.set_options(trial, x)
              x_train = windower.fit_transform(x_train)
@@ -240,25 +242,31 @@
                  os.removedirs(folder)
                  logging.warning("Y train only contains 1 unique datapoint.")
                  return _BAD_OUTPUT
+             logging.info("Windowing took %f", time.time() - start_windower)

              # Perform common reductions
+             start_reducer = time.time()
              reducer = CombinedReducer()
              reducer.set_options(trial, x)
              x_train = reducer.fit_transform(x_train, y=y_train)
              x_test = reducer.transform(x_test)
+             logging.info("Reducing took %f", time.time() - start_reducer)

              # Calculate the row weights
+             start_row_weights = time.time()
              weights = CombinedWeights()
              weights.set_options(trial, x)
              w = weights.fit(x_train, y=y_train).transform(y_train.to_frame())[
                  WEIGHTS_COLUMN
              ]
+             logging.info("Row weights took %f", time.time() - start_row_weights)

              # Create model
              model = ModelRouter()
              model.set_options(trial, x)

              # Train
+             start_train = time.time()
              selector = Selector(model)
              selector.set_options(trial, x)
              selector.fit(x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test)
@@ -267,11 +275,14 @@
              x_pred = model.fit_transform(
                  x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test
              )
+             logging.info("Training took %f", time.time() - start_train)

              # Calibrate
+             start_calibrate = time.time()
              calibrator = CalibratorRouter(model)
              calibrator.set_options(trial, x)
              calibrator.fit(x_pred, y=y_train)
+             logging.info("Calibrating took %f", time.time() - start_calibrate)

              # Output
              y_pred = model.transform(x_test)
@@ -521,8 +532,11 @@
              date_path = os.path.join(column_path, date_str)
              if not os.path.isdir(date_path):
                  continue
-             model = ModelRouter()
-             model.load(date_path)
-             feature_importances[date_str] = model.feature_importances
+             try:
+                 model = ModelRouter()
+                 model.load(date_path)
+                 feature_importances[date_str] = model.feature_importances
+             except FileNotFoundError as exc:
+                 logging.warning(str(exc))

          return feature_importances
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: wavetrainer
- Version: 0.0.38
+ Version: 0.0.40
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
  Home-page: https://github.com/8W9aG/wavetrainer
  Author: Will Sackfield
@@ -24,10 +24,8 @@ Requires-Dist: mapie>=0.9.2
  Requires-Dist: pytz>=2025.1
  Requires-Dist: torch>=2.6.0
  Requires-Dist: tabpfn>=2.0.6
- Requires-Dist: tabpfn-extensions>=0.0.4
- Requires-Dist: shap>=0.47.2
- Requires-Dist: hyperopt>=0.2.7
  Requires-Dist: pytest-is-running>=1.5.1
+ Requires-Dist: xgboost>=3.0.0

  # wavetrainer

@@ -58,10 +56,8 @@ Python 3.11.6:
  - [pytz](https://pythonhosted.org/pytz/)
  - [torch](https://pytorch.org/)
  - [tabpfn](https://github.com/PriorLabs/TabPFN)
- - [tabpfn-extensions](https://github.com/PriorLabs/tabpfn-extensions)
- - [shap](https://shap.readthedocs.io/en/latest/)
- - [hyperopt](https://hyperopt.github.io/hyperopt/)
  - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
+ - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)

  ## Raison D'être :thought_balloon:

@@ -26,13 +26,19 @@ wavetrainer/calibrator/calibrator_router.py
  wavetrainer/calibrator/mapie_calibrator.py
  wavetrainer/calibrator/vennabers_calibrator.py
  wavetrainer/model/__init__.py
- wavetrainer/model/catboost_classifier_wrap.py
- wavetrainer/model/catboost_kwargs.py
- wavetrainer/model/catboost_model.py
- wavetrainer/model/catboost_regressor_wrap.py
  wavetrainer/model/model.py
  wavetrainer/model/model_router.py
- wavetrainer/model/tabpfn_model.py
+ wavetrainer/model/catboost/__init__.py
+ wavetrainer/model/catboost/catboost_classifier_wrap.py
+ wavetrainer/model/catboost/catboost_kwargs.py
+ wavetrainer/model/catboost/catboost_model.py
+ wavetrainer/model/catboost/catboost_regressor_wrap.py
+ wavetrainer/model/tabpfn/__init__.py
+ wavetrainer/model/tabpfn/tabpfn_model.py
+ wavetrainer/model/xgboost/__init__.py
+ wavetrainer/model/xgboost/early_stopper.py
+ wavetrainer/model/xgboost/xgboost_logger.py
+ wavetrainer/model/xgboost/xgboost_model.py
  wavetrainer/reducer/__init__.py
  wavetrainer/reducer/base_selector_reducer.py
  wavetrainer/reducer/combined_reducer.py
@@ -11,7 +11,5 @@ mapie>=0.9.2
  pytz>=2025.1
  torch>=2.6.0
  tabpfn>=2.0.6
- tabpfn-extensions>=0.0.4
- shap>=0.47.2
- hyperopt>=0.2.7
  pytest-is-running>=1.5.1
+ xgboost>=3.0.0