wavetrainer 0.1.10__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {wavetrainer-0.1.10/wavetrainer.egg-info → wavetrainer-0.1.12}/PKG-INFO +5 -1
  2. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/README.md +2 -0
  3. wavetrainer-0.1.10/wavetrainer.egg-info/requires.txt → wavetrainer-0.1.12/requirements.txt +2 -0
  4. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/setup.py +1 -1
  5. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/__init__.py +1 -1
  6. wavetrainer-0.1.12/wavetrainer/model/lightgbm/__init__.py +1 -0
  7. wavetrainer-0.1.12/wavetrainer/model/lightgbm/lightgbm_model.py +245 -0
  8. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/model_router.py +2 -0
  9. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/xgboost_model.py +1 -1
  10. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/combined_reducer.py +1 -2
  11. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/trainer.py +69 -12
  12. {wavetrainer-0.1.10 → wavetrainer-0.1.12/wavetrainer.egg-info}/PKG-INFO +5 -1
  13. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/SOURCES.txt +2 -0
  14. wavetrainer-0.1.10/requirements.txt → wavetrainer-0.1.12/wavetrainer.egg-info/requires.txt +3 -1
  15. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/LICENSE +0 -0
  16. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/MANIFEST.in +0 -0
  17. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/setup.cfg +0 -0
  18. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/tests/__init__.py +0 -0
  19. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/tests/model/__init__.py +0 -0
  20. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/tests/model/catboost_kwargs_test.py +0 -0
  21. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/tests/trainer_test.py +0 -0
  22. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/calibrator/__init__.py +0 -0
  23. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/calibrator/calibrator.py +0 -0
  24. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/calibrator/calibrator_router.py +0 -0
  25. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  26. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/create.py +0 -0
  27. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/exceptions.py +0 -0
  28. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/fit.py +0 -0
  29. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/__init__.py +0 -0
  30. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/__init__.py +0 -0
  31. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
  32. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
  33. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/catboost_model.py +0 -0
  34. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
  35. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/model.py +0 -0
  36. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/tabpfn/__init__.py +0 -0
  37. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
  38. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/__init__.py +0 -0
  39. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/early_stopper.py +0 -0
  40. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
  41. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model_type.py +0 -0
  42. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/params.py +0 -0
  43. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/__init__.py +0 -0
  44. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  45. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/constant_reducer.py +0 -0
  46. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/correlation_reducer.py +0 -0
  47. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  48. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  49. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
  50. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/pca_reducer.py +0 -0
  51. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/reducer.py +0 -0
  52. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
  53. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  54. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/unseen_reducer.py +0 -0
  55. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/selector/__init__.py +0 -0
  56. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/selector/selector.py +0 -0
  57. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/__init__.py +0 -0
  58. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/class_weights.py +0 -0
  59. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/combined_weights.py +0 -0
  60. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/exponential_weights.py +0 -0
  61. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/linear_weights.py +0 -0
  62. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/noop_weights.py +0 -0
  63. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/sigmoid_weights.py +0 -0
  64. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/weights.py +0 -0
  65. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/weights/weights_router.py +0 -0
  66. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/windower/__init__.py +0 -0
  67. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/windower/windower.py +0 -0
  68. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/dependency_links.txt +0 -0
  69. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/not-zip-safe +0 -0
  70. {wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/top_level.txt +0 -0
{wavetrainer-0.1.10/wavetrainer.egg-info → wavetrainer-0.1.12}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: wavetrainer
- Version: 0.1.10
+ Version: 0.1.12
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
  Home-page: https://github.com/8W9aG/wavetrainer
  Author: Will Sackfield
@@ -29,6 +29,8 @@ Requires-Dist: jax>=0.6.1
  Requires-Dist: tabpfn_extensions>=0.0.4
  Requires-Dist: hyperopt>=0.2.7
  Requires-Dist: pycaleva>=0.8.2
+ Requires-Dist: lightgbm>=4.6.0
+ Requires-Dist: kaleido>=0.2.1
 
  # wavetrainer
 
@@ -64,6 +66,8 @@ Python 3.11.6:
  - [tabpfn_extensions](https://github.com/PriorLabs/tabpfn-extensions)
  - [hyperopt](https://github.com/hyperopt/hyperopt)
  - [pycaleva](https://github.com/MartinWeigl/pycaleva)
+ - [lightgbm](https://github.com/microsoft/LightGBM)
+ - [kaleido](https://github.com/plotly/Kaleido)
 
  ## Raison D'être :thought_balloon:
 
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/README.md
@@ -32,6 +32,8 @@ Python 3.11.6:
  - [tabpfn_extensions](https://github.com/PriorLabs/tabpfn-extensions)
  - [hyperopt](https://github.com/hyperopt/hyperopt)
  - [pycaleva](https://github.com/MartinWeigl/pycaleva)
+ - [lightgbm](https://github.com/microsoft/LightGBM)
+ - [kaleido](https://github.com/plotly/Kaleido)
 
  ## Raison D'être :thought_balloon:
 
wavetrainer-0.1.10/wavetrainer.egg-info/requires.txt → wavetrainer-0.1.12/requirements.txt
@@ -16,3 +16,5 @@ jax>=0.6.1
  tabpfn_extensions>=0.0.4
  hyperopt>=0.2.7
  pycaleva>=0.8.2
+ lightgbm>=4.6.0
+ kaleido>=0.2.1
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/setup.py
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
  setup(
      name='wavetrainer',
-     version='0.1.10',
+     version='0.1.12',
      description='A library for automatically finding the optimal model within feature and hyperparameter space.',
      long_description=long_description,
      long_description_content_type='text/markdown',
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/__init__.py
@@ -2,5 +2,5 @@
 
  from .create import create
 
- __VERSION__ = "0.1.10"
+ __VERSION__ = "0.1.12"
  __all__ = ("create",)
wavetrainer-0.1.12/wavetrainer/model/lightgbm/__init__.py
@@ -0,0 +1 @@
+ """The wavetrain lightgbm model module."""
wavetrainer-0.1.12/wavetrainer/model/lightgbm/lightgbm_model.py
@@ -0,0 +1,245 @@
+ """A model that wraps lightgbm."""
+
+ # pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments,too-many-instance-attributes
+ import json
+ import os
+ from typing import Self
+
+ import joblib  # type: ignore
+ import lightgbm as lgb
+ import optuna
+ import pandas as pd
+ import torch
+
+ from ...exceptions import WavetrainException
+ from ...model_type import ModelType, determine_model_type
+ from ..model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
+
+ _BOOSTING_TYPE_KEY = "gbm_boosting_type"
+ _NUM_LEAVES_KEY = "gbm_num_leaves"
+ _MIN_CHILD_SAMPLES_KEY = "gbm_min_child_samples"
+ _MODEL_PARAMS_FILENAME = "model_params.json"
+ _MODEL_FILENAME = "model.pkl"
+ _BEST_ITERATION_KEY = "best_iteration"
+ _EARLY_STOPPING_ROUNDS_KEY = "gbm_early_stopping_rounds"
+ _ITERATIONS_KEY = "gbm_iterations"
+
+
+ class LightGBMModel(Model):
+     """A class that uses lightgbm as a model."""
+
+     _gbm: lgb.LGBMModel | None
+     _boosting_type: str | None
+     _num_leaves: int | None
+     _min_child_samples: int | None
+     _model_type: None | ModelType
+     _best_iteration: None | int
+     _early_stopping_rounds: None | int
+     _iterations: None | int
+
+     @classmethod
+     def name(cls) -> str:
+         return "lightgbm"
+
+     @classmethod
+     def supports_x(cls, df: pd.DataFrame) -> bool:
+         return True
+
+     def __init__(self) -> None:
+         super().__init__()
+         self._gbm = None
+         self._boosting_type = None
+         self._num_leaves = None
+         self._min_child_samples = None
+         self._model_type = None
+         self._best_iteration = None
+         self._early_stopping_rounds = None
+         self._iterations = None
+
+     @property
+     def supports_importances(self) -> bool:
+         return True
+
+     @property
+     def feature_importances(self) -> dict[str, float]:
+         gbm = self._provide_gbm()
+         importances = gbm.feature_importances_
+         names = gbm.feature_name_
+         total_importances = sum(importances)
+         return {
+             names[count]: importance / total_importances
+             for count, importance in enumerate(importances)
+         }
+
+     def provide_estimator(self):
+         return self._provide_gbm()
+
+     def create_estimator(self):
+         return self._create_gbm()
+
+     def reset(self):
+         self._gbm = None
+         self._best_iteration = None
+
+     def convert_df(self, df: pd.DataFrame) -> pd.DataFrame:
+         return df
+
+     def set_options(
+         self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
+     ) -> None:
+         self._boosting_type = trial.suggest_categorical(
+             _BOOSTING_TYPE_KEY, ["gbdt", "dart", "rf"]
+         )
+         self._num_leaves = trial.suggest_int(_NUM_LEAVES_KEY, 2, 256)
+         self._min_child_samples = trial.suggest_int(_MIN_CHILD_SAMPLES_KEY, 5, 100)
+         self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
+         self._early_stopping_rounds = trial.suggest_int(
+             _EARLY_STOPPING_ROUNDS_KEY, 10, 500
+         )
+         self._iterations = trial.suggest_int(_ITERATIONS_KEY, 100, 10000)
+
+     def load(self, folder: str) -> None:
+         with open(
+             os.path.join(folder, _MODEL_PARAMS_FILENAME), encoding="utf8"
+         ) as handle:
+             params = json.load(handle)
+             self._boosting_type = params[_BOOSTING_TYPE_KEY]
+             self._num_leaves = params[_NUM_LEAVES_KEY]
+             self._min_child_samples = params[_MIN_CHILD_SAMPLES_KEY]
+             self._best_iteration = params.get(_BEST_ITERATION_KEY)
+             self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS_KEY]
+             self._iterations = params[_ITERATIONS_KEY]
+         self._gbm = joblib.load(os.path.join(folder, _MODEL_FILENAME))
+
+     def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
+         with open(
+             os.path.join(folder, _MODEL_PARAMS_FILENAME), "w", encoding="utf8"
+         ) as handle:
+             json.dump(
+                 {
+                     _BOOSTING_TYPE_KEY: self._boosting_type,
+                     _NUM_LEAVES_KEY: self._num_leaves,
+                     _MIN_CHILD_SAMPLES_KEY: self._min_child_samples,
+                     _BEST_ITERATION_KEY: self._best_iteration,
+                     _EARLY_STOPPING_ROUNDS_KEY: self._early_stopping_rounds,
+                     _ITERATIONS_KEY: self._iterations,
+                 },
+                 handle,
+             )
+         gbm = self._provide_gbm()
+         joblib.dump(gbm, os.path.join(folder, _MODEL_FILENAME))
+         trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
+
+     def fit(
+         self,
+         df: pd.DataFrame,
+         y: pd.Series | pd.DataFrame | None = None,
+         w: pd.Series | None = None,
+         eval_x: pd.DataFrame | None = None,
+         eval_y: pd.Series | pd.DataFrame | None = None,
+     ) -> Self:
+         if y is None:
+             raise ValueError("y is null.")
+         self._model_type = determine_model_type(y)
+         gbm = self._provide_gbm()
+         early_stopping_rounds = self._early_stopping_rounds
+         if early_stopping_rounds is None:
+             raise ValueError("early_stopping_rounds is null")
+
+         eval_set = None
+         callbacks = []
+         if eval_x is not None and eval_y is not None:
+             eval_set = [(eval_x, eval_y.to_numpy().flatten())]  # type: ignore
+             callbacks = [
+                 lgb.early_stopping(stopping_rounds=early_stopping_rounds),
+             ]
+         if self._best_iteration is not None:
+             eval_set = None
+             callbacks = []
+         try:
+             gbm.fit(
+                 X=df,
+                 y=y.to_numpy().flatten(),
+                 sample_weight=w,
+                 eval_set=eval_set,  # type: ignore
+                 callbacks=callbacks,  # type: ignore
+             )
+         except lgb.basic.LightGBMError as exc:
+             raise WavetrainException() from exc
+         self._best_iteration = gbm.best_iteration_
+         return self
+
+     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+         gbm = self._provide_gbm()
+         pred = gbm.predict(df)
+         pred_df = pd.DataFrame(
+             index=df.index,
+             data={
+                 PREDICTION_COLUMN: pred.flatten(),  # type: ignore
+             },
+         )
+         if self._model_type != ModelType.REGRESSION:
+             proba = gbm.predict_proba(df)  # type: ignore
+             for i in range(proba.shape[1]):
+                 pred_df[f"{PROBABILITY_COLUMN_PREFIX}{i}"] = proba[:, i]
+         return pred_df
+
+     def _provide_gbm(self) -> lgb.LGBMModel:
+         gbm = self._gbm
+         if gbm is None:
+             gbm = self._create_gbm()
+             self._gbm = gbm
+         if gbm is None:
+             raise ValueError("gbm is null")
+         return gbm
+
+     def _create_gbm(self) -> lgb.LGBMModel:
+         best_iteration = self._best_iteration
+         iterations = best_iteration if best_iteration is not None else self._iterations
+         boosting_type = self._boosting_type
+         if boosting_type is None:
+             raise ValueError("boosting_type is null")
+         num_leaves = self._num_leaves
+         if num_leaves is None:
+             raise ValueError("num_leaves is null")
+         min_child_samples = self._min_child_samples
+         if min_child_samples is None:
+             raise ValueError("min_child_samples is null")
+
+         match self._model_type:
+             case ModelType.BINARY:
+                 return lgb.LGBMClassifier(
+                     boosting_type=boosting_type,
+                     num_leaves=num_leaves,
+                     objective="binary",
+                     min_child_samples=min_child_samples,
+                     num_iterations=iterations,
+                     device="gpu" if torch.cuda.is_available() else None,
+                 )
+             case ModelType.REGRESSION:
+                 return lgb.LGBMRegressor(
+                     boosting_type=boosting_type,
+                     num_leaves=num_leaves,
+                     min_child_samples=min_child_samples,
+                     num_iterations=iterations,
+                     device="gpu" if torch.cuda.is_available() else None,
+                 )
+             case ModelType.BINNED_BINARY:
+                 return lgb.LGBMClassifier(
+                     boosting_type=boosting_type,
+                     num_leaves=num_leaves,
+                     objective="binary",
+                     min_child_samples=min_child_samples,
+                     num_iterations=iterations,
+                     device="gpu" if torch.cuda.is_available() else None,
+                 )
+             case ModelType.MULTI_CLASSIFICATION:
+                 return lgb.LGBMClassifier(
+                     boosting_type=boosting_type,
+                     num_leaves=num_leaves,
+                     min_child_samples=min_child_samples,
+                     num_iterations=iterations,
+                     device="gpu" if torch.cuda.is_available() else None,
+                 )
+             case _:
+                 raise ValueError(f"Unrecognised model type: {self._model_type}")
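The new wrapper follows the same Optuna-driven pattern as the existing CatBoost and XGBoost models: set_options samples the gbm_* hyperparameters from a trial, _create_gbm builds the estimator, and LightGBM errors are surfaced as WavetrainException. Below is a minimal sketch of that loop outside wavetrainer; the toy data and study setup are illustrative assumptions, not part of the package:

    import lightgbm as lgb
    import numpy as np
    import optuna
    import pandas as pd

    # Toy binary-classification frame (illustrative only).
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(500, 5)), columns=[f"f{i}" for i in range(5)])
    y = pd.Series((X["f0"] + rng.normal(size=500) > 0).astype(int))

    def objective(trial: optuna.Trial) -> float:
        # Mirrors LightGBMModel.set_options: same keys and search ranges.
        clf = lgb.LGBMClassifier(
            boosting_type=trial.suggest_categorical("gbm_boosting_type", ["gbdt", "dart", "rf"]),
            num_leaves=trial.suggest_int("gbm_num_leaves", 2, 256),
            min_child_samples=trial.suggest_int("gbm_min_child_samples", 5, 100),
            objective="binary",
        )
        try:
            clf.fit(X[:400], y[:400])
        except lgb.basic.LightGBMError:
            # e.g. "rf" boosting without bagging parameters; the wrapper
            # converts this into a WavetrainException and the trial scores badly.
            return 0.0
        return float(clf.score(X[400:], y[400:]))

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10)
    print(study.best_params)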
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/model_router.py
@@ -11,6 +11,7 @@ from sklearn.metrics import accuracy_score  # type: ignore
 
  from ..model_type import ModelType, determine_model_type
  from .catboost.catboost_model import CatboostModel
+ from .lightgbm.lightgbm_model import LightGBMModel
  from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
  from .tabpfn.tabpfn_model import TabPFNModel
  from .xgboost.xgboost_model import XGBoostModel
@@ -22,6 +23,7 @@ _MODELS = {
      CatboostModel.name(): CatboostModel,
      TabPFNModel.name(): TabPFNModel,
      XGBoostModel.name(): XGBoostModel,
+     LightGBMModel.name(): LightGBMModel,
  }
 
 
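Registering the new model is just the import plus one _MODELS entry, because the router dispatches by each class's name(). A stripped-down sketch of that registry pattern, with simplified stand-in classes rather than the real Model subclasses:

    # Simplified stand-ins; the real classes subclass wavetrainer's Model.
    class CatboostModel:
        @classmethod
        def name(cls) -> str:
            return "catboost"

    class LightGBMModel:
        @classmethod
        def name(cls) -> str:
            return "lightgbm"

    _MODELS = {
        CatboostModel.name(): CatboostModel,
        LightGBMModel.name(): LightGBMModel,
    }

    def create_model(name: str):
        # The real router also checks supports_x() against the dataframe.
        return _MODELS[name]()

    print(type(create_model("lightgbm")).__name__)  # LightGBMModel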
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/model/xgboost/xgboost_model.py
@@ -53,7 +53,7 @@ def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
      output_df = input_df.copy()
      for col in input_df.select_dtypes(include=["category"]).columns:
          output_df[col] = output_df[col].cat.codes
-     return output_df
+     return output_df.replace([np.inf, -np.inf], np.nan)
 
 
  class XGBoostModel(Model):
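The one-line change sanitizes the frame after categorical codes are assigned: XGBoost handles NaN natively as a missing value, while raw infinities make it reject the input. A small self-contained demonstration of the same replace call, on a toy frame assumed for illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "cat": pd.Categorical(["a", "b", None]),
            "x": [1.0, np.inf, -np.inf],
        }
    )

    out = df.copy()
    for col in df.select_dtypes(include=["category"]).columns:
        out[col] = out[col].cat.codes  # missing category becomes -1

    # Same call as the patched _convert_categoricals: map +/-inf to NaN,
    # which XGBoost treats as an ordinary missing value.
    out = out.replace([np.inf, -np.inf], np.nan)
    print(out)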
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/reducer/combined_reducer.py
@@ -2,7 +2,6 @@
 
  # pylint: disable=line-too-long
  import json
- import logging
  import os
  import time
  from typing import Self
@@ -129,6 +128,6 @@ class CombinedReducer(Reducer):
              try:
                  df = reducer.transform(df)
              except ValueError as exc:
-                 logging.warning("Failed to reduce %s", reducer.name())
+                 print("Failed to reduce %s", reducer.name())
                  raise exc
          return df
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer/trainer.py
@@ -1,5 +1,6 @@
  """The trainer class."""
 
+ # pylint: disable=line-too-long
  import datetime
  import functools
  import json
@@ -12,12 +13,14 @@ from typing import Self
  import optuna
  import pandas as pd
  import tqdm
- from sklearn.metrics import f1_score, r2_score  # type: ignore
+ from sklearn.metrics import f1_score  # type: ignore
+ from sklearn.metrics import (accuracy_score, brier_score_loss, log_loss,
+                              precision_score, r2_score, recall_score)
 
  from .calibrator.calibrator_router import CalibratorRouter
  from .exceptions import WavetrainException
  from .fit import Fit
- from .model.model import PREDICTION_COLUMN
+ from .model.model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX
  from .model.model_router import ModelRouter
  from .model_type import ModelType, determine_model_type
  from .reducer.combined_reducer import CombinedReducer
@@ -48,6 +51,11 @@ def _assign_bin(timestamp, bins: list[datetime.datetime]) -> int:
      return len(bins) - 2  # Assign to last bin if at the end
 
 
+ def _best_trial(study: optuna.Study) -> optuna.trial.FrozenTrial:
+     best_brier = min(study.best_trials, key=lambda t: t.values[1])
+     return best_brier
+
+
  class Trainer(Fit):
      """A class for training and predicting from an array of data."""
 
@@ -170,7 +178,10 @@
              storage=storage_name,
              load_if_exists=True,
              sampler=restored_sampler,
-             direction=optuna.study.StudyDirection.MAXIMIZE,
+             directions=[
+                 optuna.study.StudyDirection.MAXIMIZE,
+                 optuna.study.StudyDirection.MINIMIZE,
+             ],
          )
 
      def fit(
@@ -210,7 +221,7 @@
              save: bool,
              split_idx: datetime.datetime,
              no_evaluation: bool,
-         ) -> float:
+         ) -> tuple[float, float]:
              print(f"Beginning trial for: {split_idx.isoformat()}")
              trial.set_user_attr(_IDX_USR_ATTR_KEY, split_idx.isoformat())
              folder = os.path.join(
@@ -246,7 +257,7 @@
                      if new_folder:
                          os.removedirs(folder)
                      logging.warning("Y train only contains 1 unique datapoint.")
-                     return _BAD_OUTPUT
+                     return _BAD_OUTPUT, -_BAD_OUTPUT
                  print(f"Windowing took {time.time() - start_windower}")
 
                  # Perform common reductions
@@ -311,10 +322,29 @@
                  )
                  cal_pred[PREDICTION_COLUMN] = y_pred[PREDICTION_COLUMN]
                  output = 0.0
+                 loss = 0.0
                  if determine_model_type(y_series) == ModelType.REGRESSION:
                      output = float(r2_score(y_test, y_pred[[PREDICTION_COLUMN]]))
+                     print(f"R2: {output}")
                  else:
                      output = float(f1_score(y_test, y_pred[[PREDICTION_COLUMN]]))
+                     print(f"F1: {output}")
+                     prob_col = PROBABILITY_COLUMN_PREFIX + str(1)
+                     if prob_col in y_pred.columns.values.tolist():
+                         loss = float(brier_score_loss(y_test, y_pred[[prob_col]]))
+                         print(f"Brier: {loss}")
+                         print(
+                             f"Log Loss: {float(log_loss(y_test.astype(float), y_pred[[prob_col]]))}"
+                         )
+                     print(
+                         f"Accuracy: {float(accuracy_score(y_test, y_pred[[PREDICTION_COLUMN]]))}"
+                     )
+                     print(
+                         f"Precision: {float(precision_score(y_test, y_pred[[PREDICTION_COLUMN]]))}"
+                     )
+                     print(
+                         f"Recall: {float(recall_score(y_test, y_pred[[PREDICTION_COLUMN]]))}"
+                     )
 
                  if save:
                      windower.save(folder, trial)
@@ -332,13 +362,13 @@
                          handle,
                      )
 
-                 return output
+                 return output, loss
              except WavetrainException as exc:
                  print(str(exc))
                  logging.warning(str(exc))
                  if new_folder:
                      os.removedirs(folder)
-                 return _BAD_OUTPUT
+                 return _BAD_OUTPUT, -_BAD_OUTPUT
 
          start_validation_index = (
              dt_index.to_list()[-int(len(dt_index) * self._validation_size) - 1]
@@ -359,7 +389,7 @@
              ].to_list()[0]
          )
 
-         def test_objective(trial: optuna.Trial) -> float:
+         def test_objective(trial: optuna.Trial) -> tuple[float, float]:
              return _fit(
                  trial,
                  test_df,
@@ -382,7 +412,8 @@
              else self._max_train_timeout.total_seconds(),
          )
          while (
-             study.best_trial.value is None or study.best_trial.value == _BAD_OUTPUT
+             _best_trial(study).values is None
+             or _best_trial(study).values == (_BAD_OUTPUT, -_BAD_OUTPUT)
          ) and len(study.trials) < 1000:
              logging.info("Performing extra train")
              study.optimize(
@@ -420,7 +451,7 @@
              if found:
                  last_processed_dt = test_dt
                  _fit(
-                     study.best_trial,
+                     _best_trial(study),
                      test_df.copy(),
                      test_series,
                      True,
@@ -441,7 +472,7 @@
 
          def validate_objctive(
              trial: optuna.Trial, idx: datetime.datetime, series: pd.Series
-         ) -> float:
+         ) -> tuple[float, float]:
              return _fit(trial, test_df.copy(), series, False, idx, False)
 
          study.optimize(
@@ -457,10 +488,36 @@
                  break
 
              _fit(
-                 study.best_trial, test_df.copy(), test_series, True, test_idx, True
+                 _best_trial(study),
+                 test_df.copy(),
+                 test_series,
+                 True,
+                 test_idx,
+                 True,
              )
              last_processed_dt = test_idx
 
+             target_names = ["F1", "Brier"]
+             fig = optuna.visualization.plot_pareto_front(
+                 study, target_names=target_names
+             )
+             fig.write_image(
+                 os.path.join(column_dir, "pareto_frontier.png"),
+                 format="png",
+                 width=800,
+                 height=600,
+             )
+             for target_name in target_names:
+                 fig = optuna.visualization.plot_param_importances(
+                     study, target=lambda t: t.values[0], target_name=target_name
+                 )
+                 fig.write_image(
+                     os.path.join(column_dir, f"{target_name}_frontier.png"),
+                     format="png",
+                     width=800,
+                     height=600,
+                 )
+
          if isinstance(y, pd.Series):
              _fit_column(y)
          else:
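With directions=[MAXIMIZE, MINIMIZE] the study becomes multi-objective (F1 up, Brier loss down), so study.best_trial is no longer defined and the new _best_trial helper picks the lowest-Brier trial off the Pareto front. A minimal sketch of the same selection and kaleido-backed export, using a stand-in objective rather than wavetrainer's _fit:

    import optuna

    def objective(trial: optuna.Trial) -> tuple[float, float]:
        x = trial.suggest_float("x", 0.0, 1.0)
        # Stand-in for (F1 to maximize, Brier loss to minimize).
        return x, (x - 0.5) ** 2

    study = optuna.create_study(
        directions=[
            optuna.study.StudyDirection.MAXIMIZE,
            optuna.study.StudyDirection.MINIMIZE,
        ]
    )
    study.optimize(objective, n_trials=30)

    # Same rule as _best_trial: among the Pareto-optimal trials,
    # take the minimum of the second objective.
    best = min(study.best_trials, key=lambda t: t.values[1])
    print(best.params, best.values)

    # plot_pareto_front returns a plotly figure; write_image needs kaleido,
    # which is why 0.1.12 adds it as a dependency.
    fig = optuna.visualization.plot_pareto_front(study, target_names=["F1", "Brier"])
    fig.write_image("pareto_frontier.png", format="png", width=800, height=600)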
{wavetrainer-0.1.10 → wavetrainer-0.1.12/wavetrainer.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: wavetrainer
- Version: 0.1.10
+ Version: 0.1.12
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
  Home-page: https://github.com/8W9aG/wavetrainer
  Author: Will Sackfield
@@ -29,6 +29,8 @@ Requires-Dist: jax>=0.6.1
  Requires-Dist: tabpfn_extensions>=0.0.4
  Requires-Dist: hyperopt>=0.2.7
  Requires-Dist: pycaleva>=0.8.2
+ Requires-Dist: lightgbm>=4.6.0
+ Requires-Dist: kaleido>=0.2.1
 
  # wavetrainer
 
@@ -64,6 +66,8 @@ Python 3.11.6:
  - [tabpfn_extensions](https://github.com/PriorLabs/tabpfn-extensions)
  - [hyperopt](https://github.com/hyperopt/hyperopt)
  - [pycaleva](https://github.com/MartinWeigl/pycaleva)
+ - [lightgbm](https://github.com/microsoft/LightGBM)
+ - [kaleido](https://github.com/plotly/Kaleido)
 
  ## Raison D'être :thought_balloon:
 
{wavetrainer-0.1.10 → wavetrainer-0.1.12}/wavetrainer.egg-info/SOURCES.txt
@@ -32,6 +32,8 @@ wavetrainer/model/catboost/catboost_classifier_wrap.py
  wavetrainer/model/catboost/catboost_kwargs.py
  wavetrainer/model/catboost/catboost_model.py
  wavetrainer/model/catboost/catboost_regressor_wrap.py
+ wavetrainer/model/lightgbm/__init__.py
+ wavetrainer/model/lightgbm/lightgbm_model.py
  wavetrainer/model/tabpfn/__init__.py
  wavetrainer/model/tabpfn/tabpfn_model.py
  wavetrainer/model/xgboost/__init__.py
wavetrainer-0.1.10/requirements.txt → wavetrainer-0.1.12/wavetrainer.egg-info/requires.txt
@@ -15,4 +15,6 @@ xgboost>=3.0.0
  jax>=0.6.1
  tabpfn_extensions>=0.0.4
  hyperopt>=0.2.7
- pycaleva>=0.8.2
+ pycaleva>=0.8.2
+ lightgbm>=4.6.0
+ kaleido>=0.2.1