wavetrainer 0.0.49__tar.gz → 0.0.50__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.49/wavetrainer.egg-info → wavetrainer-0.0.50}/PKG-INFO +3 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/README.md +1 -0
- wavetrainer-0.0.49/wavetrainer.egg-info/requires.txt → wavetrainer-0.0.50/requirements.txt +1 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/setup.py +1 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_model.py +59 -1
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/xgboost_model.py +69 -0
- wavetrainer-0.0.50/wavetrainer/reducer/correlation_reducer.py +112 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50/wavetrainer.egg-info}/PKG-INFO +3 -1
- wavetrainer-0.0.49/requirements.txt → wavetrainer-0.0.50/wavetrainer.egg-info/requires.txt +2 -1
- wavetrainer-0.0.49/wavetrainer/reducer/correlation_reducer.py +0 -52
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/LICENSE +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/MANIFEST.in +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/setup.cfg +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/tests/trainer_test.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/model.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/model_router.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/tabpfn/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/early_stopper.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/combined_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/pca_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/selector/selector.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/trainer.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/SOURCES.txt +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer.egg-info/top_level.txt +0 -0
{wavetrainer-0.0.49/wavetrainer.egg-info → wavetrainer-0.0.50}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.0.49
+Version: 0.0.50
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
@@ -26,6 +26,7 @@ Requires-Dist: torch>=2.6.0
 Requires-Dist: tabpfn>=2.0.6
 Requires-Dist: pytest-is-running>=1.5.1
 Requires-Dist: xgboost>=3.0.0
+Requires-Dist: jax>=0.6.1
 
 # wavetrainer
 
@@ -58,6 +59,7 @@ Python 3.11.6:
 - [tabpfn](https://github.com/PriorLabs/TabPFN)
 - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
 - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
+- [jax](https://github.com/jax-ml/jax)
 
 ## Raison D'être :thought_balloon:
 
{wavetrainer-0.0.49 → wavetrainer-0.0.50}/README.md

@@ -29,6 +29,7 @@ Python 3.11.6:
 - [tabpfn](https://github.com/PriorLabs/TabPFN)
 - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
 - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
+- [jax](https://github.com/jax-ml/jax)
 
 ## Raison D'être :thought_balloon:
 
{wavetrainer-0.0.49 → wavetrainer-0.0.50}/setup.py

@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
 setup(
     name='wavetrainer',
-    version='0.0.49',
+    version='0.0.50',
     description='A library for automatically finding the optimal model within feature and hyperparameter space.',
     long_description=long_description,
     long_description_content_type='text/markdown',
{wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/catboost/catboost_model.py

@@ -18,6 +18,7 @@ from .catboost_regressor_wrap import CatBoostRegressorWrapper
 
 _MODEL_FILENAME = "model.cbm"
 _MODEL_PARAMS_FILENAME = "model_params.json"
+_MODEL_CATEGORICAL_FEATURES_FILENAME = "catboost_categorical_features.json"
 _ITERATIONS_KEY = "iterations"
 _LEARNING_RATE_KEY = "learning_rate"
 _DEPTH_KEY = "depth"
@@ -26,6 +27,11 @@ _BOOSTING_TYPE_KEY = "boosting_type"
 _MODEL_TYPE_KEY = "model_type"
 _EARLY_STOPPING_ROUNDS = "early_stopping_rounds"
 _BEST_ITERATION_KEY = "best_iteration"
+_LOSS_FUNCTION_KEY = "loss_function"
+_DEFAULT_LOSS_FUNCTION = "default"
+_FOCALLOSS_LOSS_FUNCTION = "focalloss"
+_GAMMA_KEY = "focalloss_gamma"
+_ALPHA_KEY = "focalloss_alpha"
 
 
 class CatboostModel(Model):
@@ -42,6 +48,10 @@ class CatboostModel(Model):
     _model_type: None | ModelType
     _early_stopping_rounds: None | int
     _best_iteration: None | int
+    _categorical_features: dict[str, bool]
+    _loss_function: None | str
+    _gamma: None | float
+    _alpha: None | float
 
     @classmethod
     def name(cls) -> str:
@@ -62,6 +72,10 @@ class CatboostModel(Model):
         self._model_type = None
         self._early_stopping_rounds = None
         self._best_iteration = None
+        self._categorical_features = {}
+        self._loss_function = None
+        self._gamma = None
+        self._alpha = None
 
     @property
     def supports_importances(self) -> bool:
@@ -76,7 +90,10 @@ class CatboostModel(Model):
         feature_ids = importances["Feature Id"].to_list()  # type: ignore
         importances = importances["Importances"].to_list()  # type: ignore
         total = sum(importances)
-        return {
+        return {
+            feature_ids[x]: importances[x] / total if total != 0.0 else 0.0
+            for x in range(len(feature_ids))
+        }
 
     def provide_estimator(self):
         return self._provide_catboost()
@@ -105,6 +122,13 @@ class CatboostModel(Model):
         )
         self._early_stopping_rounds = trial.suggest_int(_EARLY_STOPPING_ROUNDS, 10, 500)
         self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
+        loss_function = trial.suggest_categorical(
+            _LOSS_FUNCTION_KEY, [_DEFAULT_LOSS_FUNCTION, _FOCALLOSS_LOSS_FUNCTION]
+        )
+        self._loss_function = loss_function
+        if loss_function == _FOCALLOSS_LOSS_FUNCTION:
+            self._gamma = trial.suggest_float(_GAMMA_KEY, 0.5, 5.0)
+            self._alpha = trial.suggest_float(_ALPHA_KEY, 0.05, 0.95)
 
     def load(self, folder: str) -> None:
         with open(
@@ -119,6 +143,13 @@ class CatboostModel(Model):
         self._model_type = ModelType(params[_MODEL_TYPE_KEY])
         self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS]
         self._best_iteration = params.get(_BEST_ITERATION_KEY)
+        self._loss_function = params.get(_LOSS_FUNCTION_KEY, _DEFAULT_LOSS_FUNCTION)
+        self._gamma = params.get(_GAMMA_KEY)
+        self._alpha = params.get(_ALPHA_KEY)
+        with open(
+            os.path.join(folder, _MODEL_CATEGORICAL_FEATURES_FILENAME), encoding="utf8"
+        ) as handle:
+            self._categorical_features = json.load(handle)
         catboost = self._provide_catboost()
         catboost.load_model(os.path.join(folder, _MODEL_FILENAME))
 
@@ -136,9 +167,18 @@ class CatboostModel(Model):
                 _MODEL_TYPE_KEY: str(self._model_type),
                 _EARLY_STOPPING_ROUNDS: self._early_stopping_rounds,
                 _BEST_ITERATION_KEY: self._best_iteration,
+                _LOSS_FUNCTION_KEY: self._loss_function,
+                _GAMMA_KEY: self._gamma,
+                _ALPHA_KEY: self._alpha,
             },
             handle,
         )
+        with open(
+            os.path.join(folder, _MODEL_CATEGORICAL_FEATURES_FILENAME),
+            "w",
+            encoding="utf8",
+        ) as handle:
+            json.dump(self._categorical_features, handle)
         catboost = self._provide_catboost()
         catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
         trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
@@ -155,6 +195,9 @@ class CatboostModel(Model):
             raise ValueError("y is null.")
         self._model_type = determine_model_type(y)
         catboost = self._provide_catboost()
+        self._categorical_features = {
+            x: True for x in df.select_dtypes(include="category").columns.tolist()
+        }
 
         train_pool = Pool(
             df,
@@ -184,6 +227,10 @@ class CatboostModel(Model):
         return self
 
     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        for categorical_feature_column in self._categorical_features.keys():
+            df[categorical_feature_column] = df[categorical_feature_column].astype(
+                "category"
+            )
         pred_pool = Pool(
             df,
             cat_features=df.select_dtypes(include="category").columns.tolist(),
@@ -217,6 +264,14 @@ class CatboostModel(Model):
         print(
             f"Creating catboost model with depth {self._depth}, boosting type {self._boosting_type}, best iteration {best_iteration}",
         )
+        loss_function = None
+        if (
+            self._loss_function == _FOCALLOSS_LOSS_FUNCTION
+            and self._alpha is not None
+            and self._gamma is not None
+            and self._model_type != ModelType.REGRESSION
+        ):
+            loss_function = f"Focal:focal_alpha={self._alpha};focal_gamma={self._gamma}"
         match self._model_type:
             case ModelType.BINARY:
                 return CatBoostClassifierWrapper(
@@ -229,6 +284,7 @@ class CatboostModel(Model):
                     metric_period=100,
                     task_type="GPU" if torch.cuda.is_available() else "CPU",
                     devices="0" if torch.cuda.is_available() else None,
+                    loss_function=loss_function,
                 )
             case ModelType.REGRESSION:
                 return CatBoostRegressorWrapper(
@@ -253,6 +309,7 @@ class CatboostModel(Model):
                     metric_period=100,
                     task_type="GPU" if torch.cuda.is_available() else "CPU",
                     devices="0" if torch.cuda.is_available() else None,
+                    loss_function=loss_function,
                 )
             case ModelType.MULTI_CLASSIFICATION:
                 return CatBoostClassifierWrapper(
@@ -265,6 +322,7 @@ class CatboostModel(Model):
                     metric_period=100,
                     task_type="GPU" if torch.cuda.is_available() else "CPU",
                     devices="0" if torch.cuda.is_available() else None,
+                    loss_function=loss_function,
                 )
             case _:
                 raise ValueError(f"Unrecognised model type: {self._model_type}")
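Note on the CatBoost change above: no custom objective callable is required, because CatBoost ships a built-in Focal loss that is selected with the parameter string the diff constructs ("Focal:focal_alpha=...;focal_gamma=..."). A minimal sketch of the resulting estimator configuration, with illustrative values standing in for the Optuna-tuned trial parameters:

    # Minimal sketch: select CatBoost's built-in focal loss via a parameter
    # string, as the diff above does. alpha/gamma are illustrative placeholders.
    from catboost import CatBoostClassifier

    alpha, gamma = 0.25, 2.0
    model = CatBoostClassifier(
        loss_function=f"Focal:focal_alpha={alpha};focal_gamma={gamma}",
        iterations=100,
        verbose=False,
    )
    # model.fit(X, y) would then optimize the focal objective instead of Logloss.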
{wavetrainer-0.0.49 → wavetrainer-0.0.50}/wavetrainer/model/xgboost/xgboost_model.py

@@ -5,10 +5,13 @@ import json
 import os
 from typing import Self
 
+import jax.numpy as jnp
+import numpy as np
 import optuna
 import pandas as pd
 import pytest_is_running
 import torch
+from jax import grad, hessian, vmap
 from xgboost import XGBClassifier, XGBRegressor
 from xgboost.callback import TrainingCallback
 from xgboost.core import XGBoostError
@@ -39,6 +42,11 @@ _RATE_DROP_KEY = "rate_drop"
 _SKIP_DROP_KEY = "skip_drop"
 _NUM_BOOST_ROUNDS_KEY = "num_boost_rounds"
 _EARLY_STOPPING_ROUNDS_KEY = "early_stopping_rounds"
+_LOSS_FUNCTION_KEY = "xgboost_loss_function"
+_DEFAULT_LOSS_FUNCTION = "default"
+_FOCALLOSS_LOSS_FUNCTION = "focalloss"
+_FOCALLOSS_GAMMA_KEY = "focalloss_gamma"
+_FOCALLOSS_ALPHA_KEY = "focalloss_alpha"
 
 
 def _convert_categoricals(input_df: pd.DataFrame) -> pd.DataFrame:
@@ -70,6 +78,9 @@ class XGBoostModel(Model):
     _num_boost_rounds: int | None
     _early_stopping_rounds: int | None
     _best_iteration: int | None
+    _focalloss_alpha: float | None
+    _focalloss_gamma: float | None
+    _loss_function: str | None
 
     @classmethod
     def name(cls) -> str:
@@ -100,6 +111,9 @@ class XGBoostModel(Model):
         self._num_boost_rounds = None
         self._early_stopping_rounds = None
         self._best_iteration = None
+        self._loss_function = None
+        self._focalloss_gamma = None
+        self._focalloss_alpha = None
 
     @property
     def supports_importances(self) -> bool:
@@ -167,6 +181,15 @@ class XGBoostModel(Model):
             _EARLY_STOPPING_ROUNDS_KEY, 50, 500
         )
         self._best_iteration = trial.user_attrs.get(_BEST_ITERATION_KEY)
+        loss_function = trial.suggest_categorical(
+            _LOSS_FUNCTION_KEY, [_DEFAULT_LOSS_FUNCTION, _FOCALLOSS_LOSS_FUNCTION]
+        )
+        self._loss_function = loss_function
+        if loss_function == _FOCALLOSS_LOSS_FUNCTION:
+            self._focalloss_gamma = trial.suggest_float(_FOCALLOSS_GAMMA_KEY, 0.5, 5.0)
+            self._focalloss_alpha = trial.suggest_float(
+                _FOCALLOSS_ALPHA_KEY, 0.05, 0.95
+            )
 
     def load(self, folder: str) -> None:
         with open(
@@ -191,6 +214,9 @@ class XGBoostModel(Model):
         self._num_boost_rounds = params[_NUM_BOOST_ROUNDS_KEY]
         self._early_stopping_rounds = params[_EARLY_STOPPING_ROUNDS_KEY]
         self._best_iteration = params.get(_BEST_ITERATION_KEY)
+        self._loss_function = params.get(_LOSS_FUNCTION_KEY, _DEFAULT_LOSS_FUNCTION)
+        self._focalloss_gamma = params.get(_FOCALLOSS_GAMMA_KEY)
+        self._focalloss_alpha = params.get(_FOCALLOSS_ALPHA_KEY)
         bst = self._provide_xgboost()
         bst.load_model(os.path.join(folder, _MODEL_FILENAME))
 
@@ -220,6 +246,9 @@ class XGBoostModel(Model):
                 _SKIP_DROP_KEY: self._skip_drop,
                 _NUM_BOOST_ROUNDS_KEY: self._num_boost_rounds,
                 _EARLY_STOPPING_ROUNDS_KEY: self._early_stopping_rounds,
+                _LOSS_FUNCTION_KEY: self._loss_function,
+                _FOCALLOSS_GAMMA_KEY: self._gamma,
+                _FOCALLOSS_ALPHA_KEY: self._alpha,
             },
             handle,
         )
@@ -328,6 +357,46 @@ class XGBoostModel(Model):
         param["normalize_type"] = self._normalize_type
         param["rate_drop"] = self._rate_drop
         param["skip_drop"] = self._skip_drop
+        if (
+            self._loss_function == _FOCALLOSS_LOSS_FUNCTION
+            and self._focalloss_alpha is not None
+            and self._focalloss_gamma is not None
+        ):
+
+            def focal_loss(alpha=0.25, gamma=2.0):
+                def fl(x, t):
+                    p = 1 / (1 + jnp.exp(-x))
+                    pt = t * p + (1 - t) * (1 - p)
+                    alpha_t = alpha * t + (1 - alpha) * (1 - t)
+                    return (
+                        -alpha_t * (1 - pt) ** gamma * jnp.log(jnp.clip(pt, 1e-8, 1.0))
+                    )
+
+                fl_grad = grad(fl)
+                fl_hess = hessian(fl)
+                grad_batch = vmap(fl_grad)
+                hess_batch = vmap(fl_hess)
+
+                def custom_loss(y_pred, y_true, sample_weight=None):
+                    y_true = jnp.array(y_true)
+                    y_pred = jnp.array(y_pred)
+
+                    grad_vals = grad_batch(y_pred, y_true)
+                    hess_vals = hess_batch(y_pred, y_true)
+
+                    if sample_weight is not None:
+                        sample_weight = jnp.array(sample_weight)
+                        grad_vals *= sample_weight
+                        hess_vals *= sample_weight
+
+                    # Convert to NumPy arrays for XGBoost compatibility
+                    return np.array(grad_vals), np.array(hess_vals)
+
+                return custom_loss
+
+            param["objective"] = focal_loss(
+                alpha=self._focalloss_alpha, gamma=self._focalloss_gamma
+            )
         print(
             f"Creating xgboost model with max_depth {self._max_depth}, best iteration {best_iteration}, booster: {self._booster}",
         )
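For reference, the JAX pattern the new XGBoost objective relies on can be exercised standalone: write the focal loss for a single (logit, label) pair, then let grad, hessian, and vmap derive the batched first and second derivatives that XGBoost's custom-objective API expects. A minimal sketch; the inputs and the alpha/gamma defaults are illustrative only:

    import jax.numpy as jnp
    import numpy as np
    from jax import grad, hessian, vmap

    def fl(x, t, alpha=0.25, gamma=2.0):
        p = 1 / (1 + jnp.exp(-x))                    # sigmoid of the raw margin
        pt = t * p + (1 - t) * (1 - p)               # probability of the true class
        alpha_t = alpha * t + (1 - alpha) * (1 - t)  # class-balancing weight
        return -alpha_t * (1 - pt) ** gamma * jnp.log(jnp.clip(pt, 1e-8, 1.0))

    grad_batch = vmap(grad(fl))     # d(loss)/d(logit), one value per sample
    hess_batch = vmap(hessian(fl))  # second derivative, one value per sample

    y_pred = jnp.array([0.3, -1.2, 2.0])  # raw margins
    y_true = jnp.array([1.0, 0.0, 1.0])
    # XGBoost custom objectives return (grad, hess) as NumPy arrays.
    print(np.array(grad_batch(y_pred, y_true)), np.array(hess_batch(y_pred, y_true)))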
wavetrainer-0.0.50/wavetrainer/reducer/correlation_reducer.py (new file)

@@ -0,0 +1,112 @@
+"""A reducer that removes correlation features."""
+
+# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate
+import json
+import os
+from typing import Self
+
+import numpy as np
+import optuna
+import pandas as pd
+
+from .non_categorical_numeric_columns import \
+    find_non_categorical_numeric_columns
+from .reducer import Reducer
+
+_CORRELATION_REDUCER_FILENAME = "correlation_reducer.json"
+_CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
+
+
+def _get_correlated_features_to_drop(
+    df: pd.DataFrame, threshold: float = 0.85, random_seed: int = 42
+) -> list[str]:
+    """
+    Identify highly correlated features to drop, keeping one per group.
+    NaNs are replaced with a single fixed junk value to allow correlation computation.
+    Columns are processed in sorted order to ensure deterministic output.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame.
+        threshold (float): Correlation threshold above which features are considered redundant.
+        random_seed (int): Seed used to generate the fixed junk value.
+
+    Returns:
+        List[str]: List of column names to drop.
+    """
+    np.random.seed(random_seed)
+
+    # Select and sort numeric columns
+    sorted_cols = sorted(find_non_categorical_numeric_columns(df))
+    df_numeric = df[sorted_cols].copy()
+
+    # Generate and apply a fixed junk value for NaNs
+    junk_value = np.random.uniform(-1e9, 1e9)
+    df_numeric = df_numeric.fillna(junk_value)
+
+    if df_numeric.shape[1] < 2:
+        return []
+
+    # Compute absolute correlation matrix
+    corr_matrix = np.corrcoef(df_numeric.values, rowvar=False)
+    abs_corr = np.abs(corr_matrix)
+
+    # Greedy feature drop based on sorted order
+    to_drop = set()
+    for i in range(len(sorted_cols)):
+        if sorted_cols[i] in to_drop:
+            continue
+        for j in range(i + 1, len(sorted_cols)):
+            if sorted_cols[j] in to_drop:
+                continue
+            if abs_corr[i, j] > threshold:
+                to_drop.add(sorted_cols[j])
+
+    return sorted(to_drop)
+
+
+class CorrelationReducer(Reducer):
+    """A class that removes correlated values from a dataset."""
+
+    _correlation_drop_features: dict[str, bool]
+
+    def __init__(self) -> None:
+        self._threshold = 0.0
+        self._correlation_drop_features = {}
+
+    @classmethod
+    def name(cls) -> str:
+        return "correlation"
+
+    def set_options(
+        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
+    ) -> None:
+        self._threshold = trial.suggest_float(_CORRELATION_REDUCER_THRESHOLD, 0.7, 0.99)
+
+    def load(self, folder: str) -> None:
+        with open(
+            os.path.join(folder, _CORRELATION_REDUCER_FILENAME), encoding="utf8"
+        ) as handle:
+            self._correlation_drop_features = json.load(handle)
+
+    def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
+        with open(
+            os.path.join(folder, _CORRELATION_REDUCER_FILENAME), "w", encoding="utf8"
+        ) as handle:
+            json.dump(self._correlation_drop_features, handle)
+
+    def fit(
+        self,
+        df: pd.DataFrame,
+        y: pd.Series | pd.DataFrame | None = None,
+        w: pd.Series | None = None,
+        eval_x: pd.DataFrame | None = None,
+        eval_y: pd.Series | pd.DataFrame | None = None,
+    ) -> Self:
+        drop_features = _get_correlated_features_to_drop(df, threshold=self._threshold)
+        self._correlation_drop_features = {x: True for x in drop_features}
+        return self
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        return df.drop(
+            columns=list(self._correlation_drop_features.keys()), errors="ignore"
+        )
{wavetrainer-0.0.49 → wavetrainer-0.0.50/wavetrainer.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.0.49
+Version: 0.0.50
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
@@ -26,6 +26,7 @@ Requires-Dist: torch>=2.6.0
 Requires-Dist: tabpfn>=2.0.6
 Requires-Dist: pytest-is-running>=1.5.1
 Requires-Dist: xgboost>=3.0.0
+Requires-Dist: jax>=0.6.1
 
 # wavetrainer
 
@@ -58,6 +59,7 @@ Python 3.11.6:
 - [tabpfn](https://github.com/PriorLabs/TabPFN)
 - [pytest-is-running](https://github.com/adamchainz/pytest-is-running)
 - [xgboost](https://xgboost.readthedocs.io/en/release_3.0.0/)
+- [jax](https://github.com/jax-ml/jax)
 
 ## Raison D'être :thought_balloon:
 
wavetrainer-0.0.49/wavetrainer/reducer/correlation_reducer.py (deleted)

@@ -1,52 +0,0 @@
-"""A reducer that removes correlation features."""
-
-# pylint: disable=too-many-arguments,too-many-positional-arguments
-from typing import Self
-
-import optuna
-import pandas as pd
-from feature_engine.selection import DropCorrelatedFeatures
-
-from .base_selector_reducer import BaseSelectorReducer
-from .non_categorical_numeric_columns import \
-    find_non_categorical_numeric_columns
-
-_CORRELATION_REDUCER_FILENAME = "correlation_reducer.joblib"
-_CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
-
-
-class CorrelationReducer(BaseSelectorReducer):
-    """A class that removes correlated values from a dataset."""
-
-    def __init__(self) -> None:
-        self._correlation_selector = DropCorrelatedFeatures(missing_values="ignore")
-        super().__init__(
-            self._correlation_selector,
-            _CORRELATION_REDUCER_FILENAME,
-        )
-
-    @classmethod
-    def name(cls) -> str:
-        return "correlation"
-
-    @classmethod
-    def should_raise(cls) -> bool:
-        return False
-
-    def set_options(
-        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
-    ) -> None:
-        self._correlation_selector.threshold = trial.suggest_float(
-            _CORRELATION_REDUCER_THRESHOLD, 0.7, 0.99
-        )
-
-    def fit(
-        self,
-        df: pd.DataFrame,
-        y: pd.Series | pd.DataFrame | None = None,
-        w: pd.Series | None = None,
-        eval_x: pd.DataFrame | None = None,
-        eval_y: pd.Series | pd.DataFrame | None = None,
-    ) -> Self:
-        self._correlation_selector.variables = find_non_categorical_numeric_columns(df)
-        return super().fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
|