PyPI - julearn - Versions diffs - 0.3.6.dev47__tar.gz → 0.3.6.dev72__tar.gz - Mend

julearn 0.3.6.dev47tar.gz → 0.3.6.dev72tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (262) hide show

{julearn-0.3.6.dev47 → julearn-0.3.6.dev72}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: julearn
-Version: 0.3.6.dev47
+Version: 0.3.6.dev72
 Summary: Juelich Machine Learning Library
 Author-email: Fede Raimondo <f.raimondo@fz-juelich.de>, Sami Hamdan <s.hamdan@fz-juelich.de>
 Maintainer-email: Sami Hamdan <s.hamdan@fz-juelich.de>
@@ -59,10 +59,12 @@ Requires-Dist: scikit-optimize<0.11.0,>=0.10.2; extra == "skopt"
 Provides-Extra: optuna
 Requires-Dist: optuna<5.0.0,>=4.0.0; extra == "optuna"
 Requires-Dist: optuna_integration<5.0.0,>=4.0.0; extra == "optuna"
+Provides-Extra: xgboost
+Requires-Dist: xgboost<4.0.0,>=3.0.0; extra == "xgboost"
 Provides-Extra: docs
-Requires-Dist: julearn[optuna,skopt,sphinx,viz]; extra == "docs"
+Requires-Dist: julearn[optuna,skopt,sphinx,viz,xgboost]; extra == "docs"
 Provides-Extra: all
-Requires-Dist: julearn[optuna,skopt,viz]; extra == "all"
+Requires-Dist: julearn[optuna,skopt,viz,xgboost]; extra == "all"
 Dynamic: license-file
 # julearn

{julearn-0.3.6.dev47 → julearn-0.3.6.dev72}/docs/api/models.rst RENAMED Viewed

@@ -19,6 +19,21 @@ Functions
     register_model
     reset_model_register
+Julearn custom models
+---------------------
+This is a list of models implemented by Julearn that are not simple wrappers
+around existing models in other libraries but rather variants of existing
+models or novel models.
+.. autosummary::
+   :nosignatures:
+   :toctree: generated/
+   :template: class.rst
+   xgb_cvearlystopping.XGBClassifierCVEarlyStopping
+   xgb_cvearlystopping.XGBRegressorCVEarlyStopping
 Dynamic Selection (DESLib)
 ==========================

{julearn-0.3.6.dev47 → julearn-0.3.6.dev72}/docs/available_pipeline_steps.rst RENAMED Viewed

@@ -235,6 +235,20 @@ Ensemble
      - Y
      - Y
      - Y
+   * - ``xgb``
+     - XGBoost
+     - | :class:`~xgboost.XGBClassifier` and
+       | :class:`~xgboost.XGBRegressor`
+     - Y
+     - Y
+     - Y
+   * - ``xgb_cvearlystopping``
+     - XGBoost with Cross-Validation and Early Stopping
+     - | :class:`~julearn.models.xgb_cvearlystopping.XGBClassifierCVEarlyStopping` and
+       | :class:`~julearn.models.xgb_cvearlystopping.XGBRegressorCVEarlyStopping`
+     - Y
+     - Y
+     - Y
 Gaussian Processes
 ~~~~~~~~~~~~~~~~~~

{julearn-0.3.6.dev47 → julearn-0.3.6.dev72}/docs/conf.py RENAMED Viewed

@@ -231,6 +231,7 @@ intersphinx_mapping = {
         None,
     ),
     "panel": ("https://panel.holoviz.org/", None),
+    "xgboost": ("https://xgboost.readthedocs.io/en/stable/", None),
 }
 # -- sphinx.ext.extlinks configuration ---------------------------------------

{julearn-0.3.6.dev47 → julearn-0.3.6.dev72}/julearn/_version.py RENAMED Viewed

@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
 commit_id: str | None
 __commit_id__: str | None
-__version__ = version = '0.3.6.dev47'
-__version_tuple__ = version_tuple = (0, 3, 6, 'dev47')
+__version__ = version = '0.3.6.dev72'
+__version_tuple__ = version_tuple = (0, 3, 6, 'dev72')
-__commit_id__ = commit_id = 'g5501ac265'
+__commit_id__ = commit_id = 'gbc239b21b'

{julearn-0.3.6.dev47 → julearn-0.3.6.dev72}/julearn/models/available_models.py RENAMED Viewed

@@ -46,6 +46,19 @@ from sklearn.naive_bayes import (
 )
 from sklearn.svm import SVC, SVR
+try:  # pragma: no cover
+    from xgboost import XGBClassifier, XGBRegressor
+    from .xgb_cvearlystopping import (
+        XGBClassifierCVEarlyStopping,
+        XGBRegressorCVEarlyStopping,
+    )
+    _has_xgboost = True
+except ImportError:
+    _has_xgboost = False
 from ..utils import logger, raise_error, warn_with_log
 from ..utils.logging import DelayedFmtMessage as __
 from ..utils.typing import ModelLike
@@ -137,6 +150,24 @@ _available_models: dict[str, dict[str, Any]] = {
     },
 }
+if _has_xgboost is True:
+    _available_models["xgb"] = {
+        "regression": XGBRegressor,
+        "classification": XGBClassifier,
+    }
+    _available_models["xgb_cvearlystopping"] = {
+        "regression": XGBRegressorCVEarlyStopping,
+        "classification": XGBClassifierCVEarlyStopping,
+    }
+    logger.info(
+        "XGBoost is available and has been added to the model registry."
+    )
+else:
+    logger.info(
+        "XGBoost is not available and has not been added to the model "
+        "registry. To use XGBoost models, please install the xgboost package."
+    )
 _available_models_reset = deepcopy(_available_models)

julearn-0.3.6.dev72/julearn/models/tests/test_xgb_cvearlystopping.py ADDED Viewed

@@ -0,0 +1,476 @@
+"""Provide tests for the XGBoost CV early-stopping models."""
+# Authors: Federico Raimondo <f.raimondo@fz-juelich.de>
+# License: AGPL
+import pandas as pd
+import pytest
+from sklearn.utils.validation import _is_fitted
+from julearn.models.xgb_cvearlystopping import (
+    XGBClassifierCVEarlyStopping,
+    XGBRegressorCVEarlyStopping,
+)
+def test_XGBRegressorCVEarlyStopping_grouped(df_iris) -> None:
+    """Test XGBRegressorCVEarlyStopping with grouped data.
+    Parameters
+    ----------
+    df_iris : pd.DataFrame
+        The iris dataset as a DataFrame.
+    """
+    X = ["sepal_length", "sepal_width", "petal_width"]
+    y = "petal_length"
+    n_groups = 20
+    bins = pd.cut(
+        df_iris.index.values, labels=list(range(n_groups)), bins=n_groups
+    )
+    df_iris["group"] = bins.astype(int)
+    model = XGBRegressorCVEarlyStopping(
+        test_size=0.2, early_stopping_rounds=5, random_state=42
+    )
+    assert _is_fitted(model) is False
+    assert not hasattr(model, "_grouped_cv")
+    model.fit(df_iris[X], df_iris[y], groups=df_iris["group"])
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is True
+    assert model._model.get_params()["num_parallel_tree"] is None
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 42
+    assert model._best_iteration is not None
+    assert (
+        model._model.get_params()["n_estimators"] == model._best_iteration + 1
+    )
+    y_pred = model.predict(df_iris[X])
+    assert y_pred.shape == (len(df_iris),)
+    score = model.score(df_iris[X], df_iris[y])
+    assert isinstance(score, float)
+def test_XGBRegressorCVEarlyStopping_notgrouped(df_iris) -> None:
+    """Test XGBRegressorCVEarlyStopping with non-grouped data.
+    Parameters
+    ----------
+    df_iris : pd.DataFrame
+        The iris dataset as a DataFrame.
+    """
+    X = ["sepal_length", "sepal_width", "petal_width"]
+    y = "petal_length"
+    model = XGBRegressorCVEarlyStopping(
+        test_size=0.2, early_stopping_rounds=5, random_state=42
+    )
+    assert _is_fitted(model) is False
+    assert not hasattr(model, "_grouped_cv")
+    model.fit(df_iris[X], df_iris[y])
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is False
+    assert model._model.get_params()["num_parallel_tree"] is None
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 42
+    assert model._best_iteration is not None
+    assert (
+        model._model.get_params()["n_estimators"] == model._best_iteration + 1
+    )
+def test_XGBRegressorCVEarlyStopping_numpy(df_iris) -> None:
+    """Test XGBRegressorCVEarlyStopping with numpy data.
+    Parameters
+    ----------
+    df_iris : pd.DataFrame
+        The iris dataset as a DataFrame.
+    """
+    X = ["sepal_length", "sepal_width", "petal_width"]
+    y = "petal_length"
+    model = XGBRegressorCVEarlyStopping(
+        test_size=0.2,
+        early_stopping_rounds=5,
+        random_state=42,
+        num_parallel_tree=2,
+    )
+    assert _is_fitted(model) is False
+    assert not hasattr(model, "_grouped_cv")
+    model.fit(df_iris[X].values, df_iris[y].values)
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is False
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 42
+    assert model._best_iteration is not None
+    assert model._model.get_params()["num_parallel_tree"] == 2
+    assert (
+        model._model.get_params()["n_estimators"]
+        == (model._best_iteration + 1) * 2
+    )
+def test_XGBClassifierCVEarlyStopping_notgrouped(df_iris) -> None:
+    """Test XGBClassifierCVEarlyStopping with non-grouped data.
+    Parameters
+    ----------
+    df_iris : pd.DataFrame
+        The iris dataset as a DataFrame.
+    """
+    X = ["sepal_length", "sepal_width", "petal_width"]
+    y = "species"
+    model = XGBClassifierCVEarlyStopping(
+        test_size=0.2, early_stopping_rounds=5, random_state=42
+    )
+    assert _is_fitted(model) is False
+    assert not hasattr(model, "_grouped_cv")
+    model.fit(df_iris[X], df_iris[y])
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is False
+    assert model._model.get_params()["num_parallel_tree"] is None
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 42
+    assert model._best_iteration is not None
+    # Three classes, so the number of trees is the best iteration times 3
+    assert (
+        model._model.get_params()["n_estimators"]
+        == (model._best_iteration + 1) * 3
+    )
+def test_XGBClassifierCVEarlyStopping_grouped(df_iris) -> None:
+    """Test XGBClassifierCVEarlyStopping with grouped data.
+    Parameters
+    ----------
+    df_iris : pd.DataFrame
+        The iris dataset as a DataFrame.
+    """
+    X = ["sepal_length", "sepal_width", "petal_width"]
+    y = "species"
+    n_groups = 20
+    bins = pd.cut(
+        df_iris.index.values, labels=list(range(n_groups)), bins=n_groups
+    )
+    df_iris["group"] = bins.astype(int)
+    model = XGBClassifierCVEarlyStopping(
+        test_size=0.2, early_stopping_rounds=5, random_state=42
+    )
+    assert _is_fitted(model) is False
+    assert not hasattr(model, "_grouped_cv")
+    model.fit(df_iris[X], df_iris[y], groups=df_iris["group"])
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is True
+    assert model.get_params()["test_size"] == 0.2
+    assert model.get_params()["early_stopping_rounds"] == 5
+    assert model.get_params()["random_state"] == 42
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 42
+    assert model._best_iteration is not None
+    # Three classes, so the number of trees is the best iteration times 3
+    assert (
+        model._model.get_params()["n_estimators"]
+        == (model._best_iteration + 1) * 3
+    )
+    y_pred = model.predict(df_iris[X])
+    assert y_pred.shape == (len(df_iris),)
+    assert set(y_pred).issubset(set(df_iris[y]))
+    y_probas = model.predict_proba(df_iris[X])
+    assert y_probas.shape == (len(df_iris), 3)
+    assert (y_probas >= 0).all() and (y_probas <= 1).all()
+    score = model.score(df_iris[X], df_iris[y])
+    assert isinstance(score, float)
+def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None:
+    """Test XGBClassifierCVEarlyStopping with binary classification.
+    Parameters
+    ----------
+    df_binary : pd.DataFrame
+        The binary classification dataset as a DataFrame.
+    """
+    X = ["sepal_length", "sepal_width", "petal_width"]
+    y = "species"
+    model = XGBClassifierCVEarlyStopping(
+        test_size=0.2, early_stopping_rounds=5
+    )
+    assert _is_fitted(model) is False
+    assert not hasattr(model, "_grouped_cv")
+    model.fit(df_binary[X], df_binary[y])
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is False
+    assert model.get_params()["test_size"] == 0.2
+    assert model.get_params()["early_stopping_rounds"] == 5
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] is None
+    assert model._best_iteration is not None
+    # Two classes, so the number of trees is the best iteration times 2
+    assert (
+        model._model.get_params()["n_estimators"]
+        == (model._best_iteration + 1) * 2
+    )
+    y_pred = model.predict(df_binary[X])
+    assert y_pred.shape == (len(df_binary),)
+    assert set(y_pred).issubset(set(df_binary[y]))
+    y_probas = model.predict_proba(df_binary[X])
+    assert y_probas.shape == (len(df_binary), 2)
+    assert (y_probas >= 0).all() and (y_probas <= 1).all()
+    score = model.score(df_binary[X], df_binary[y])
+    assert isinstance(score, float)
+def test_XGBClassifierCVEarlyStopping_grouped_numpy(df_iris) -> None:
+    """Test XGBClassifierCVEarlyStopping with grouped data and numpy arrays.
+    Parameters
+    ----------
+    df_iris : pd.DataFrame
+        The iris dataset as a DataFrame.
+    """
+    X = ["sepal_length", "sepal_width", "petal_width"]
+    y = "species"
+    n_groups = 20
+    bins = pd.cut(
+        df_iris.index.values, labels=list(range(n_groups)), bins=n_groups
+    )
+    df_iris["group"] = bins.astype(int)
+    model = XGBClassifierCVEarlyStopping(
+        test_size=0.2, early_stopping_rounds=5, random_state=42
+    )
+    assert _is_fitted(model) is False
+    assert not hasattr(model, "_grouped_cv")
+    model.fit(
+        df_iris[X].values,
+        df_iris[y].values.to_numpy(),
+        groups=df_iris["group"].values,
+    )
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is True
+    assert model.get_params()["test_size"] == 0.2
+    assert model.get_params()["early_stopping_rounds"] == 5
+    assert model.get_params()["random_state"] == 42
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 42
+    assert model._best_iteration is not None
+    # Three classes, so the number of trees is the best iteration times 3
+    assert (
+        model._model.get_params()["n_estimators"]
+        == (model._best_iteration + 1) * 3
+    )
+    y_pred = model.predict(df_iris[X])
+    assert y_pred.shape == (len(df_iris),)
+    assert set(y_pred).issubset(set(df_iris[y]))
+    y_probas = model.predict_proba(df_iris[X])
+    assert y_probas.shape == (len(df_iris), 3)
+    assert (y_probas >= 0).all() and (y_probas <= 1).all()
+    score = model.score(df_iris[X], df_iris[y])
+    assert isinstance(score, float)
+def test_XGBClassifierCVEarlyStopping_errors() -> None:
+    """Test XGBClassifierCVEarlyStopping error handling."""
+    with pytest.raises(ValueError, match="early_stopping_rounds"):
+        model = XGBClassifierCVEarlyStopping(
+            test_size=0.2, early_stopping_rounds=None, random_state=42
+        )
+    with pytest.raises(ValueError, match="not fitted"):
+        model = XGBClassifierCVEarlyStopping(
+            test_size=None, early_stopping_rounds=5, random_state=42
+        )
+        model.predict([[1, 2], [3, 4], [5, 6]])
+    with pytest.raises(ValueError, match="not fitted"):
+        model = XGBClassifierCVEarlyStopping(
+            test_size=None, early_stopping_rounds=5, random_state=42
+        )
+        model.predict_proba([[1, 2], [3, 4], [5, 6]])
+def test_XGBClassifierCVEarlyStopping_numpy(df_iris) -> None:
+    """Test XGBClassifierCVEarlyStopping with numpy data.
+    Parameters
+    ----------
+    df_iris : pd.DataFrame
+        The iris dataset as a DataFrame.
+    """
+    X = ["sepal_length", "sepal_width", "petal_width"]
+    y = "species"
+    model = XGBClassifierCVEarlyStopping(
+        test_size=0.2, early_stopping_rounds=5, random_state=42
+    )
+    assert _is_fitted(model) is False
+    assert not hasattr(model, "_grouped_cv")
+    model.fit(df_iris[X].values, df_iris[y].values)
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is False
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 42
+    assert model._best_iteration is not None
+    # Three classes, so the number of trees is the best iteration times 3
+    assert (
+        model._model.get_params()["n_estimators"]
+        == (model._best_iteration + 1) * 3
+    )
+    y_nostring = df_iris[y].values.to_numpy() == "setosa"
+    model.fit(df_iris[X].values, y_nostring)
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is False
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 42
+    assert model._best_iteration is not None
+    # Binary target now, so the number of trees is the best iteration times 2
+    assert (
+        model._model.get_params()["n_estimators"]
+        == (model._best_iteration + 1) * 2
+    )
+    y_pred = model.predict(df_iris[X].values)
+    assert y_pred.shape == (len(df_iris),)
+    assert set(y_pred).issubset(set(y_nostring))
+    y_probas = model.predict_proba(df_iris[X].values)
+    assert y_probas.shape == (len(df_iris), 2)
+    assert (y_probas >= 0).all() and (y_probas <= 1).all()
+    score = model.score(df_iris[X].values, y_nostring)
+    assert isinstance(score, float)
+def test_XGBClassifierCVEarlyStopping_set_params(df_iris) -> None:
+    """Test XGBClassifierCVEarlyStopping set_params followed by a refit.
+    Parameters
+    ----------
+    df_iris : pd.DataFrame
+        The iris dataset as a DataFrame.
+    """
+    X = ["sepal_length", "sepal_width", "petal_width"]
+    y = "species"
+    n_groups = 20
+    bins = pd.cut(
+        df_iris.index.values, labels=list(range(n_groups)), bins=n_groups
+    )
+    df_iris["group"] = bins.astype(int)
+    model = XGBClassifierCVEarlyStopping(
+        test_size=0.2, early_stopping_rounds=5, random_state=42
+    )
+    assert _is_fitted(model) is False
+    assert not hasattr(model, "_grouped_cv")
+    model.fit(df_iris[X], df_iris[y], groups=df_iris["group"])
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is True
+    assert model.get_params()["test_size"] == 0.2
+    assert model.get_params()["early_stopping_rounds"] == 5
+    assert model.get_params()["random_state"] == 42
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 42
+    assert model._best_iteration is not None
+    # Three classes, so the number of trees is the best iteration times 3
+    assert (
+        model._model.get_params()["n_estimators"]
+        == (model._best_iteration + 1) * 3
+    )
+    model.set_params(
+        test_size=0.3,
+        early_stopping_rounds=10,
+        random_state=24,
+        num_parallel_tree=2,
+    )
+    assert model.get_params()["test_size"] == 0.3
+    assert model.get_params()["early_stopping_rounds"] == 10
+    assert model.get_params()["random_state"] == 24
+    assert model.get_params()["num_parallel_tree"] == 2
+    model.fit(df_iris[X], df_iris[y], groups=df_iris["group"])
+    assert _is_fitted(model)
+    assert hasattr(model, "_grouped_cv")
+    assert model._grouped_cv is True
+    assert model.get_params()["test_size"] == 0.3
+    assert model.get_params()["early_stopping_rounds"] == 10
+    assert model.get_params()["random_state"] == 24
+    assert model.get_params()["num_parallel_tree"] == 2
+    # Check that the model was refit with the best number of iterations
+    assert model._model.get_params()["early_stopping_rounds"] is None
+    assert model._model.get_params()["random_state"] == 24
+    assert model._best_iteration is not None
+    # Three classes and num_parallel_tree=2: best iteration times 3 * 2
+    assert (
+        model._model.get_params()["n_estimators"]
+        == (model._best_iteration + 1) * 3 * 2
+    )

julearn 0.3.6.dev47__tar.gz → 0.3.6.dev72__tar.gz

julearn 0.3.6.dev47tar.gz → 0.3.6.dev72tar.gz