julearn 0.3.2.dev21__tar.gz → 0.3.2.dev57__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/PKG-INFO +6 -1
- julearn-0.3.2.dev57/docs/changes/newsfragments/255.bugfix +1 -0
- julearn-0.3.2.dev57/docs/changes/newsfragments/260.enh +1 -0
- julearn-0.3.2.dev57/docs/changes/newsfragments/260.misc +1 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/conf.py +1 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/getting_started.rst +5 -1
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/links.inc +1 -0
- julearn-0.3.2.dev57/examples/03_complex_models/run_hyperparameter_tuning_bayessearch.py +95 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_hyperparameters_docs.py +123 -13
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/_version.py +2 -2
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/api.py +32 -23
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/conftest.py +134 -1
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/inspector.py +8 -5
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/tests/test_pipeline.py +7 -7
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/model_selection/__init__.py +4 -0
- julearn-0.3.2.dev57/julearn/model_selection/_skopt_searcher.py +32 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/model_selection/available_searchers.py +66 -5
- julearn-0.3.2.dev57/julearn/model_selection/tests/test_available_searchers.py +83 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/pipeline/merger.py +44 -35
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/pipeline/pipeline_creator.py +62 -4
- {julearn-0.3.2.dev21/julearn/pipeline/test → julearn-0.3.2.dev57/julearn/pipeline/tests}/test_merger.py +11 -2
- {julearn-0.3.2.dev21/julearn/pipeline/test → julearn-0.3.2.dev57/julearn/pipeline/tests}/test_pipeline_creator.py +231 -8
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/viz/_scores.py +1 -2
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn.egg-info/PKG-INFO +6 -1
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn.egg-info/SOURCES.txt +9 -4
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn.egg-info/requires.txt +7 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/pyproject.toml +11 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/tox.ini +15 -4
- julearn-0.3.2.dev21/julearn/model_selection/tests/test_available_searchers.py +0 -44
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/ISSUE_TEMPLATE/documentation_request.yaml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/workflows/check-stale.yml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/workflows/ci-docs.yml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/workflows/ci.yml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/workflows/docs-preview.yml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/workflows/docs.yml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/workflows/lint.yml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.github/workflows/pypi.yml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.gitignore +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/.pre-commit-config.yaml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/AUTHORS.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/LICENSE.md +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/README.md +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/codecov.yml +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/Makefile +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/_static/css/custom.css +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/_static/js/custom.js +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/_templates/class.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/_templates/function.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/_templates/function_warning.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/_templates/versions.html +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/base.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/index.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/inspect.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/main.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/model_selection.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/models.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/pipeline.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/prepare.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/scoring.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/stats.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/transformers.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/utils.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/api/viz.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/available_pipeline_steps.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/changes/contributors.inc +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/changes/newsfragments/.gitignore +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/changes/newsfragments/224.misc +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/changes/newsfragments/244.misc +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/changes/newsfragments/249.bugfix +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/changes/newsfragments/251.misc +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/configuration.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/contributing.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/examples.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/faq.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/corrected_ttest.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/final_estimator.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/iris_X.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/iris_df.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/iris_y.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/julearn_logo.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/julearn_logo_calm.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/julearn_logo_confbias.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/julearn_logo_cv.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/julearn_logo_generalization.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/julearn_logo_it.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/julearn_logo_ml.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/julearn_logo_mlit.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/multiple_scorers_run_cv.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/plot_scores.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/scores_run_cv.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/scores_run_cv_splitter.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/images/scores_run_cv_train.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/index.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/maintaining.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/redirect.html +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/selected_deeper_topics/CBPM.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/selected_deeper_topics/confound_removal.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/selected_deeper_topics/cross_validation_splitter.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/selected_deeper_topics/hyperparameter_tuning.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/selected_deeper_topics/index.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/selected_deeper_topics/model_inspect.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/selected_deeper_topics/stacked_models.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/selected_deeper_topics/target_transformers.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/sphinxext/gh_substitutions.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/what_really_need_know/cross_validation.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/what_really_need_know/data.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/what_really_need_know/index.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/what_really_need_know/model_comparison.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/what_really_need_know/model_evaluation.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/what_really_need_know/pipeline.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/whats_new.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/00_starting/README.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/00_starting/plot_cm_acc_multiclass.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/00_starting/plot_example_regression.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/00_starting/plot_stratified_kfold_reg.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/00_starting/run_combine_pandas.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/00_starting/run_grouped_cv.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/00_starting/run_simple_binary_classification.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/01_model_comparison/README.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/01_model_comparison/plot_simple_model_comparison.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/02_inspection/README.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/02_inspection/plot_groupcv_inspect_svm.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/02_inspection/plot_inspect_random_forest.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/02_inspection/plot_preprocess.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/02_inspection/run_binary_inspect_folds.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/03_complex_models/README.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/03_complex_models/run_apply_to_target.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/03_complex_models/run_example_pca_featsets.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/03_complex_models/run_hyperparameter_multiple_grids.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/03_complex_models/run_hyperparameter_tuning.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/03_complex_models/run_stacked_models.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/04_confounds/README.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/04_confounds/plot_confound_removal_classification.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/04_confounds/run_return_confounds.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/05_customization/README.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/05_customization/run_custom_scorers_regression.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/README.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_cbpm_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_confound_removal_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_cv_splitters_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_data_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_model_comparison_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_model_evaluation_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_model_inspection_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_pipeline_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_stacked_models_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_target_transformer_docs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/README.rst +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/XX_disabled/dis_run_n_jobs.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/XX_disabled/dis_run_target_confound_removal.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/ignore_words.txt +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/base/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/base/column_types.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/base/estimators.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/base/tests/test_base_estimators.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/base/tests/test_column_types.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/config.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/_cv.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/_pipeline.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/_preprocess.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/tests/test_cv.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/tests/test_inspector.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/tests/test_preprocess.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/model_selection/continuous_stratified_kfold.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/model_selection/stratified_bootstrap.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/model_selection/tests/test_continous_stratified_kfold.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/model_selection/tests/test_stratified_bootstrap.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/models/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/models/available_models.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/models/dynamic.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/models/tests/test_available_models.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/models/tests/test_dynamic.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/models/tests/test_models.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/pipeline/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/pipeline/target_pipeline.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/pipeline/target_pipeline_creator.py +0 -0
- {julearn-0.3.2.dev21/julearn/pipeline/test → julearn-0.3.2.dev57/julearn/pipeline/tests}/test_target_pipeline.py +0 -0
- {julearn-0.3.2.dev21/julearn/pipeline/test → julearn-0.3.2.dev57/julearn/pipeline/tests}/test_target_pipeline_creator.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/prepare.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/scoring/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/scoring/available_scorers.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/scoring/metrics.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/scoring/tests/test_available_scorers.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/scoring/tests/test_metrics.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/stats/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/stats/corrected_ttest.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/stats/tests/test_corrected_ttest.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/tests/test_api.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/tests/test_config.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/tests/test_prepare.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/available_transformers.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/cbpm.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/confound_remover.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/dataframe/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/dataframe/change_column_types.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/dataframe/drop_columns.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/dataframe/filter_columns.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/dataframe/set_column_types.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/dataframe/tests/test_change_column_types.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/dataframe/tests/test_drop_columns.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/dataframe/tests/test_filter_columns.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/dataframe/tests/test_set_column_types.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/ju_column_transformer.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/target/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/target/available_target_transformers.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/target/ju_target_transformer.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/target/ju_transformed_target_model.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/target/target_confound_remover.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/target/tests/test_available_target_transformers.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/target/tests/test_ju_target_transformer.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/target/tests/test_ju_transformed_target_model.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/target/tests/test_target_confound_remover.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/tests/test_available_transformers.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/tests/test_cbpm.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/tests/test_confounds.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/transformers/tests/test_jucolumntransformers.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/utils/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/utils/_cv.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/utils/checks.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/utils/logging.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/utils/testing.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/utils/tests/test_logging.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/utils/tests/test_version.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/utils/typing.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/utils/versions.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/viz/__init__.py +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/viz/res/julearn_logo_generalization.png +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn.egg-info/dependency_links.txt +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn.egg-info/top_level.txt +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/setup.cfg +0 -0
- {julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/setup.py +0 -0
{julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: julearn
-Version: 0.3.2.dev21
+Version: 0.3.2.dev57
 Summary: Juelich Machine Learning Library
 Author-email: Fede Raimondo <f.raimondo@fz-juelich.de>, Sami Hamdan <s.hamdan@fz-juelich.de>
 Maintainer-email: Sami Hamdan <s.hamdan@fz-juelich.de>
@@ -40,12 +40,17 @@ Requires-Dist: furo<2024.0.0,>=2022.9.29; extra == "docs"
 Requires-Dist: sphinx_copybutton<0.6,>=0.5.0; extra == "docs"
 Requires-Dist: numpydoc<1.6,>=1.5.0; extra == "docs"
 Requires-Dist: towncrier<24; extra == "docs"
+Requires-Dist: scikit-optimize<0.11,>=0.10.0; extra == "docs"
 Provides-Extra: deslib
 Requires-Dist: deslib<0.4,>=0.3.5; extra == "deslib"
 Provides-Extra: viz
 Requires-Dist: panel>=1.3.0; extra == "viz"
 Requires-Dist: bokeh>=3.0.0; extra == "viz"
 Requires-Dist: param>=2.0.0; extra == "viz"
+Provides-Extra: skopt
+Requires-Dist: scikit-optimize<0.11,>=0.10.0; extra == "skopt"
+Provides-Extra: all
+Requires-Dist: julearn[skopt,viz]; extra == "all"

 # julearn

julearn-0.3.2.dev57/docs/changes/newsfragments/255.bugfix

@@ -0,0 +1 @@
+Update bokeh api calls to remove warnings by `Fede Raimondo`_

julearn-0.3.2.dev57/docs/changes/newsfragments/260.enh

@@ -0,0 +1 @@
+Add :class:`~skopt.BayesSearchCV` to the list of available searchers as 'bayes' by `Fede Raimondo`_

julearn-0.3.2.dev57/docs/changes/newsfragments/260.misc

@@ -0,0 +1 @@
+Add ``all`` as optional dependencies to install all functional dependencies by `Fede Raimondo`_
{julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/conf.py

@@ -160,6 +160,7 @@ intersphinx_mapping = {
     # "sqlalchemy": ("https://docs.sqlalchemy.org/en/20/", None),
     "joblib": ("https://joblib.readthedocs.io/en/latest/", None),
     "scipy": ("https://docs.scipy.org/doc/scipy/", None),
+    "skopt": ("https://scikit-optimize.readthedocs.io/en/latest", None),
 }

{julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/docs/getting_started.rst

@@ -86,4 +86,8 @@ The following optional dependencies are available:

 * ``viz``: Visualization tools for ``julearn``. This includes the
   :mod:`.viz` module.
-* ``deslib``: The :mod:`.dynamic` module requires the `deslib`_ package.
+* ``deslib``: The :mod:`.dynamic` module requires the `deslib`_ package. This
+  module is not compatible with newer Python versions and it is unmaintained.
+* ``skopt``: Using the ``"bayes"`` searcher (:class:`~skopt.BayesSearchCV`)
+  requires the `scikit-optimize`_ package.
+* ``all``: Install all optional functional dependencies (except ``deslib``).
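The ``skopt`` and ``all`` extras introduced above only gate an optional runtime
import: the ``"bayes"`` searcher needs ``scikit-optimize`` to be importable.
A minimal sketch (not part of the diff) of how calling code might fall back to
the default grid search when that optional dependency is missing:

    # Hypothetical guard around the optional scikit-optimize dependency
    # (provided by the "skopt" or "all" extras of julearn).
    try:
        import skopt  # noqa: F401
        search_kind = "bayes"
    except ImportError:
        search_kind = "grid"  # julearn's default searcher

    search_params = {"kind": search_kind, "n_iter": 10}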
julearn-0.3.2.dev57/examples/03_complex_models/run_hyperparameter_tuning_bayessearch.py

@@ -0,0 +1,95 @@
+"""
+Tuning Hyperparameters using Bayesian Search
+============================================
+
+This example uses the ``fmri`` dataset, performs simple binary classification
+using a Support Vector Machine classifier and analyzes the model.
+
+References
+----------
+
+Waskom, M.L., Frank, M.C., Wagner, A.D. (2016). Adaptive engagement of
+cognitive control in context-dependent decision-making. Cerebral Cortex.
+
+.. include:: ../../links.inc
+"""
+
+# Authors: Federico Raimondo <f.raimondo@fz-juelich.de>
+# License: AGPL
+
+import numpy as np
+from seaborn import load_dataset
+
+from julearn import run_cross_validation
+from julearn.utils import configure_logging, logger
+from julearn.pipeline import PipelineCreator
+
+
+###############################################################################
+# Set the logging level to info to see extra information.
+configure_logging(level="INFO")
+
+###############################################################################
+# Set the random seed to always have the same example.
+np.random.seed(42)
+
+###############################################################################
+# Load the dataset.
+df_fmri = load_dataset("fmri")
+df_fmri.head()
+
+###############################################################################
+# Set the dataframe in the right format.
+df_fmri = df_fmri.pivot(
+    index=["subject", "timepoint", "event"], columns="region", values="signal"
+)
+
+df_fmri = df_fmri.reset_index()
+df_fmri.head()
+
+###############################################################################
+# Following the hyperparamter tuning example, we will now use a Bayesian
+# search to find the best hyperparameters for the SVM model.
+X = ["frontal", "parietal"]
+y = "event"
+
+creator1 = PipelineCreator(problem_type="classification")
+creator1.add("zscore")
+creator1.add(
+    "svm",
+    kernel=["linear"],
+    C=(1e-6, 1e3, "log-uniform"),
+)
+
+creator2 = PipelineCreator(problem_type="classification")
+creator2.add("zscore")
+creator2.add(
+    "svm",
+    kernel=["rbf"],
+    C=(1e-6, 1e3, "log-uniform"),
+    gamma=(1e-6, 1e1, "log-uniform"),
+)
+
+search_params = {
+    "kind": "bayes",
+    "cv": 2,  # to speed up the example
+    "n_iter": 10,  # 10 iterations of bayesian search to speed up example
+}
+
+
+scores, estimator = run_cross_validation(
+    X=X,
+    y=y,
+    data=df_fmri,
+    model=[creator1, creator2],
+    cv=2,  # to speed up the example
+    search_params=search_params,
+    return_estimator="final",
+)
+
+print(scores["test_score"].mean())
+
+
+###############################################################################
+# It seems that we might have found a better model, but which one is it?
+print(estimator.best_params_)
{julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/examples/99_docs/run_hyperparameters_docs.py

@@ -243,22 +243,132 @@ pprint(model_tuned.best_params_)
 # tries to find the best combination of values for the hyperparameters using
 # cross-validation.
 #
-# By default, ``julearn`` uses a
-#
-#
-#
-#
-#
+# By default, ``julearn`` uses a
+# :class:`~sklearn.model_selection.GridSearchCV`.
+# This searcher, specified as ``"grid"`` is very simple. First, it constructs
+# the _grid_ of hyperparameters to try. As we see above, we have 3
+# hyperparameters to tune. So it constructs a 3-dimentional grid with all the
+# possible combinations of the hyperparameters values. The second step is to
+# perform cross-validation on each of the possible combinations of
+# hyperparameters values.
 #
-#
-# :class:`~sklearn.model_selection.RandomizedSearchCV
-#
-#
+# Other searchers that ``julearn`` provides are the
+# :class:`~sklearn.model_selection.RandomizedSearchCV` and
+# :class:`~skopt.BayesSearchCV`.
+#
+# The randomized searcher
+# (:class:`~sklearn.model_selection.RandomizedSearchCV`) is similar to the
+# :class:`~sklearn.model_selection.GridSearchCV`, but instead
+# of trying all the possible combinations of hyperparameter values, it tries
 # a random subset of them. This is useful when we have a lot of hyperparameters
-# to tune, since it can be very time consuming to try all the possible
-# as continuous parameters that can be sampled out of a
-# more information, see the
+# to tune, since it can be very time consuming to try all the possible
+# combinations, as well as continuous parameters that can be sampled out of a
+# distribution. For more information, see the
 # :class:`~sklearn.model_selection.RandomizedSearchCV` documentation.
+#
+# The Bayesian searcher (:class:`~skopt.BayesSearchCV`) is a bit more
+# complex. It uses Bayesian optimization to find the best hyperparameter set.
+# As with the randomized search, it is useful when we have many
+# hyperparameters to tune, and we don't want to try all the possible
+# combinations due to computational constraints. For more information, see the
+# :class:`~skopt.BayesSearchCV` documentation, including how to specify
+# the prior distributions of the hyperparameters.
+#
+# We can specify the kind of searcher and its parametrization, by setting the
+# ``search_params`` parameter in the :func:`.run_cross_validation` function.
+# For example, we can use the
+# :class:`~sklearn.model_selection.RandomizedSearchCV` searcher with
+# 10 iterations of random search.
+
+search_params = {
+    "kind": "random",
+    "n_iter": 10,
+}
+
+scores_tuned, model_tuned = run_cross_validation(
+    X=X,
+    y=y,
+    data=df,
+    X_types=X_types,
+    model=creator,
+    return_estimator="all",
+    search_params=search_params,
+)
+
+print(
+    "Scores with best hyperparameter using 10 iterations of "
+    f"randomized search: {scores_tuned['test_score'].mean()}"
+)
+pprint(model_tuned.best_params_)
+
+###############################################################################
+# We can now see that the best hyperparameter might be different from the grid
+# search. This is because it tried only 10 combinations and not the whole grid.
+# Furthermore, the :class:`~sklearn.model_selection.RandomizedSearchCV`
+# searcher can sample hyperparameters from distributions, which can be useful
+# when we have continuous hyperparameters.
+# Let's set both ``C`` and ``gamma`` to be sampled from log-uniform
+# distributions. We can do this by setting the hyperparameter values as a
+# tuple with the following format: ``(low, high, distribution)``. The
+# distribution can be either ``"log-uniform"`` or ``"uniform"``.
+
+creator = PipelineCreator(problem_type="classification")
+creator.add("zscore")
+creator.add("select_k", k=[2, 3, 4])
+creator.add(
+    "svm",
+    C=(0.01, 10, "log-uniform"),
+    gamma=(1e-3, 1e-1, "log-uniform"),
+)
+
+print(creator)
+
+scores_tuned, model_tuned = run_cross_validation(
+    X=X,
+    y=y,
+    data=df,
+    X_types=X_types,
+    model=creator,
+    return_estimator="all",
+    search_params=search_params,
+)
+
+print(
+    "Scores with best hyperparameter using 10 iterations of "
+    f"randomized search: {scores_tuned['test_score'].mean()}"
+)
+pprint(model_tuned.best_params_)
+
+
+###############################################################################
+# We can also control the number of cross-validation folds used by the searcher
+# by setting the ``cv`` parameter in the ``search_params`` dictionary. For
+# example, we can use a bayesian search with 3 folds. Fortunately, the
+# :class:`~skopt.BayesSearchCV` searcher also accepts distributions for the
+# hyperparameters.
+
+search_params = {
+    "kind": "bayes",
+    "n_iter": 10,
+    "cv": 3,
+}
+
+scores_tuned, model_tuned = run_cross_validation(
+    X=X,
+    y=y,
+    data=df,
+    X_types=X_types,
+    model=creator,
+    return_estimator="all",
+    search_params=search_params,
+)
+
+print(
+    "Scores with best hyperparameter using 10 iterations of "
+    f"bayesian search and 3-fold CV: {scores_tuned['test_score'].mean()}"
+)
+pprint(model_tuned.best_params_)
+

 ###############################################################################
 #
{julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/_version.py

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '0.3.2.dev21'
-__version_tuple__ = version_tuple = (0, 3, 2, 'dev21')
+__version__ = version = '0.3.2.dev57'
+__version_tuple__ = version_tuple = (0, 3, 2, 'dev57')
{julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/api.py

@@ -4,12 +4,16 @@
 # Sami Hamdan <s.hamdan@fz-juelich.de>
 # License: AGPL

-from typing import Dict, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Union

 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator
-from sklearn.model_selection import
+from sklearn.model_selection import (
+    BaseCrossValidator,
+    check_cv,
+    cross_validate,
+)
 from sklearn.model_selection._search import BaseSearchCV
 from sklearn.pipeline import Pipeline

@@ -25,14 +29,14 @@ def run_cross_validation(  # noqa: C901
     X: List[str],  # noqa: N803
     y: str,
     model: Union[str, PipelineCreator, BaseEstimator, List[PipelineCreator]],
+    data: pd.DataFrame,
     X_types: Optional[Dict] = None,  # noqa: N803
-    data: Optional[pd.DataFrame] = None,
     problem_type: Optional[str] = None,
     preprocess: Union[None, str, List[str]] = None,
     return_estimator: Optional[str] = None,
     return_inspector: bool = False,
     return_train_score: bool = False,
-    cv: Optional[int] = None,
+    cv: Optional[Union[int, BaseCrossValidator, Iterable]] = None,
     groups: Optional[str] = None,
     scoring: Union[str, List[str], None] = None,
     pos_labels: Union[str, List[str], None] = None,
@@ -54,12 +58,11 @@ def run_cross_validation(  # noqa: C901
         See :ref:`data_usage` for details.
     model : str or scikit-learn compatible model.
         If string, it will use one of the available models.
+    data : pandas.DataFrame
+        DataFrame with the data. See :ref:`data_usage` for details.
     X_types : dict[str, list of str]
         A dictionary containing keys with column type as a str and the
         columns of this column type as a list of str.
-    data : pandas.DataFrame | None
-        DataFrame with the data (optional).
-        See :ref:`data_usage` for details.
     problem_type : str
         The kind of problem to model.

@@ -132,8 +135,8 @@ def run_cross_validation(  # noqa: C901
         the following keys:

         * 'kind': The kind of search algorithm to use, e.g.:
-          'grid' or '
-          scikit-learn compatible searcher.
+          'grid', 'random' or 'bayes'. Can be any valid julearn searcher name
+          or scikit-learn compatible searcher.
         * 'cv': If a searcher is going to be used, the cross-validation
           splitting strategy to use. Defaults to same CV as for the model
           evaluation.
@@ -196,7 +199,7 @@ def run_cross_validation(  # noqa: C901
     np.random.seed(seed)

     # Interpret the input data and prepare it to be used with the library
-    df_X,
+    df_X, df_y, df_groups, X_types = prepare_input_data(
         X=X,
         y=y,
         df=data,
@@ -267,7 +270,7 @@ def run_cross_validation(  # noqa: C901

     if has_target_transformer:
         if isinstance(pipeline, BaseSearchCV):
-            last_step = pipeline.estimator[-1]
+            last_step = pipeline.estimator[-1]  # type: ignore
         else:
             last_step = pipeline[-1]
         if not last_step.can_inverse_transform():
@@ -313,7 +316,7 @@ def run_cross_validation(  # noqa: C901
                 "Cannot use model_params with a model object. Use either "
                 "a string or a PipelineCreator"
             )
-        pipeline_creator.add(step=model, **t_params)
+        pipeline_creator.add(step=model, **t_params)  # type: ignore

     # Check for extra model_params that are not used
     unused_params = []
@@ -346,17 +349,19 @@ def run_cross_validation(  # noqa: C901
     logger.info("")

     if problem_type == "classification":
-        logger.info(f"\tNumber of classes: {len(np.unique(
-        logger.info(f"\tTarget type: {
-        logger.info(f"\tClass distributions: {
+        logger.info(f"\tNumber of classes: {len(np.unique(df_y))}")
+        logger.info(f"\tTarget type: {df_y.dtype}")
+        logger.info(f"\tClass distributions: {df_y.value_counts()}")
     elif problem_type == "regression":
-        logger.info(f"\tTarget type: {
+        logger.info(f"\tTarget type: {df_y.dtype}")

     # Prepare cross validation
-    cv_outer = check_cv(
+    cv_outer = check_cv(
+        cv, classifier=problem_type == "classification"  # type: ignore
+    )
     logger.info(f"Using outer CV scheme {cv_outer}")

-    check_consistency(
+    check_consistency(df_y, cv, groups, problem_type)  # type: ignore

     cv_return_estimator = return_estimator in ["cv", "all"]
     scoring = check_scoring(pipeline, scoring, wrap_score=wrap_score)
@@ -369,14 +374,14 @@ def run_cross_validation(  # noqa: C901
     scores = cross_validate(
         pipeline,
         df_X,
-
+        df_y,
         cv=cv_outer,
         scoring=scoring,
         groups=df_groups,
         return_estimator=cv_return_estimator,
         n_jobs=n_jobs,
         return_train_score=return_train_score,
-        verbose=verbose,
+        verbose=verbose,  # type: ignore
         fit_params=fit_params,
     )

@@ -387,7 +392,10 @@ def run_cross_validation(  # noqa: C901
     folds = np.tile(np.arange(n_folds), n_repeats)

     fold_sizes = np.array(
-        [
+        [
+            list(map(len, x))
+            for x in cv_outer.split(df_X, df_y, groups=df_groups)
+        ]
     )
     scores["n_train"] = fold_sizes[:, 0]
     scores["n_test"] = fold_sizes[:, 1]
@@ -398,7 +406,8 @@ def run_cross_validation(  # noqa: C901
     scores_df = pd.DataFrame(scores)
     out = scores_df
     if return_estimator in ["final", "all"]:
-
+        logger.info("Fitting final model")
+        pipeline.fit(df_X, df_y, **fit_params)
         out = scores_df, pipeline

     if return_inspector:
@@ -406,7 +415,7 @@ def run_cross_validation(  # noqa: C901
             scores=scores_df,
             model=pipeline,
             X=df_X,
-            y=
+            y=df_y,
             groups=df_groups,
             cv=cv_outer,
         )
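Taken together, the ``api.py`` hunks make ``data`` a required ``pandas.DataFrame``
and widen ``cv`` to accept an int, a ``BaseCrossValidator`` instance, or an
iterable of splits. A hypothetical call against the updated signature (the iris
dataset and column names are illustrative only, not taken from the diff):

    from seaborn import load_dataset
    from sklearn.model_selection import StratifiedKFold

    from julearn import run_cross_validation

    df = load_dataset("iris")  # illustrative dataset

    scores = run_cross_validation(
        X=["sepal_length", "sepal_width", "petal_length", "petal_width"],
        y="species",
        data=df,  # documented as a required DataFrame in this version
        model="svm",
        problem_type="classification",
        # cv may be an int, a BaseCrossValidator, or an iterable of splits
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    )
    print(scores["test_score"].mean())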
{julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/conftest.py

@@ -8,10 +8,77 @@ from copy import copy
 from typing import Callable, Dict, List, Optional, Union

 import pandas as pd
-
+import pytest
+from pytest import FixtureRequest, fixture, mark
 from seaborn import load_dataset


+_filter_keys = {
+    "nodeps": "Test that runs without conditional dependencies only",
+}
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Add a new marker to pytest.
+
+    Parameters
+    ----------
+    config : pytest.Config
+        The pytest configuration object.
+
+    """
+    # register your new marker to avoid warnings
+    for k, v in _filter_keys.items():
+        config.addinivalue_line("markers", f"{k}: {v}")
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Add a new filter option to pytest.
+
+    Parameters
+    ----------
+    parser : pytest.Parser
+        The pytest parser object.
+
+    """
+    # add your new filter option (you can name it whatever you want)
+    parser.addoption(
+        "--filter",
+        action="store",
+        help="Select tests based on markers.",
+    )
+
+
+def pytest_collection_modifyitems(
+    config: pytest.Config, items: List[pytest.Item]
+) -> None:
+    """Filter tests based on the key marker.
+
+    Parameters
+    ----------
+    config : pytest.Config
+        The pytest configuration object.
+    items : list
+        The list of items.
+
+    """
+    filter = config.getoption("--filter", None)  # type: ignore
+    if filter is None:
+        for k in _filter_keys.keys():
+            skip_keys = mark.skip(
+                reason=f"Filter not specified for this test: {k}"
+            )
+            for item in items:
+                if k in item.keywords:
+                    item.add_marker(skip_keys)  # skip the test
+    else:
+        new_items = []
+        for item in items:
+            if filter in item.keywords:
+                new_items.append(item)
+        items[:] = new_items
+
+
 @fixture(scope="function")
 def df_typed_iris() -> pd.DataFrame:
     """Return a typed iris dataset.
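The hooks above register a ``nodeps`` marker and a ``--filter`` command-line
option, and skip marked tests unless that filter is passed. An illustrative test
module (not part of the diff) showing how the marker would be consumed, e.g.
when the suite is run as ``pytest --filter nodeps``:

    import pytest


    @pytest.mark.nodeps
    def test_runs_without_optional_dependencies() -> None:
        # Collected normally, but the conftest hooks above skip it unless
        # pytest is invoked with ``--filter nodeps``.
        assert True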
{julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/conftest.py

@@ -191,6 +258,32 @@ def search_params(request: FixtureRequest) -> Optional[Dict]:
         A dictionary with the search_params argument.

     """
+
+    return request.param
+
+
+@fixture(
+    params=[
+        {"kind": "bayes", "n_iter": 2, "cv": 3},
+        {"kind": "bayes", "n_iter": 2},
+    ],
+    scope="function",
+)
+def bayes_search_params(request: FixtureRequest) -> Optional[Dict]:
+    """Return different search_params argument for BayesSearchCV.
+
+    Parameters
+    ----------
+    request : pytest.FixtureRequest
+        The request object.
+
+    Returns
+    -------
+    dict or None
+        A dictionary with the search_params argument.
+
+    """
+
     return request.param


@@ -234,6 +327,46 @@ def get_tuning_params() -> Callable:
     return get


+_tuning_distributions = {
+    "zscore": {"with_mean": [True, False]},
+    "pca": {"n_components": (0.2, 0.7, "uniform")},
+    "select_univariate": {"mode": ["k_best", "percentile"]},
+    "rf": {"n_estimators": [2, 5]},
+    "svm": {"C": (1, 10, "log-uniform")},
+    "ridge": {"alpha": (1, 3, "uniform")},
+}
+
+
+@fixture(scope="function")
+def get_tuning_distributions() -> Callable:
+    """Return a function that returns the distributions to tune.
+
+    Returns
+    -------
+    get : callable
+        A function that returns the distributions to tune for a given step.
+
+    """
+
+    def get(step: str) -> Dict:
+        """Return the distributions to tune for a given step.
+
+        Parameters
+        ----------
+        step : str
+            The name of the step.
+
+        Returns
+        -------
+        dict
+            The distributions to tune for the given step.
+
+        """
+        return copy(_tuning_distributions.get(step, {}))
+
+    return get
+
+
 @fixture(
     params=[
         "zscore",
{julearn-0.3.2.dev21 → julearn-0.3.2.dev57}/julearn/inspect/inspector.py

@@ -6,13 +6,16 @@

 from typing import TYPE_CHECKING, List, Optional, Union

+import pandas as pd
+from sklearn.model_selection import BaseCrossValidator
+
 from ..utils.logging import raise_error
 from ._cv import FoldsInspector
 from ._pipeline import PipelineInspector


 if TYPE_CHECKING:
-
+
     from sklearn.base import BaseEstimator

     from ..pipeline.pipeline_creator import PipelineCreator
@@ -48,10 +51,10 @@ class Inspector:
             "BaseEstimator",
             None,
         ] = None,
-        X: Optional[
-        y: Optional[
-        groups: Optional[
-        cv: Optional[int] = None,
+        X: Optional[pd.DataFrame] = None,  # noqa: N803
+        y: Optional[pd.Series] = None,
+        groups: Optional[pd.Series] = None,
+        cv: Optional[Union[int, BaseCrossValidator]] = None,
     ) -> None:
         self._scores = scores
         self._model = model