julearn 0.3.2.dev24__tar.gz → 0.3.2.dev61__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/PKG-INFO +6 -1
- julearn-0.3.2.dev61/docs/changes/newsfragments/260.enh +1 -0
- julearn-0.3.2.dev61/docs/changes/newsfragments/260.misc +1 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/conf.py +1 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/getting_started.rst +5 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/links.inc +1 -0
- julearn-0.3.2.dev61/examples/03_complex_models/run_hyperparameter_tuning_bayessearch.py +95 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_hyperparameters_docs.py +123 -13
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/_version.py +2 -2
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/api.py +46 -24
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/base/estimators.py +26 -8
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/base/tests/test_base_estimators.py +1 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/conftest.py +134 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/inspect/_cv.py +16 -10
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/inspect/_preprocess.py +1 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/inspect/inspector.py +8 -5
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/inspect/tests/test_cv.py +4 -2
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/inspect/tests/test_inspector.py +12 -9
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/inspect/tests/test_pipeline.py +31 -18
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/inspect/tests/test_preprocess.py +8 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/model_selection/__init__.py +4 -0
- julearn-0.3.2.dev61/julearn/model_selection/_skopt_searcher.py +32 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/model_selection/available_searchers.py +69 -8
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/model_selection/stratified_bootstrap.py +7 -5
- julearn-0.3.2.dev61/julearn/model_selection/tests/test_available_searchers.py +83 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/models/dynamic.py +1 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/models/tests/test_models.py +6 -6
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/pipeline/merger.py +44 -35
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/pipeline/pipeline_creator.py +88 -21
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/pipeline/target_pipeline.py +1 -1
- {julearn-0.3.2.dev24/julearn/pipeline/test → julearn-0.3.2.dev61/julearn/pipeline/tests}/test_merger.py +27 -15
- {julearn-0.3.2.dev24/julearn/pipeline/test → julearn-0.3.2.dev61/julearn/pipeline/tests}/test_pipeline_creator.py +231 -8
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/prepare.py +2 -4
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/scoring/available_scorers.py +12 -6
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/stats/corrected_ttest.py +8 -4
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/stats/tests/test_corrected_ttest.py +22 -5
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/tests/test_api.py +103 -45
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/confound_remover.py +2 -2
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/dataframe/tests/test_drop_columns.py +1 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/dataframe/tests/test_filter_columns.py +1 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/dataframe/tests/test_set_column_types.py +10 -6
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/ju_column_transformer.py +1 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/target/ju_transformed_target_model.py +1 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/target/target_confound_remover.py +1 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/target/tests/test_ju_transformed_target_model.py +3 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/tests/test_cbpm.py +3 -3
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/tests/test_confounds.py +17 -6
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/tests/test_jucolumntransformers.py +8 -6
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/utils/checks.py +3 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/utils/logging.py +1 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/utils/testing.py +21 -11
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/utils/typing.py +39 -19
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/viz/_scores.py +5 -4
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn.egg-info/PKG-INFO +6 -1
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn.egg-info/SOURCES.txt +8 -4
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn.egg-info/requires.txt +7 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/pyproject.toml +25 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/tox.ini +15 -4
- julearn-0.3.2.dev24/julearn/model_selection/tests/test_available_searchers.py +0 -44
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/ISSUE_TEMPLATE/documentation_request.yaml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/workflows/check-stale.yml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/workflows/ci-docs.yml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/workflows/ci.yml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/workflows/docs-preview.yml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/workflows/docs.yml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/workflows/lint.yml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.github/workflows/pypi.yml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.gitignore +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/.pre-commit-config.yaml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/AUTHORS.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/LICENSE.md +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/README.md +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/codecov.yml +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/Makefile +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/_static/css/custom.css +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/_static/js/custom.js +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/_templates/class.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/_templates/function.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/_templates/function_warning.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/_templates/versions.html +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/base.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/index.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/inspect.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/main.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/model_selection.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/models.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/pipeline.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/prepare.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/scoring.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/stats.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/transformers.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/utils.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/api/viz.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/available_pipeline_steps.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/changes/contributors.inc +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/changes/newsfragments/.gitignore +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/changes/newsfragments/224.misc +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/changes/newsfragments/244.misc +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/changes/newsfragments/249.bugfix +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/changes/newsfragments/251.misc +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/changes/newsfragments/255.bugfix +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/configuration.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/contributing.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/examples.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/faq.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/corrected_ttest.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/final_estimator.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/iris_X.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/iris_df.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/iris_y.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/julearn_logo.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/julearn_logo_calm.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/julearn_logo_confbias.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/julearn_logo_cv.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/julearn_logo_generalization.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/julearn_logo_it.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/julearn_logo_ml.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/julearn_logo_mlit.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/multiple_scorers_run_cv.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/plot_scores.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/scores_run_cv.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/scores_run_cv_splitter.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/images/scores_run_cv_train.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/index.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/maintaining.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/redirect.html +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/selected_deeper_topics/CBPM.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/selected_deeper_topics/confound_removal.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/selected_deeper_topics/cross_validation_splitter.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/selected_deeper_topics/hyperparameter_tuning.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/selected_deeper_topics/index.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/selected_deeper_topics/model_inspect.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/selected_deeper_topics/stacked_models.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/selected_deeper_topics/target_transformers.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/sphinxext/gh_substitutions.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/what_really_need_know/cross_validation.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/what_really_need_know/data.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/what_really_need_know/index.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/what_really_need_know/model_comparison.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/what_really_need_know/model_evaluation.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/what_really_need_know/pipeline.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/whats_new.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/00_starting/README.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/00_starting/plot_cm_acc_multiclass.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/00_starting/plot_example_regression.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/00_starting/plot_stratified_kfold_reg.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/00_starting/run_combine_pandas.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/00_starting/run_grouped_cv.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/00_starting/run_simple_binary_classification.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/01_model_comparison/README.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/01_model_comparison/plot_simple_model_comparison.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/02_inspection/README.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/02_inspection/plot_groupcv_inspect_svm.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/02_inspection/plot_inspect_random_forest.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/02_inspection/plot_preprocess.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/02_inspection/run_binary_inspect_folds.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/03_complex_models/README.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/03_complex_models/run_apply_to_target.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/03_complex_models/run_example_pca_featsets.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/03_complex_models/run_hyperparameter_multiple_grids.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/03_complex_models/run_hyperparameter_tuning.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/03_complex_models/run_stacked_models.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/04_confounds/README.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/04_confounds/plot_confound_removal_classification.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/04_confounds/run_return_confounds.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/05_customization/README.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/05_customization/run_custom_scorers_regression.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/README.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_cbpm_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_confound_removal_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_cv_splitters_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_data_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_model_comparison_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_model_evaluation_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_model_inspection_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_pipeline_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_stacked_models_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_target_transformer_docs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/README.rst +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/XX_disabled/dis_run_n_jobs.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/XX_disabled/dis_run_target_confound_removal.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/ignore_words.txt +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/base/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/base/column_types.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/base/tests/test_column_types.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/config.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/inspect/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/inspect/_pipeline.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/model_selection/continuous_stratified_kfold.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/model_selection/tests/test_continous_stratified_kfold.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/model_selection/tests/test_stratified_bootstrap.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/models/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/models/available_models.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/models/tests/test_available_models.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/models/tests/test_dynamic.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/pipeline/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/pipeline/target_pipeline_creator.py +0 -0
- {julearn-0.3.2.dev24/julearn/pipeline/test → julearn-0.3.2.dev61/julearn/pipeline/tests}/test_target_pipeline.py +0 -0
- {julearn-0.3.2.dev24/julearn/pipeline/test → julearn-0.3.2.dev61/julearn/pipeline/tests}/test_target_pipeline_creator.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/scoring/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/scoring/metrics.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/scoring/tests/test_available_scorers.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/scoring/tests/test_metrics.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/stats/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/tests/test_config.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/tests/test_prepare.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/available_transformers.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/cbpm.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/dataframe/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/dataframe/change_column_types.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/dataframe/drop_columns.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/dataframe/filter_columns.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/dataframe/set_column_types.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/dataframe/tests/test_change_column_types.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/target/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/target/available_target_transformers.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/target/ju_target_transformer.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/target/tests/test_available_target_transformers.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/target/tests/test_ju_target_transformer.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/target/tests/test_target_confound_remover.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/transformers/tests/test_available_transformers.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/utils/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/utils/_cv.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/utils/tests/test_logging.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/utils/tests/test_version.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/utils/versions.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/viz/__init__.py +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/viz/res/julearn_logo_generalization.png +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn.egg-info/dependency_links.txt +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn.egg-info/top_level.txt +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/setup.cfg +0 -0
- {julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/setup.py +0 -0
{julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: julearn
-Version: 0.3.2.dev24
+Version: 0.3.2.dev61
 Summary: Juelich Machine Learning Library
 Author-email: Fede Raimondo <f.raimondo@fz-juelich.de>, Sami Hamdan <s.hamdan@fz-juelich.de>
 Maintainer-email: Sami Hamdan <s.hamdan@fz-juelich.de>
@@ -40,12 +40,17 @@ Requires-Dist: furo<2024.0.0,>=2022.9.29; extra == "docs"
 Requires-Dist: sphinx_copybutton<0.6,>=0.5.0; extra == "docs"
 Requires-Dist: numpydoc<1.6,>=1.5.0; extra == "docs"
 Requires-Dist: towncrier<24; extra == "docs"
+Requires-Dist: scikit-optimize<0.11,>=0.10.0; extra == "docs"
 Provides-Extra: deslib
 Requires-Dist: deslib<0.4,>=0.3.5; extra == "deslib"
 Provides-Extra: viz
 Requires-Dist: panel>=1.3.0; extra == "viz"
 Requires-Dist: bokeh>=3.0.0; extra == "viz"
 Requires-Dist: param>=2.0.0; extra == "viz"
+Provides-Extra: skopt
+Requires-Dist: scikit-optimize<0.11,>=0.10.0; extra == "skopt"
+Provides-Extra: all
+Requires-Dist: julearn[skopt,viz]; extra == "all"
 
 # julearn
 

julearn-0.3.2.dev61/docs/changes/newsfragments/260.enh

@@ -0,0 +1 @@
+Add :class:`~skopt.BayesSearchCV` to the list of available searchers as 'bayes' by `Fede Raimondo`_

julearn-0.3.2.dev61/docs/changes/newsfragments/260.misc

@@ -0,0 +1 @@
+Add ``all`` as optional dependencies to install all functional dependencies by `Fede Raimondo`_

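Taken together, the two newsfragments above describe the user-facing change: a new ``"bayes"`` searcher backed by scikit-optimize, plus an ``all`` extra that pulls in the optional functional dependencies. A minimal usage sketch, assembled from the example file added later in this diff (dataset preparation and column names are copied from that example; scikit-optimize must be installed for the ``"bayes"`` kind):

from seaborn import load_dataset

from julearn import run_cross_validation
from julearn.pipeline import PipelineCreator

# Prepare the fmri dataset exactly as the new example in this diff does.
df_fmri = load_dataset("fmri")
df_fmri = df_fmri.pivot(
    index=["subject", "timepoint", "event"], columns="region", values="signal"
).reset_index()

creator = PipelineCreator(problem_type="classification")
creator.add("zscore")
creator.add("svm", C=(1e-6, 1e3, "log-uniform"))  # skopt-style search-space tuple

# "kind": "bayes" selects the new BayesSearchCV-backed searcher.
scores = run_cross_validation(
    X=["frontal", "parietal"],
    y="event",
    data=df_fmri,
    model=creator,
    search_params={"kind": "bayes", "n_iter": 10, "cv": 2},
)
print(scores["test_score"].mean())
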
{julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/conf.py

@@ -160,6 +160,7 @@ intersphinx_mapping = {
     # "sqlalchemy": ("https://docs.sqlalchemy.org/en/20/", None),
     "joblib": ("https://joblib.readthedocs.io/en/latest/", None),
     "scipy": ("https://docs.scipy.org/doc/scipy/", None),
+    "skopt": ("https://scikit-optimize.readthedocs.io/en/latest", None),
 }
 
 

{julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/docs/getting_started.rst

@@ -86,4 +86,8 @@ The following optional dependencies are available:
 
 * ``viz``: Visualization tools for ``julearn``. This includes the
   :mod:`.viz` module.
-* ``deslib``: The :mod:`.dynamic` module requires the `deslib`_ package.
+* ``deslib``: The :mod:`.dynamic` module requires the `deslib`_ package. This
+  module is not compatible with newer Python versions and it is unmaintained.
+* ``skopt``: Using the ``"bayes"`` searcher (:class:`~skopt.BayesSearchCV`)
+  requires the `scikit-optimize`_ package.
+* ``all``: Install all optional functional dependencies (except ``deslib``).

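Since the ``"bayes"`` searcher only works when scikit-optimize is installed (the ``skopt`` or ``all`` extras), a defensive sketch is to probe for the package before requesting it; the fallback to ``"random"`` below is illustrative, not behavior julearn provides on its own:

# Assumed pattern: pick the searcher depending on whether skopt is importable.
try:
    import skopt  # noqa: F401  # provided by the "skopt" / "all" extras

    search_params = {"kind": "bayes", "n_iter": 10}
except ImportError:
    search_params = {"kind": "random", "n_iter": 10}
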
julearn-0.3.2.dev61/examples/03_complex_models/run_hyperparameter_tuning_bayessearch.py

@@ -0,0 +1,95 @@
+"""
+Tuning Hyperparameters using Bayesian Search
+============================================
+
+This example uses the ``fmri`` dataset, performs simple binary classification
+using a Support Vector Machine classifier and analyzes the model.
+
+References
+----------
+
+Waskom, M.L., Frank, M.C., Wagner, A.D. (2016). Adaptive engagement of
+cognitive control in context-dependent decision-making. Cerebral Cortex.
+
+.. include:: ../../links.inc
+"""
+
+# Authors: Federico Raimondo <f.raimondo@fz-juelich.de>
+# License: AGPL
+
+import numpy as np
+from seaborn import load_dataset
+
+from julearn import run_cross_validation
+from julearn.utils import configure_logging, logger
+from julearn.pipeline import PipelineCreator
+
+
+###############################################################################
+# Set the logging level to info to see extra information.
+configure_logging(level="INFO")
+
+###############################################################################
+# Set the random seed to always have the same example.
+np.random.seed(42)
+
+###############################################################################
+# Load the dataset.
+df_fmri = load_dataset("fmri")
+df_fmri.head()
+
+###############################################################################
+# Set the dataframe in the right format.
+df_fmri = df_fmri.pivot(
+    index=["subject", "timepoint", "event"], columns="region", values="signal"
+)
+
+df_fmri = df_fmri.reset_index()
+df_fmri.head()
+
+###############################################################################
+# Following the hyperparamter tuning example, we will now use a Bayesian
+# search to find the best hyperparameters for the SVM model.
+X = ["frontal", "parietal"]
+y = "event"
+
+creator1 = PipelineCreator(problem_type="classification")
+creator1.add("zscore")
+creator1.add(
+    "svm",
+    kernel=["linear"],
+    C=(1e-6, 1e3, "log-uniform"),
+)
+
+creator2 = PipelineCreator(problem_type="classification")
+creator2.add("zscore")
+creator2.add(
+    "svm",
+    kernel=["rbf"],
+    C=(1e-6, 1e3, "log-uniform"),
+    gamma=(1e-6, 1e1, "log-uniform"),
+)
+
+search_params = {
+    "kind": "bayes",
+    "cv": 2,  # to speed up the example
+    "n_iter": 10,  # 10 iterations of bayesian search to speed up example
+}
+
+
+scores, estimator = run_cross_validation(
+    X=X,
+    y=y,
+    data=df_fmri,
+    model=[creator1, creator2],
+    cv=2,  # to speed up the example
+    search_params=search_params,
+    return_estimator="final",
+)
+
+print(scores["test_score"].mean())
+
+
+###############################################################################
+# It seems that we might have found a better model, but which one is it?
+print(estimator.best_params_)

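The ``(low, high, "log-uniform")`` tuples in this example are the tuple form of scikit-optimize search dimensions; BayesSearchCV accepts them directly, and the explicit equivalent is sketched below (the ``svm__`` parameter prefix for the julearn step name is an assumption about the pipeline's internal naming):

from skopt.space import Categorical, Real

# Explicit scikit-optimize search space equivalent to creator2 above.
search_space = {
    "svm__kernel": Categorical(["rbf"]),
    "svm__C": Real(1e-6, 1e3, prior="log-uniform"),
    "svm__gamma": Real(1e-6, 1e1, prior="log-uniform"),
}
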
{julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/examples/99_docs/run_hyperparameters_docs.py

@@ -243,22 +243,132 @@ pprint(model_tuned.best_params_)
 # tries to find the best combination of values for the hyperparameters using
 # cross-validation.
 #
-# By default, ``julearn`` uses a
-#
-#
-#
-#
-#
+# By default, ``julearn`` uses a
+# :class:`~sklearn.model_selection.GridSearchCV`.
+# This searcher, specified as ``"grid"`` is very simple. First, it constructs
+# the _grid_ of hyperparameters to try. As we see above, we have 3
+# hyperparameters to tune. So it constructs a 3-dimentional grid with all the
+# possible combinations of the hyperparameters values. The second step is to
+# perform cross-validation on each of the possible combinations of
+# hyperparameters values.
 #
-#
-# :class:`~sklearn.model_selection.RandomizedSearchCV
-#
-#
+# Other searchers that ``julearn`` provides are the
+# :class:`~sklearn.model_selection.RandomizedSearchCV` and
+# :class:`~skopt.BayesSearchCV`.
+#
+# The randomized searcher
+# (:class:`~sklearn.model_selection.RandomizedSearchCV`) is similar to the
+# :class:`~sklearn.model_selection.GridSearchCV`, but instead
+# of trying all the possible combinations of hyperparameter values, it tries
 # a random subset of them. This is useful when we have a lot of hyperparameters
-# to tune, since it can be very time consuming to try all the possible
-# as continuous parameters that can be sampled out of a
-# more information, see the
+# to tune, since it can be very time consuming to try all the possible
+# combinations, as well as continuous parameters that can be sampled out of a
+# distribution. For more information, see the
 # :class:`~sklearn.model_selection.RandomizedSearchCV` documentation.
+#
+# The Bayesian searcher (:class:`~skopt.BayesSearchCV`) is a bit more
+# complex. It uses Bayesian optimization to find the best hyperparameter set.
+# As with the randomized search, it is useful when we have many
+# hyperparameters to tune, and we don't want to try all the possible
+# combinations due to computational constraints. For more information, see the
+# :class:`~skopt.BayesSearchCV` documentation, including how to specify
+# the prior distributions of the hyperparameters.
+#
+# We can specify the kind of searcher and its parametrization, by setting the
+# ``search_params`` parameter in the :func:`.run_cross_validation` function.
+# For example, we can use the
+# :class:`~sklearn.model_selection.RandomizedSearchCV` searcher with
+# 10 iterations of random search.
+
+search_params = {
+    "kind": "random",
+    "n_iter": 10,
+}
+
+scores_tuned, model_tuned = run_cross_validation(
+    X=X,
+    y=y,
+    data=df,
+    X_types=X_types,
+    model=creator,
+    return_estimator="all",
+    search_params=search_params,
+)
+
+print(
+    "Scores with best hyperparameter using 10 iterations of "
+    f"randomized search: {scores_tuned['test_score'].mean()}"
+)
+pprint(model_tuned.best_params_)
+
+###############################################################################
+# We can now see that the best hyperparameter might be different from the grid
+# search. This is because it tried only 10 combinations and not the whole grid.
+# Furthermore, the :class:`~sklearn.model_selection.RandomizedSearchCV`
+# searcher can sample hyperparameters from distributions, which can be useful
+# when we have continuous hyperparameters.
+# Let's set both ``C`` and ``gamma`` to be sampled from log-uniform
+# distributions. We can do this by setting the hyperparameter values as a
+# tuple with the following format: ``(low, high, distribution)``. The
+# distribution can be either ``"log-uniform"`` or ``"uniform"``.
+
+creator = PipelineCreator(problem_type="classification")
+creator.add("zscore")
+creator.add("select_k", k=[2, 3, 4])
+creator.add(
+    "svm",
+    C=(0.01, 10, "log-uniform"),
+    gamma=(1e-3, 1e-1, "log-uniform"),
+)
+
+print(creator)
+
+scores_tuned, model_tuned = run_cross_validation(
+    X=X,
+    y=y,
+    data=df,
+    X_types=X_types,
+    model=creator,
+    return_estimator="all",
+    search_params=search_params,
+)
+
+print(
+    "Scores with best hyperparameter using 10 iterations of "
+    f"randomized search: {scores_tuned['test_score'].mean()}"
+)
+pprint(model_tuned.best_params_)
+
+
+###############################################################################
+# We can also control the number of cross-validation folds used by the searcher
+# by setting the ``cv`` parameter in the ``search_params`` dictionary. For
+# example, we can use a bayesian search with 3 folds. Fortunately, the
+# :class:`~skopt.BayesSearchCV` searcher also accepts distributions for the
+# hyperparameters.
+
+search_params = {
+    "kind": "bayes",
+    "n_iter": 10,
+    "cv": 3,
+}
+
+scores_tuned, model_tuned = run_cross_validation(
+    X=X,
+    y=y,
+    data=df,
+    X_types=X_types,
+    model=creator,
+    return_estimator="all",
+    search_params=search_params,
+)
+
+print(
+    "Scores with best hyperparameter using 10 iterations of "
+    f"bayesian search and 3-fold CV: {scores_tuned['test_score'].mean()}"
+)
+pprint(model_tuned.best_params_)
+
 
 ###############################################################################
 #

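For readers who want to see what the three searchers described in this documentation page correspond to on the scikit-learn/scikit-optimize side, here is a rough, self-contained sketch; the exact pipeline julearn assembles internally is not shown in this diff, so the wiring below is an assumption, while the searcher APIs themselves are the standard ones:

from scipy.stats import loguniform
from skopt import BayesSearchCV  # needs the "skopt" / "all" extras
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = Pipeline([("zscore", StandardScaler()), ("svm", SVC())])

# "grid": cross-validates every combination of the listed values.
grid_search = GridSearchCV(pipe, {"svm__C": [0.01, 0.1, 1, 10]}, cv=3)

# "random": draws n_iter samples, optionally from continuous distributions.
random_search = RandomizedSearchCV(
    pipe,
    {"svm__C": loguniform(0.01, 10), "svm__gamma": loguniform(1e-3, 1e-1)},
    n_iter=10,
    cv=3,
)

# "bayes": models the score surface and proposes the next set to evaluate;
# accepts the same (low, high, "log-uniform") tuple notation used above.
bayes_search = BayesSearchCV(
    pipe,
    {"svm__C": (0.01, 10.0, "log-uniform"), "svm__gamma": (1e-3, 1e-1, "log-uniform")},
    n_iter=10,
    cv=3,
)
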
{julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/_version.py

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.3.2.dev24'
-__version_tuple__ = version_tuple = (0, 3, 2, 'dev24')
+__version__ = version = '0.3.2.dev61'
+__version_tuple__ = version_tuple = (0, 3, 2, 'dev61')

{julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/api.py

@@ -8,8 +8,12 @@ from typing import Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
+import sklearn
 from sklearn.base import BaseEstimator
-from sklearn.model_selection import
+from sklearn.model_selection import (
+    check_cv,
+    cross_validate,
+)
 from sklearn.model_selection._search import BaseSearchCV
 from sklearn.pipeline import Pipeline
 
@@ -19,20 +23,21 @@ from .pipeline.merger import merge_pipelines
 from .prepare import check_consistency, prepare_input_data
 from .scoring import check_scoring
 from .utils import _compute_cvmdsum, logger, raise_error
+from .utils.typing import CVLike
 
 
 def run_cross_validation(  # noqa: C901
     X: List[str],  # noqa: N803
     y: str,
     model: Union[str, PipelineCreator, BaseEstimator, List[PipelineCreator]],
+    data: pd.DataFrame,
     X_types: Optional[Dict] = None,  # noqa: N803
-    data: Optional[pd.DataFrame] = None,
     problem_type: Optional[str] = None,
     preprocess: Union[None, str, List[str]] = None,
     return_estimator: Optional[str] = None,
     return_inspector: bool = False,
     return_train_score: bool = False,
-    cv: Optional[
+    cv: Optional[CVLike] = None,
     groups: Optional[str] = None,
     scoring: Union[str, List[str], None] = None,
     pos_labels: Union[str, List[str], None] = None,
@@ -54,12 +59,11 @@ def run_cross_validation(  # noqa: C901
         See :ref:`data_usage` for details.
     model : str or scikit-learn compatible model.
         If string, it will use one of the available models.
+    data : pandas.DataFrame
+        DataFrame with the data. See :ref:`data_usage` for details.
     X_types : dict[str, list of str]
         A dictionary containing keys with column type as a str and the
        columns of this column type as a list of str.
-    data : pandas.DataFrame | None
-        DataFrame with the data (optional).
-        See :ref:`data_usage` for details.
     problem_type : str
         The kind of problem to model.
 
@@ -132,8 +136,8 @@ def run_cross_validation(  # noqa: C901
         the following keys:
 
         * 'kind': The kind of search algorithm to use, e.g.:
-          'grid' or '
-          scikit-learn compatible searcher.
+          'grid', 'random' or 'bayes'. Can be any valid julearn searcher name
+          or scikit-learn compatible searcher.
         * 'cv': If a searcher is going to be used, the cross-validation
          splitting strategy to use. Defaults to same CV as for the model
          evaluation.
@@ -196,7 +200,7 @@ def run_cross_validation(  # noqa: C901
     np.random.seed(seed)
 
     # Interpret the input data and prepare it to be used with the library
-    df_X,
+    df_X, df_y, df_groups, X_types = prepare_input_data(
         X=X,
         y=y,
         df=data,
@@ -267,7 +271,7 @@ def run_cross_validation(  # noqa: C901
 
     if has_target_transformer:
         if isinstance(pipeline, BaseSearchCV):
-            last_step = pipeline.estimator[-1]
+            last_step = pipeline.estimator[-1]  # type: ignore
         else:
             last_step = pipeline[-1]
         if not last_step.can_inverse_transform():
@@ -313,7 +317,7 @@ def run_cross_validation(  # noqa: C901
             "Cannot use model_params with a model object. Use either "
             "a string or a PipelineCreator"
         )
-        pipeline_creator.add(step=model, **t_params)
+        pipeline_creator.add(step=model, **t_params)  # type: ignore
 
     # Check for extra model_params that are not used
     unused_params = []
@@ -346,38 +350,52 @@ def run_cross_validation(  # noqa: C901
     logger.info("")
 
     if problem_type == "classification":
-        logger.info(f"\tNumber of classes: {len(np.unique(
-        logger.info(f"\tTarget type: {
-        logger.info(f"\tClass distributions: {
+        logger.info(f"\tNumber of classes: {len(np.unique(df_y))}")
+        logger.info(f"\tTarget type: {df_y.dtype}")
+        logger.info(f"\tClass distributions: {df_y.value_counts()}")
     elif problem_type == "regression":
-        logger.info(f"\tTarget type: {
+        logger.info(f"\tTarget type: {df_y.dtype}")
 
     # Prepare cross validation
-    cv_outer = check_cv(
+    cv_outer = check_cv(
+        cv,  # type: ignore
+        classifier=problem_type == "classification",
+    )
     logger.info(f"Using outer CV scheme {cv_outer}")
 
-    check_consistency(
+    check_consistency(df_y, cv, groups, problem_type)  # type: ignore
 
     cv_return_estimator = return_estimator in ["cv", "all"]
-    scoring = check_scoring(
+    scoring = check_scoring(
+        pipeline,  # type: ignore
+        scoring,
+        wrap_score=wrap_score,
+    )
 
     cv_mdsum = _compute_cvmdsum(cv_outer)
     fit_params = {}
     if df_groups is not None:
         if isinstance(pipeline, BaseSearchCV):
             fit_params["groups"] = df_groups.values
+
+    _sklearn_deprec_fit_params = {}
+    if sklearn.__version__ >= "1.4.0":
+        _sklearn_deprec_fit_params["params"] = fit_params
+    else:
+        _sklearn_deprec_fit_params["fit_params"] = fit_params
+
     scores = cross_validate(
         pipeline,
         df_X,
-
+        df_y,
         cv=cv_outer,
         scoring=scoring,
         groups=df_groups,
         return_estimator=cv_return_estimator,
         n_jobs=n_jobs,
         return_train_score=return_train_score,
-        verbose=verbose,
-
+        verbose=verbose,  # type: ignore
+        **_sklearn_deprec_fit_params,
     )
 
     n_repeats = getattr(cv_outer, "n_repeats", 1)
@@ -387,7 +405,10 @@ def run_cross_validation(  # noqa: C901
     folds = np.tile(np.arange(n_folds), n_repeats)
 
     fold_sizes = np.array(
-        [
+        [
+            list(map(len, x))
+            for x in cv_outer.split(df_X, df_y, groups=df_groups)
+        ]
     )
     scores["n_train"] = fold_sizes[:, 0]
     scores["n_test"] = fold_sizes[:, 1]
@@ -398,7 +419,8 @@ def run_cross_validation(  # noqa: C901
     scores_df = pd.DataFrame(scores)
     out = scores_df
     if return_estimator in ["final", "all"]:
-
+        logger.info("Fitting final model")
+        pipeline.fit(df_X, df_y, **fit_params)
         out = scores_df, pipeline
 
     if return_inspector:
@@ -406,7 +428,7 @@ def run_cross_validation(  # noqa: C901
         scores=scores_df,
         model=pipeline,
         X=df_X,
-        y=
+        y=df_y,
         groups=df_groups,
         cv=cv_outer,
     )

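One practical consequence of the api.py hunks above: ``data`` is now a required parameter placed before ``X_types``, so keyword-based calls keep working unchanged, while anything relying on the old positional order would break. A toy, keyword-only call for illustration (column names and data are hypothetical):

import pandas as pd

from julearn import run_cross_validation

df = pd.DataFrame(
    {
        "feat_1": [0.1, 0.3, 0.2, 0.8, 0.7, 0.9],
        "feat_2": [1.0, 0.9, 1.1, 0.2, 0.1, 0.3],
        "target": ["a", "a", "a", "b", "b", "b"],
    }
)

scores = run_cross_validation(
    X=["feat_1", "feat_2"],
    y="target",
    data=df,  # required now; no longer Optional[pd.DataFrame]
    X_types={"continuous": ["feat_1", "feat_2"]},
    model="svm",
    problem_type="classification",
    cv=3,
)
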
{julearn-0.3.2.dev24 → julearn-0.3.2.dev61}/julearn/base/estimators.py

@@ -13,11 +13,11 @@ from sklearn.utils.metaestimators import available_if
 
 
 try:  # sklearn < 1.4.0
-    from sklearn.utils.validation import _check_fit_params
+    from sklearn.utils.validation import _check_fit_params  # type: ignore
 
     fit_params_checker = _check_fit_params
 except ImportError:  # sklearn >= 1.4.0
-    from sklearn.utils.validation import _check_method_params
+    from sklearn.utils.validation import _check_method_params  # type: ignore
 
     fit_params_checker = _check_method_params
 
@@ -180,7 +180,12 @@ class JuTransformer(JuBaseEstimator, TransformerMixin):
         self.row_select_col_type = row_select_col_type
         self.row_select_vals = row_select_vals
 
-    def fit(
+    def fit(
+        self,
+        X: pd.DataFrame,  # noqa: N803
+        y: Optional[pd.Series] = None,
+        **fit_params,
+    ):
         """Fit the model.
 
         This method will fit the model using only the columns selected by
@@ -217,8 +222,21 @@ class JuTransformer(JuBaseEstimator, TransformerMixin):
             self.row_select_vals = [self.row_select_vals]
         return self._fit(**self._select_rows(X, y, **fit_params))
 
+    def _fit(
+        self,
+        X: pd.DataFrame,  # noqa: N803,
+        y: Optional[pd.Series],
+        **kwargs,
+    ) -> None:
+        raise_error(
+            "This method should be implemented in the concrete class",
+            klass=NotImplementedError,
+        )
+
     def _add_backed_filtered(
-        self,
+        self,
+        X: pd.DataFrame,  # noqa: N803
+        X_trans: pd.DataFrame,  # noqa: N803
     ) -> pd.DataFrame:
         """Add the left-out columns back to the transformed data.
 
@@ -301,7 +319,7 @@ class WrapModel(JuBaseEstimator):
 
     def fit(
         self,
-        X:
+        X: DataLike,  # noqa: N803
         y: Optional[DataLike] = None,
         **fit_params: Any,
     ) -> "WrapModel":
@@ -312,7 +330,7 @@ class WrapModel(JuBaseEstimator):
 
         Parameters
         ----------
-        X :
+        X : DataLike
             The data to fit the model on.
         y : DataLike, optional
             The target data (default is None).
@@ -329,9 +347,9 @@ class WrapModel(JuBaseEstimator):
         if self.needed_types is not None:
             self.needed_types = ensure_column_types(self.needed_types)
 
-        Xt = self.filter_columns(X)
+        Xt = self.filter_columns(X)  # type: ignore
         self.model_ = self.model
-        self.model_.fit(Xt, y, **fit_params)
+        self.model_.fit(Xt, y, **fit_params)  # type: ignore
         return self
 
     def predict(self, X: pd.DataFrame) -> DataLike:  # noqa: N803

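Finally, the estimators.py hunks route ``JuTransformer.fit`` through a ``_fit`` hook that the base class now leaves unimplemented (it raises ``NotImplementedError``). A hypothetical subclass sketch of that contract, assuming ``JuTransformer`` is importable from ``julearn.base`` as in the package's tests:

from typing import Optional

import pandas as pd

from julearn.base import JuTransformer


class MeanCenterer(JuTransformer):
    """Toy transformer: subtracts the per-column mean seen during fit."""

    def _fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None, **kwargs):
        # Called by the base ``fit`` after row selection; store fitted state.
        self.means_ = X.mean()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        return X - self.means_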