PyPI - panelsplit - Versions diffs - 2.0.4.dev0__tar.gz → 2.0.5__tar.gz - Mend

panelsplit 2.0.4.dev0__tar.gz → 2.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{panelsplit-2.0.4.dev0 → panelsplit-2.0.5}/.github/workflows/ci.yml RENAMED Viewed

@@ -14,7 +14,7 @@ jobs:
       strategy:
         matrix:
-          python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+          python-version: ["3.11", "3.12", "3.13", "3.14"]
         fail-fast: true
       steps:

{panelsplit-2.0.4.dev0 → panelsplit-2.0.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: panelsplit
-Version: 2.0.4.dev0
+Version: 2.0.5
 Summary: A tool for panel data analysis.
 Project-URL: Homepage, https://github.com/4Freye/panelsplit
 Project-URL: Repository, https://github.com/4Freye/panelsplit
@@ -11,13 +11,13 @@ License-File: LICENSE
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.10
+Requires-Python: >=3.11
 Requires-Dist: joblib>=1.0.1
 Requires-Dist: matplotlib>=3.4.3
 Requires-Dist: narwhals>=1.42.1
 Requires-Dist: numpy>=1.21.0
 Requires-Dist: pandas>=1.3.0
-Requires-Dist: scikit-learn>=0.24.2
+Requires-Dist: scikit-learn>=1.8.0
 Requires-Dist: scipy>=1.10.1
 Requires-Dist: tqdm>=4.67.1
 Requires-Dist: typing-extensions>=4.13.2
@@ -32,7 +32,7 @@ panelsplit is a Python package designed to facilitate time series cross-validati
 ## Installation
-panelsplit is tested for compatibility with python versions >= 3.10. You can install panelsplit using pip:
+panelsplit is tested for compatibility with python versions >= 3.11. You can install panelsplit using pip:
 ```bash
 pip install panelsplit

{panelsplit-2.0.4.dev0 → panelsplit-2.0.5}/README.md RENAMED Viewed

@@ -7,7 +7,7 @@ panelsplit is a Python package designed to facilitate time series cross-validati
 ## Installation
-panelsplit is tested for compatibility with python versions >= 3.10. You can install panelsplit using pip:
+panelsplit is tested for compatibility with python versions >= 3.11. You can install panelsplit using pip:
 ```bash
 pip install panelsplit

{panelsplit-2.0.4.dev0 → panelsplit-2.0.5}/panelsplit/metrics.py RENAMED Viewed

@@ -1,37 +1,42 @@
-from .utils.validation import _safe_indexing
+"""
+Metrics that are equivalent to their sklearn counterparts, except for the fact that they work with SequentialCVPipeline.
+"""
+# Standard library
+import warnings
 from inspect import signature
 from collections.abc import Iterable
 from functools import partial
-from sklearn.metrics._scorer import _MultimetricScorer
-from sklearn.utils._param_validation import (
-    validate_params,
-)
-from sklearn.metrics._scorer import _PassthroughScorer, _get_response_method_name
 from copy import deepcopy
-from sklearn.utils.validation import _check_response_method
-import warnings
-from sklearn.base import is_regressor
-from panelsplit.utils._response import _get_response_values
-from sklearn.utils.metadata_routing import (
-    _MetadataRequester,
-    _raise_for_params,
-    _routing_enabled,
-    MetadataRequest,
-)
-from .utils.typing import EstimatorLike, ArrayLike
-from numpy.typing import NDArray
 from typing import Callable, Optional, List, Union, Any, Dict
+# Third-party / typing
 from typing_extensions import Self
+from numpy.typing import NDArray
-# all the error scores:
+# Local package utilities
+from .utils.validation import _safe_indexing
+from .utils.typing import EstimatorLike, ArrayLike
+from panelsplit.utils._response import _get_response_values
+# sklearn public metrics (single consolidated import)
 from sklearn.metrics import (
     accuracy_score,
+    adjusted_mutual_info_score,
+    adjusted_rand_score,
     average_precision_score,
     balanced_accuracy_score,
     brier_score_loss,
     class_likelihood_ratios,
+    completeness_score,
     d2_absolute_error_score,
+    d2_brier_score,
+    d2_log_loss_score,
     explained_variance_score,
+    f1_score,
+    fowlkes_mallows_score,
+    jaccard_score,
+    homogeneity_score,
     log_loss,
     matthews_corrcoef,
     max_error,
@@ -42,24 +47,37 @@ from sklearn.metrics import (
     mean_squared_error,
     mean_squared_log_error,
     median_absolute_error,
+    mutual_info_score,
+    normalized_mutual_info_score,
+    precision_score,
+    rand_score,
     r2_score,
+    recall_score,
     roc_auc_score,
     root_mean_squared_error,
     root_mean_squared_log_error,
     top_k_accuracy_score,
-)
-from sklearn.metrics.cluster import (
-    adjusted_mutual_info_score,
-    adjusted_rand_score,
-    completeness_score,
-    fowlkes_mallows_score,
-    homogeneity_score,
-    mutual_info_score,
-    normalized_mutual_info_score,
-    rand_score,
     v_measure_score,
 )
+# sklearn internals / utilities (note: private APIs)
+from sklearn.metrics._scorer import (
+    _MultimetricScorer,
+    _PassthroughScorer,
+    _get_response_method_name,
+)
+from sklearn.utils._param_validation import validate_params
+from sklearn.utils.validation import _check_response_method
+from sklearn.base import is_regressor
+# metadata routing utilities (used by some sklearn internals)
+from sklearn.utils.metadata_routing import (
+    _MetadataRequester,
+    _raise_for_params,
+    _routing_enabled,
+    MetadataRequest,
+)
 def _get_idx_from_last_cv(estimator: EstimatorLike) -> Union[None, List[NDArray]]:
     """
@@ -88,14 +106,63 @@ def make_SequentialCV_scorer(
     greater_is_better: bool = True,
     **kwargs: Any,
 ) -> Callable[..., float]:
+    """
+    Make a SequentialCVPipeline-compatible scorer from a performance metric.
+    A scorer is a wrapper around an arbitrary metric or loss function that is called
+    with the signature `scorer(estimator, X, y_true, **kwargs)`.
+    The parameter `response_method` specifies which method of the estimator
+    should be used to feed the scoring/loss function.
+    Parameters
+    ----------
+    score_func : callable
+        Score function (or loss function) with signature
+        ``score_func(y, y_pred, **kwargs)``.
+    response_method : {"predict_proba", "decision_function", "predict"} or \
+            list/tuple of such str, default="predict"
+        Specifies the response method used to get predictions from an estimator
+        (i.e. :term:`predict_proba`, :term:`decision_function` or
+        :term:`predict`). Possible choices are:
+        - if `str`, it corresponds to the name of the method to return;
+        - if a list or tuple of `str`, it provides the method names in order of
+          preference. The method returned corresponds to the first method in
+          the list that is implemented by `estimator`.
+    greater_is_better : bool, default=True
+        Whether `score_func` is a score function (default), meaning high is
+        good, or a loss function, meaning low is good. In the latter case, the
+        scorer object will sign-flip the outcome of the `score_func`.
+    **kwargs : additional arguments
+        Additional parameters to be passed to `score_func`.
+    Returns
+    -------
+    Callable
+        Callable object that returns a scalar score; greater is better.
+    Examples
+    --------
+    >>> from panelsplit.metrics import make_SequentialCV_scorer
+    >>> from sklearn.metrics import brier_score_loss
+    >>> brier_loss_scorer= make_SequentialCV_scorer(brier_score_loss, response_method='predict_proba', greater_is_better=False)
+    >>> from panelsplit.pipeline import SequentialCVPipeline
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> from sklearn.datasets import load_iris
+    >>> X, y  = load_iris(return_X_y=True)
+    >>> p = SequentialCVPipeline(steps = [('rf', RandomForestClassifier())], cv_steps = [None])
+    >>> p.fit(X, y)
+    >>> brier_loss_scorer(p, X, y)
+    """
     sign = 1 if greater_is_better else -1
     if response_method is None:
-        warnings.warn(
-            "response_method=None is deprecated in version 1.6 and will be removed "
-            "in version 1.8. Leave it to its default value to avoid this warning.",
-            FutureWarning,
-        )
         response_method = "predict"
     elif response_method == "default":
         response_method = "predict"
@@ -158,7 +225,6 @@ class _BaseScorer(_MetadataRequester):
         self._sign = sign
         self._kwargs = kwargs
         self._response_method = response_method
-        # TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6)
         self._deprecation_msg = None
     def _get_pos_label(self) -> Optional[Any]:
@@ -170,7 +236,6 @@ class _BaseScorer(_MetadataRequester):
         return None
     def _accept_sample_weight(self) -> bool:
-        # TODO(slep006): remove when metadata routing is the only way
         return "sample_weight" in signature(self._score_func).parameters
     def __repr__(self) -> str:
@@ -217,7 +282,6 @@ class _BaseScorer(_MetadataRequester):
         float
             Score function applied to prediction of estimator on X.
         """
-        # TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6)
         if self._deprecation_msg is not None:
             warnings.warn(
                 self._deprecation_msg, category=DeprecationWarning, stacklevel=2
@@ -314,6 +378,7 @@ class _Scorer(_BaseScorer):
             X,
             pos_label=pos_label,
         )
         # make lookup dict for fast matching
         pred_dict = dict(zip(idx, y_pred))
@@ -340,6 +405,36 @@ class _Scorer(_BaseScorer):
     prefer_skip_nested_validation=True,
 )
 def get_scorer(scoring: Union[str, Callable]) -> Any:
+    """
+    Get a scorer from string.
+    `sklearn.metrics.get_scorer_names` can be used to retrieve the names
+    of all available scorers.
+    Parameters
+    ----------
+    scoring : str, callable or None
+        Scoring method as string. If callable it is returned as is.
+        If None, returns None.
+    Returns
+    -------
+    callable
+        The scorer.
+    Notes
+    -----
+    When passed a string, this function always returns a copy of the scorer
+    object. Calling `get_scorer` twice for the same scorer results in two
+    separate scorer objects.
+    Examples
+    --------
+    >>> from panelsplit.metrics import get_scorer
+    >>> accuracy = get_scorer("accuracy")
+    >>> accuracy(classifier, X, y)
+    """
     if isinstance(scoring, str):
         try:
             scorer = deepcopy(_SCORERS[scoring])
@@ -489,7 +584,11 @@ neg_mean_poisson_deviance_scorer = make_SequentialCV_scorer(
 neg_mean_gamma_deviance_scorer = make_SequentialCV_scorer(
     mean_gamma_deviance, greater_is_better=False
 )
+# D^2 scorers (fraction of absolute error / Brier score / log-loss explained)
 d2_absolute_error_scorer = make_SequentialCV_scorer(d2_absolute_error_score)
+d2_brier_scorer = make_SequentialCV_scorer(d2_brier_score)
+d2_log_loss_scorer = make_SequentialCV_scorer(d2_log_loss_score)
 # Standard Classification Scores
 accuracy_scorer = make_SequentialCV_scorer(accuracy_score)
@@ -583,6 +682,8 @@ _SCORERS = dict(
     neg_mean_poisson_deviance=neg_mean_poisson_deviance_scorer,
     neg_mean_gamma_deviance=neg_mean_gamma_deviance_scorer,
     d2_absolute_error_score=d2_absolute_error_scorer,
+    d2_brier_score=d2_brier_scorer,
+    d2_log_loss_score=d2_log_loss_scorer,
     accuracy=accuracy_scorer,
     top_k_accuracy=top_k_accuracy_scorer,
     roc_auc=roc_auc_scorer,
@@ -607,3 +708,17 @@ _SCORERS = dict(
     normalized_mutual_info_score=normalized_mutual_info_scorer,
     fowlkes_mallows_score=fowlkes_mallows_scorer,
 )
+for name, metric in [
+    ("precision", precision_score),
+    ("recall", recall_score),
+    ("f1", f1_score),
+    ("jaccard", jaccard_score),
+]:
+    _SCORERS[name] = make_SequentialCV_scorer(metric, average="binary")
+    for average in ["macro", "micro", "samples", "weighted"]:
+        qualified_name = "{0}_{1}".format(name, average)
+        _SCORERS[qualified_name] = make_SequentialCV_scorer(
+            metric, pos_label=None, average=average
+        )

{panelsplit-2.0.4.dev0 → panelsplit-2.0.5}/panelsplit/model_selection/model_selection.py RENAMED Viewed

@@ -970,8 +970,8 @@ class GridSearch(BaseSearch):
         If `scoring` represents a single score, one can use:
-        - a single string (see :ref:`scoring_string_names`);
-        - a callable (see :ref:`scoring_callable`) that returns a single value;
+        - a single string (see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-string-names);
+        - a callable (see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-callable) that returns a single value;
         - `None`, the `estimator`'s default evaluation criterion is used.
         If `scoring` represents multiple scores, one can use:
@@ -981,16 +981,13 @@ class GridSearch(BaseSearch):
           names and the values are the metric scores;
         - a dictionary with metric names as keys and callables as values.
-        See :ref:`multimetric_grid_search` for an example.
+        See https://scikit-learn.org/stable/modules/grid_search.html#multimetric-grid-search for an example.
     n_jobs : int, default=None
         Number of jobs to run in parallel.
         ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
         ``-1`` means using all processors.
-        .. versionchanged:: v0.20
-           `n_jobs` default changed from 1 to None
     refit : bool, str, or callable, default=True
         Refit an estimator using the best found parameters on the whole
         dataset.
@@ -1054,67 +1051,20 @@ class GridSearch(BaseSearch):
         expensive and is not strictly required to select the parameters that
         yield the best generalization performance.
-        .. versionadded:: 0.19
-        .. versionchanged:: 0.21
-            Default value was changed from ``True`` to ``False``
     Attributes
     ----------
     cv_results_ : dict of numpy (masked) ndarrays
         A dict with keys as column headers and values as columns, that can be
         imported into a pandas ``DataFrame``.
-        For instance the below given table
-        +------------+-----------+------------+-----------------+---+---------+
-        |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...|
-        +============+===========+============+=================+===+=========+
-        |  'poly'    |     --    |      2     |       0.80      |...|    2    |
-        +------------+-----------+------------+-----------------+---+---------+
-        |  'poly'    |     --    |      3     |       0.70      |...|    4    |
-        +------------+-----------+------------+-----------------+---+---------+
-        |  'rbf'     |     0.1   |     --     |       0.80      |...|    3    |
-        +------------+-----------+------------+-----------------+---+---------+
-        |  'rbf'     |     0.2   |     --     |       0.93      |...|    1    |
-        +------------+-----------+------------+-----------------+---+---------+
-        will be represented by a ``cv_results_`` dict of::
-            {
-            'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
-                                         mask = [False False False False]...)
-            'param_gamma': masked_array(data = [-- -- 0.1 0.2],
-                                        mask = [ True  True False False]...),
-            'param_degree': masked_array(data = [2.0 3.0 -- --],
-                                         mask = [False False  True  True]...),
-            'split0_test_score'  : [0.80, 0.70, 0.80, 0.93],
-            'split1_test_score'  : [0.82, 0.50, 0.70, 0.78],
-            'mean_test_score'    : [0.81, 0.60, 0.75, 0.85],
-            'std_test_score'     : [0.01, 0.10, 0.05, 0.08],
-            'rank_test_score'    : [2, 4, 3, 1],
-            'split0_train_score' : [0.80, 0.92, 0.70, 0.93],
-            'split1_train_score' : [0.82, 0.55, 0.70, 0.87],
-            'mean_train_score'   : [0.81, 0.74, 0.70, 0.90],
-            'std_train_score'    : [0.01, 0.19, 0.00, 0.03],
-            'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
-            'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
-            'mean_score_time'    : [0.01, 0.06, 0.04, 0.04],
-            'std_score_time'     : [0.00, 0.00, 0.00, 0.01],
-            'params'             : [{'kernel': 'poly', 'degree': 2}, ...],
-            }
         For an example of visualization and interpretation of GridSearch results,
-        see :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_stats.py`.
+        see https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_stats.html#sphx-glr-auto-examples-model-selection-plot-grid-search-stats-py.
         NOTE
         The key ``'params'`` is used to store a list of parameter
         settings dicts for all the parameter candidates.
-        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
-        ``std_score_time`` are all in seconds.
         For multi-metric evaluation, the scores for all the scorers are
         available in the ``cv_results_`` dict at the keys ending with that
         scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown
@@ -1167,8 +1117,6 @@ class GridSearch(BaseSearch):
         This is present only if ``refit`` is not False.
-        .. versionadded:: 0.20
     multimetric_ : bool
         Whether or not the scorers compute several metrics.
@@ -1182,16 +1130,12 @@ class GridSearch(BaseSearch):
         parameter for more details) and that `best_estimator_` exposes
         `n_features_in_` when fit.
-        .. versionadded:: 0.24
     feature_names_in_ : ndarray of shape (`n_features_in_`,)
         Names of features seen during :term:`fit`. Only defined if
         `best_estimator_` is defined (see the documentation for the `refit`
         parameter for more details) and that `best_estimator_` exposes
         `feature_names_in_` when fit.
-        .. versionadded:: 1.0
     See Also
     --------
     ParameterGrid : Generates all the combinations of a hyperparameter grid.
@@ -1226,11 +1170,11 @@ class GridSearch(BaseSearch):
     GridSearch(estimator=SVC(),
                  param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})
     >>> sorted(clf.cv_results_.keys())
-    ['mean_fit_time', 'mean_score_time', 'mean_test_score',...
+    ['mean_test_score',...
      'param_C', 'param_kernel', 'params',...
      'rank_test_score', 'split0_test_score',...
      'split2_test_score', ...
-     'std_fit_time', 'std_score_time', 'std_test_score']
+     'std_test_score']
     """
     _parameter_constraints: dict = {
@@ -1320,8 +1264,8 @@ class RandomizedSearch(BaseSearch):
         If `scoring` represents a single score, one can use:
-        - a single string (see :ref:`scoring_string_names`);
-        - a callable (see :ref:`scoring_callable`) that returns a single value;
+        - a single string (see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-string-names);
+        - a callable (see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-callable) that returns a single value;
         - `None`, the `estimator`'s default evaluation criterion is used.
         If `scoring` represents multiple scores, one can use:
@@ -1331,7 +1275,7 @@ class RandomizedSearch(BaseSearch):
           names and the values are the metric scores;
         - a dictionary with metric names as keys and callables as values.
-        See :ref:`multimetric_grid_search` for an example.
+        See https://scikit-learn.org/stable/modules/grid_search.html#multimetric-grid-search for an example.
         If None, the estimator's score method is used.
@@ -1341,9 +1285,6 @@ class RandomizedSearch(BaseSearch):
         ``-1`` means using all processors.
         See the scikit-learn glossary entry for ``n_jobs`` for more details.
-        .. versionchanged:: v0.20
-           `n_jobs` default changed from 1 to None
     refit : bool, str, or callable, default=True
         Refit an estimator using the best found parameters on the whole
         dataset.
@@ -1413,62 +1354,20 @@ class RandomizedSearch(BaseSearch):
         expensive and is not strictly required to select the parameters that
         yield the best generalization performance.
-        .. versionadded:: 0.19
-        .. versionchanged:: 0.21
-            Default value was changed from ``True`` to ``False``
     Attributes
     ----------
     cv_results_ : dict of numpy (masked) ndarrays
         A dict with keys as column headers and values as columns, that can be
         imported into a pandas ``DataFrame``.
-        For instance the below given table
-        +--------------+-------------+-------------------+---+---------------+
-        | param_kernel | param_gamma | split0_test_score |...|rank_test_score|
-        +==============+=============+===================+===+===============+
-        |    'rbf'     |     0.1     |       0.80        |...|       1       |
-        +--------------+-------------+-------------------+---+---------------+
-        |    'rbf'     |     0.2     |       0.84        |...|       3       |
-        +--------------+-------------+-------------------+---+---------------+
-        |    'rbf'     |     0.3     |       0.70        |...|       2       |
-        +--------------+-------------+-------------------+---+---------------+
-        will be represented by a ``cv_results_`` dict of::
-            {
-            'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],
-                                          mask = False),
-            'param_gamma'  : masked_array(data = [0.1 0.2 0.3], mask = False),
-            'split0_test_score'  : [0.80, 0.84, 0.70],
-            'split1_test_score'  : [0.82, 0.50, 0.70],
-            'mean_test_score'    : [0.81, 0.67, 0.70],
-            'std_test_score'     : [0.01, 0.24, 0.00],
-            'rank_test_score'    : [1, 3, 2],
-            'split0_train_score' : [0.80, 0.92, 0.70],
-            'split1_train_score' : [0.82, 0.55, 0.70],
-            'mean_train_score'   : [0.81, 0.74, 0.70],
-            'std_train_score'    : [0.01, 0.19, 0.00],
-            'mean_fit_time'      : [0.73, 0.63, 0.43],
-            'std_fit_time'       : [0.01, 0.02, 0.01],
-            'mean_score_time'    : [0.01, 0.06, 0.04],
-            'std_score_time'     : [0.00, 0.00, 0.00],
-            'params'             : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
-            }
         For an example of analysing ``cv_results_``,
-        see :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_stats.py`.
+        see https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_stats.html#sphx-glr-auto-examples-model-selection-plot-grid-search-stats-py.
         NOTE
         The key ``'params'`` is used to store a list of parameter
         settings dicts for all the parameter candidates.
-        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
-        ``std_score_time`` are all in seconds.
         For multi-metric evaluation, the scores for all the scorers are
         available in the ``cv_results_`` dict at the keys ending with that
         scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown
@@ -1524,8 +1423,6 @@ class RandomizedSearch(BaseSearch):
         This is present only if ``refit`` is not False.
-        .. versionadded:: 0.20
     multimetric_ : bool
         Whether or not the scorers compute several metrics.
@@ -1539,16 +1436,12 @@ class RandomizedSearch(BaseSearch):
         parameter for more details) and that `best_estimator_` exposes
         `n_features_in_` when fit.
-        .. versionadded:: 0.24
     feature_names_in_ : ndarray of shape (`n_features_in_`,)
         Names of features seen during :term:`fit`. Only defined if
         `best_estimator_` is defined (see the documentation for the `refit`
         parameter for more details) and that `best_estimator_` exposes
         `feature_names_in_` when fit.
-        .. versionadded:: 1.0
     See Also
     --------
     GridSearch : Does exhaustive search over a grid of parameters.

panelsplit 2.0.4.dev0__tar.gz → 2.0.5__tar.gz

panelsplit 2.0.4.dev0__tar.gz → 2.0.5__tar.gz