PyPI - autogluon.core - Versions diffs - 1.2.1b20250112__py3-none-any.whl → 1.2.1b20250114__py3-none-any.whl - Mend

autogluon.core 1.2.1b20250112__py3-none-any.whl → 1.2.1b20250114__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

autogluon/core/models/ensemble/bagged_ensemble_model.py CHANGED Viewed

@@ -18,7 +18,7 @@ from autogluon.common.utils.distribute_utils import DistributedContext
 from autogluon.common.utils.log_utils import DuplicateFilter
 from autogluon.common.utils.try_import import try_import_ray
-from ...constants import MULTICLASS, QUANTILE, REFIT_FULL_SUFFIX, REGRESSION, SOFTCLASS
+from ...constants import BINARY, MULTICLASS, QUANTILE, REFIT_FULL_SUFFIX, REGRESSION, SOFTCLASS
 from ...hpo.exceptions import EmptySearchSpace
 from ...pseudolabeling.pseudolabeling import assert_pseudo_column_match
 from ...utils.exceptions import TimeLimitExceeded
@@ -106,6 +106,9 @@ class BaggedEnsembleModel(AbstractModel):
             # 'refit_folds': False,  # [Advanced, Experimental] Whether to refit bags immediately to a refit_full model in a single .fit call.
             # 'num_folds' None,  # Number of bagged folds per set. If specified, overrides .fit `k_fold` value.
             # 'max_sets': None,  # Maximum bagged repeats to allow, if specified, will set `self.can_fit()` to `self._n_repeats_finished < max_repeats`
+            "stratify": "auto",
+            "bin": "auto",
+            "n_bins": None,
         }
         for param, val in default_params.items():
             self._set_default_param_value(param, val)
@@ -125,11 +128,32 @@ class BaggedEnsembleModel(AbstractModel):
     def can_infer(self):
         return self.is_fit() and self.params.get("save_bag_folds", True)
-    def is_stratified(self):
-        if self.problem_type in [REGRESSION, QUANTILE, SOFTCLASS]:
-            return False
+    def is_stratified(self) -> bool:
+        """
+        Returns whether to stratify on the label during KFold splits
+        """
+        stratify = self.params.get("stratify", "auto")
+        if isinstance(stratify, str) and stratify == "auto":
+            return self.problem_type in [
+                BINARY,
+                MULTICLASS,
+                # Commented out due to inconclusive results on whether this is helpful when combined with binning
+                # REGRESSION,
+                # QUANTILE,
+            ]
         else:
-            return True
+            return stratify
+    def is_binned(self) -> bool:
+        """
+        Returns whether to bin the label during stratified KFold splits
+        """
+        bin = self.params.get("bin", "auto")
+        if isinstance(bin, str) and bin == "auto":
+            return self.problem_type in [REGRESSION, QUANTILE]
+        else:
+            return bin
     def is_fit(self) -> bool:
         return self.n_children != 0
@@ -188,8 +212,16 @@ class BaggedEnsembleModel(AbstractModel):
         else:
             return X
-    def _get_cv_splitter(self, n_splits, n_repeats, groups=None):
-        return CVSplitter(n_splits=n_splits, n_repeats=n_repeats, groups=groups, stratified=self.is_stratified(), random_state=self._random_state)
+    def _get_cv_splitter(self, n_splits: int, n_repeats: int, groups=None) -> CVSplitter:
+        return CVSplitter(
+            n_splits=n_splits,
+            n_repeats=n_repeats,
+            groups=groups,
+            stratify=self.is_stratified(),
+            bin=self.is_binned(),
+            n_bins=self.params.get("n_bins", None),
+            random_state=self._random_state,
+        )
     def _fit(
         self,

autogluon/core/utils/utils.py CHANGED Viewed

@@ -13,7 +13,8 @@ import pandas as pd
 import scipy.stats
 from numpy.typing import ArrayLike
 from pandas import DataFrame, Series
-from sklearn.model_selection import LeaveOneGroupOut, RepeatedKFold, RepeatedStratifiedKFold, train_test_split
+from sklearn.model_selection import BaseCrossValidator, LeaveOneGroupOut, RepeatedKFold, RepeatedStratifiedKFold, train_test_split
+from sklearn.preprocessing import KBinsDiscretizer
 from autogluon.common.utils.resource_utils import ResourceManager
@@ -34,12 +35,54 @@ from .miscs import warning_filter
 logger = logging.getLogger(__name__)
+# TODO: Add binned stratification support for regression in train/val split (non CV)
 class CVSplitter:
-    def __init__(self, splitter_cls=None, n_splits=5, n_repeats=1, random_state=0, stratified=False, groups=None):
+    def __init__(
+        self,
+        splitter_cls=None,
+        n_splits: int = 5,
+        n_repeats: int = 1,
+        random_state: int | None = 0,
+        stratify: bool = False,
+        bin: bool = False,
+        n_bins: int | None = None,
+        groups: pd.Series = None,
+    ):
+        """
+        Wrapper around splitter objects to perform KFold splits.
+        Supports regression stratification via the `bin` and `n_bins` argument.
+        Parameters
+        ----------
+        splitter_cls, default None
+            The class to use for splitting.
+            If None, will automatically be determined based off of `stratify`, `groups`, and `n_repeats`.
+        n_splits : int, default 5
+            The number of splits to perform.
+            Ignored if `groups` is specified.
+        n_repeats: int, default 1
+            The number of repeated splits to perform.
+            Ignored if `groups` is specified.
+        random_state : int, default 0
+            The seed to use when splitting the data.
+        stratify : bool, default False
+            If True, will stratify the splits on `y`.
+        bin : bool, default False
+            If True and `stratify` is True, will bin `y` into `n_bins` bins for stratification.
+            Should only be used for regression and quantile tasks.
+        n_bins : int, default None
+            The number of bins to use when `bin` is True.
+            If None, defaults to `np.floor(n_samples / n_splits)`.
+        groups : pd.Series, default None
+            If specified, splitter_cls will default to LeaveOneGroupOut.
+        """
         self.n_splits = n_splits
         self.n_repeats = n_repeats
         self.random_state = random_state
-        self.stratified = stratified
+        self.stratify = stratify
+        self.bin = bin
+        self.n_bins = n_bins
         self.groups = groups
         if splitter_cls is None:
             splitter_cls = self._get_splitter_cls()
@@ -53,13 +96,13 @@ class CVSplitter:
             self.n_splits = num_groups
             splitter_cls = LeaveOneGroupOut
             # pass
-        elif self.stratified:
+        elif self.stratify:
             splitter_cls = RepeatedStratifiedKFold
         else:
             splitter_cls = RepeatedKFold
         return splitter_cls
-    def _get_splitter(self, splitter_cls):
+    def _get_splitter(self, splitter_cls) -> BaseCrossValidator:
         if splitter_cls == LeaveOneGroupOut:
             return splitter_cls()
         elif splitter_cls in [RepeatedKFold, RepeatedStratifiedKFold]:
@@ -67,19 +110,38 @@ class CVSplitter:
         else:
             raise AssertionError(f"{splitter_cls} is not supported as a valid `splitter_cls` input to CVSplitter.")
-    def split(self, X, y):
-        if isinstance(self._splitter, RepeatedStratifiedKFold):
+    def split(self, X: pd.DataFrame, y: pd.Series) -> list[tuple[np.ndarray, np.ndarray]]:
+        splitter = self._splitter
+        if isinstance(splitter, RepeatedStratifiedKFold):
+            if self.bin:
+                if self.n_bins is None:
+                    n_splits = splitter.get_n_splits()
+                    n_samples = len(y)
+                    # ensure at least n_splits samples per bin
+                    n_bins = int(np.floor(n_samples / n_splits))
+                else:
+                    n_bins = self.n_bins
+                if n_bins > 1:
+                    k_bins_discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', random_state=self.random_state)
+                    y_bin = k_bins_discretizer.fit_transform(y.to_frame())[:, 0]
+                    y = pd.Series(data=y_bin, index=y.index, name=y.name)
+                else:
+                    # Don't stratify, can't bin!
+                    splitter = self._get_splitter(splitter_cls=RepeatedKFold)
             # FIXME: There is a bug in sklearn that causes an incorrect ValueError if performing stratification and all classes have fewer than n_splits samples.
             #  This is hacked by adding a dummy class with n_splits samples, performing the kfold split, then removing the dummy samples from all resulting indices.
             #  This is very inefficient and complicated and ideally should be fixed in sklearn.
             with warning_filter():
                 try:
-                    out = [[train_index, test_index] for train_index, test_index in self._splitter.split(X, y)]
+                    out = [[train_index, test_index] for train_index, test_index in splitter.split(X, y)]
                 except:
                     y_dummy = pd.concat([y, pd.Series([-1] * self.n_splits)], ignore_index=True)
                     X_dummy = pd.concat([X, X.head(self.n_splits)], ignore_index=True)
                     invalid_index = set(list(y_dummy.tail(self.n_splits).index))
-                    out = [[train_index, test_index] for train_index, test_index in self._splitter.split(X_dummy, y_dummy)]
+                    out = [[train_index, test_index] for train_index, test_index in splitter.split(X_dummy, y_dummy)]
                     len_out = len(out)
                     for i in range(len_out):
                         train_index, test_index = out[i]
@@ -87,7 +149,7 @@ class CVSplitter:
                         out[i][1] = [index for index in test_index if index not in invalid_index]
             return out
         else:
-            return [[train_index, test_index] for train_index, test_index in self._splitter.split(X, y, groups=self.groups)]
+            return [[train_index, test_index] for train_index, test_index in splitter.split(X, y, groups=self.groups)]
 def setup_compute(nthreads_per_trial, ngpus_per_trial):

autogluon/core/version.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """This is the autogluon version file."""
-__version__ = '1.2.1b20250112'
+__version__ = '1.2.1b20250114'
 __lite__ = False

{autogluon.core-1.2.1b20250112.dist-info → autogluon.core-1.2.1b20250114.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: autogluon.core
-Version: 1.2.1b20250112
+Version: 1.2.1b20250114
 Summary: Fast and Accurate ML in 3 Lines of Code
 Home-page: https://github.com/autogluon/autogluon
 Author: AutoGluon Community
@@ -43,12 +43,12 @@ Requires-Dist: tqdm<5,>=4.38
 Requires-Dist: requests
 Requires-Dist: matplotlib<3.11,>=3.7.0
 Requires-Dist: boto3<2,>=1.10
-Requires-Dist: autogluon.common==1.2.1b20250112
+Requires-Dist: autogluon.common==1.2.1b20250114
 Provides-Extra: all
-Requires-Dist: pyarrow>=15.0.0; extra == "all"
 Requires-Dist: ray[default,tune]<2.41,>=2.10.0; extra == "all"
-Requires-Dist: hyperopt<0.2.8,>=0.2.7; extra == "all"
 Requires-Dist: ray[default]<2.41,>=2.10.0; extra == "all"
+Requires-Dist: pyarrow>=15.0.0; extra == "all"
+Requires-Dist: hyperopt<0.2.8,>=0.2.7; extra == "all"
 Provides-Extra: ray
 Requires-Dist: ray[default]<2.41,>=2.10.0; extra == "ray"
 Provides-Extra: raytune
@@ -56,10 +56,10 @@ Requires-Dist: pyarrow>=15.0.0; extra == "raytune"
 Requires-Dist: ray[default,tune]<2.41,>=2.10.0; extra == "raytune"
 Requires-Dist: hyperopt<0.2.8,>=0.2.7; extra == "raytune"
 Provides-Extra: tests
-Requires-Dist: pytest-mypy; extra == "tests"
+Requires-Dist: flake8; extra == "tests"
 Requires-Dist: pytest; extra == "tests"
+Requires-Dist: pytest-mypy; extra == "tests"
 Requires-Dist: types-requests; extra == "tests"
-Requires-Dist: flake8; extra == "tests"
 Requires-Dist: types-setuptools; extra == "tests"

{autogluon.core-1.2.1b20250112.dist-info → autogluon.core-1.2.1b20250114.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-autogluon.core-1.2.1b20250112-py3.8-nspkg.pth,sha256=cQGwpuGPqg1GXscIwt-7PmME1OnSpD-7ixkikJ31WAY,554
+autogluon.core-1.2.1b20250114-py3.8-nspkg.pth,sha256=cQGwpuGPqg1GXscIwt-7PmME1OnSpD-7ixkikJ31WAY,554
 autogluon/core/__init__.py,sha256=8KfvvHzXX3a4q6z43Dw1yE7VtbAoiSMaglVpKDy6Xeg,245
 autogluon/core/_setup_utils.py,sha256=NqlGK6So0KG5M0LbBJNT1TI3iAmG93kd_6Brih6y2gQ,6935
 autogluon/core/constants.py,sha256=nEVLdSFJ-5O-tz3jUD3qPX65RMp7g8qOR38XlurbP4Y,3403
 autogluon/core/problem_type.py,sha256=XJmMgeNBgS7u43pDK-spTivatPyh_INOXveEXwQt-Rw,2993
-autogluon/core/version.py,sha256=vEGOfVve9K4_b9lFtye3NgK1GlomSv-mpvLTMz6zNKQ,90
+autogluon/core/version.py,sha256=Um-j8InMDDRIJ5WoL607BMYYvkzsVWU8cxXJ2aXfjbQ,90
 autogluon/core/augmentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autogluon/core/augmentation/distill_utils.py,sha256=JBlp2WOMNKoJv8aKVwJVRQSalSk8jx36HM7-k_VvkhY,9404
 autogluon/core/calibrate/__init__.py,sha256=eU6qLj7DKUhaz2HHNHDrfroRaLM-mhuSncK_v1UP4F8,62
@@ -47,7 +47,7 @@ autogluon/core/models/dummy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
 autogluon/core/models/dummy/_dummy_quantile_regressor.py,sha256=i-ZW2flJ60jsMfMK24IP39Xwc55-UlBDvHmqanIf29Q,664
 autogluon/core/models/dummy/dummy_model.py,sha256=at2FZSM2_LuAQ78E2YrRCRt3UaKMyyOnc6p2rtZgA2w,1414
 autogluon/core/models/ensemble/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-autogluon/core/models/ensemble/bagged_ensemble_model.py,sha256=HuyRqdtsdN2z_t9Fa9qWN3U5dz3O7MGYn2qe4BKU9Go,71600
+autogluon/core/models/ensemble/bagged_ensemble_model.py,sha256=p3lkm0Cweyu_maOeboirGmQIjIaaiDHXS-XHDc-wYYw,72610
 autogluon/core/models/ensemble/fold_fitting_strategy.py,sha256=01vzNVvE4FIFgD6YqbhK63XoUlSztnVFsrDdsoqm75U,47021
 autogluon/core/models/ensemble/ray_parallel_fold_fitting_strategy.py,sha256=8RASa-eV6n9kUgbqQHNt7k4IrvuB9NdrunIMLYOLwgA,2068
 autogluon/core/models/ensemble/stacker_ensemble_model.py,sha256=DuDXgozvG9JYYkRvGACA7EXDAtj3Tz_uAjXTfxu5tFg,18041
@@ -85,15 +85,15 @@ autogluon/core/utils/infer_utils.py,sha256=In-u47ELeMLol-O02-bbo79Ak2TYo6iuQ7SJk
 autogluon/core/utils/miscs.py,sha256=Kh5tXxyGHWfEEb36t3iBpdhJI7V4Pgih6xN_bpiQwFg,369
 autogluon/core/utils/plots.py,sha256=ambgOSY7Hp--CDU6raghZx6O8m4a49spdHRG3zEMypk,12238
 autogluon/core/utils/time.py,sha256=WaTonKWCSO5BoSIPo_hdgreGiQxoYE2dYWKO3aPFHj4,3841
-autogluon/core/utils/utils.py,sha256=K05ewQuGauLnVaYwccNDk1moUDg2EEzdSlq8gsw6JVM,55071
+autogluon/core/utils/utils.py,sha256=FMa9kIUAxA3IIBbATmBnNEVObSAivehZ2_zCy3PRR-c,57660
 autogluon/core/utils/version_utils.py,sha256=5-r8hLRKTaZbj5qo2uzE_2E4casH49Ye3WyeHlgHuz4,3252
 autogluon/core/utils/loaders/__init__.py,sha256=W5FAdQvpDcn_uisqJrlSAObWVta-YjJLKGN3NCbEgIo,109
 autogluon/core/utils/savers/__init__.py,sha256=bGWciSxAkj6u06vOC4pTvr22f_1ey0glgvmjCMEOm78,89
-autogluon.core-1.2.1b20250112.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
-autogluon.core-1.2.1b20250112.dist-info/METADATA,sha256=T93g2PtedGQyrDPcRKEXMTOtAZNOnmX56H6lWeyUB0Q,12328
-autogluon.core-1.2.1b20250112.dist-info/NOTICE,sha256=7nPQuj8Kp-uXsU0S5so3-2dNU5EctS5hDXvvzzehd7E,114
-autogluon.core-1.2.1b20250112.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-autogluon.core-1.2.1b20250112.dist-info/namespace_packages.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
-autogluon.core-1.2.1b20250112.dist-info/top_level.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
-autogluon.core-1.2.1b20250112.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-autogluon.core-1.2.1b20250112.dist-info/RECORD,,
+autogluon.core-1.2.1b20250114.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
+autogluon.core-1.2.1b20250114.dist-info/METADATA,sha256=qZ8UNbAq_mHz_lqcg-_xSY0z9dXa8fNDuHOncGl7ICg,12328
+autogluon.core-1.2.1b20250114.dist-info/NOTICE,sha256=7nPQuj8Kp-uXsU0S5so3-2dNU5EctS5hDXvvzzehd7E,114
+autogluon.core-1.2.1b20250114.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+autogluon.core-1.2.1b20250114.dist-info/namespace_packages.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
+autogluon.core-1.2.1b20250114.dist-info/top_level.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
+autogluon.core-1.2.1b20250114.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+autogluon.core-1.2.1b20250114.dist-info/RECORD,,

/autogluon.core-1.2.1b20250112-py3.8-nspkg.pth → /autogluon.core-1.2.1b20250114-py3.8-nspkg.pth RENAMED Viewed

File without changes

{autogluon.core-1.2.1b20250112.dist-info → autogluon.core-1.2.1b20250114.dist-info}/LICENSE RENAMED Viewed

File without changes

{autogluon.core-1.2.1b20250112.dist-info → autogluon.core-1.2.1b20250114.dist-info}/NOTICE RENAMED Viewed

File without changes

{autogluon.core-1.2.1b20250112.dist-info → autogluon.core-1.2.1b20250114.dist-info}/WHEEL RENAMED Viewed

File without changes

{autogluon.core-1.2.1b20250112.dist-info → autogluon.core-1.2.1b20250114.dist-info}/namespace_packages.txt RENAMED Viewed

File without changes

{autogluon.core-1.2.1b20250112.dist-info → autogluon.core-1.2.1b20250114.dist-info}/top_level.txt RENAMED Viewed

File without changes

{autogluon.core-1.2.1b20250112.dist-info → autogluon.core-1.2.1b20250114.dist-info}/zip-safe RENAMED Viewed

File without changes

autogluon.core 1.2.1b20250112__py3-none-any.whl → 1.2.1b20250114__py3-none-any.whl

autogluon.core 1.2.1b20250112__py3-none-any.whl → 1.2.1b20250114__py3-none-any.whl