autogluon.core 1.2.1b20250113__py3-none-any.whl → 1.2.1b20250114__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,7 +18,7 @@ from autogluon.common.utils.distribute_utils import DistributedContext
18
18
  from autogluon.common.utils.log_utils import DuplicateFilter
19
19
  from autogluon.common.utils.try_import import try_import_ray
20
20
 
21
- from ...constants import MULTICLASS, QUANTILE, REFIT_FULL_SUFFIX, REGRESSION, SOFTCLASS
21
+ from ...constants import BINARY, MULTICLASS, QUANTILE, REFIT_FULL_SUFFIX, REGRESSION, SOFTCLASS
22
22
  from ...hpo.exceptions import EmptySearchSpace
23
23
  from ...pseudolabeling.pseudolabeling import assert_pseudo_column_match
24
24
  from ...utils.exceptions import TimeLimitExceeded
@@ -106,6 +106,9 @@ class BaggedEnsembleModel(AbstractModel):
106
106
  # 'refit_folds': False, # [Advanced, Experimental] Whether to refit bags immediately to a refit_full model in a single .fit call.
107
107
  # 'num_folds' None, # Number of bagged folds per set. If specified, overrides .fit `k_fold` value.
108
108
  # 'max_sets': None, # Maximum bagged repeats to allow, if specified, will set `self.can_fit()` to `self._n_repeats_finished < max_repeats`
109
+ "stratify": "auto",
110
+ "bin": "auto",
111
+ "n_bins": None,
109
112
  }
110
113
  for param, val in default_params.items():
111
114
  self._set_default_param_value(param, val)
@@ -125,11 +128,32 @@ class BaggedEnsembleModel(AbstractModel):
125
128
  def can_infer(self):
126
129
  return self.is_fit() and self.params.get("save_bag_folds", True)
127
130
 
128
- def is_stratified(self):
129
- if self.problem_type in [REGRESSION, QUANTILE, SOFTCLASS]:
130
- return False
131
+ def is_stratified(self) -> bool:
132
+ """
133
+ Returns whether to stratify on the label during KFold splits
134
+ """
135
+ stratify = self.params.get("stratify", "auto")
136
+ if isinstance(stratify, str) and stratify == "auto":
137
+ return self.problem_type in [
138
+ BINARY,
139
+ MULTICLASS,
140
+
141
+ # Commented out due to inconclusive results on whether this is helpful when combined with binning
142
+ # REGRESSION,
143
+ # QUANTILE,
144
+ ]
131
145
  else:
132
- return True
146
+ return stratify
147
+
148
+ def is_binned(self) -> bool:
149
+ """
150
+ Returns whether to bin the label during stratified KFold splits
151
+ """
152
+ bin = self.params.get("bin", "auto")
153
+ if isinstance(bin, str) and bin == "auto":
154
+ return self.problem_type in [REGRESSION, QUANTILE]
155
+ else:
156
+ return bin
133
157
 
134
158
  def is_fit(self) -> bool:
135
159
  return self.n_children != 0
@@ -188,8 +212,16 @@ class BaggedEnsembleModel(AbstractModel):
188
212
  else:
189
213
  return X
190
214
 
191
- def _get_cv_splitter(self, n_splits, n_repeats, groups=None):
192
- return CVSplitter(n_splits=n_splits, n_repeats=n_repeats, groups=groups, stratified=self.is_stratified(), random_state=self._random_state)
215
+ def _get_cv_splitter(self, n_splits: int, n_repeats: int, groups=None) -> CVSplitter:
216
+ return CVSplitter(
217
+ n_splits=n_splits,
218
+ n_repeats=n_repeats,
219
+ groups=groups,
220
+ stratify=self.is_stratified(),
221
+ bin=self.is_binned(),
222
+ n_bins=self.params.get("n_bins", None),
223
+ random_state=self._random_state,
224
+ )
193
225
 
194
226
  def _fit(
195
227
  self,
@@ -13,7 +13,8 @@ import pandas as pd
13
13
  import scipy.stats
14
14
  from numpy.typing import ArrayLike
15
15
  from pandas import DataFrame, Series
16
- from sklearn.model_selection import LeaveOneGroupOut, RepeatedKFold, RepeatedStratifiedKFold, train_test_split
16
+ from sklearn.model_selection import BaseCrossValidator, LeaveOneGroupOut, RepeatedKFold, RepeatedStratifiedKFold, train_test_split
17
+ from sklearn.preprocessing import KBinsDiscretizer
17
18
 
18
19
  from autogluon.common.utils.resource_utils import ResourceManager
19
20
 
@@ -34,12 +35,54 @@ from .miscs import warning_filter
34
35
  logger = logging.getLogger(__name__)
35
36
 
36
37
 
38
+ # TODO: Add binned stratification support for regression in train/val split (non CV)
37
39
  class CVSplitter:
38
- def __init__(self, splitter_cls=None, n_splits=5, n_repeats=1, random_state=0, stratified=False, groups=None):
40
+ def __init__(
41
+ self,
42
+ splitter_cls=None,
43
+ n_splits: int = 5,
44
+ n_repeats: int = 1,
45
+ random_state: int | None = 0,
46
+ stratify: bool = False,
47
+ bin: bool = False,
48
+ n_bins: int | None = None,
49
+ groups: pd.Series = None,
50
+ ):
51
+ """
52
+ Wrapper around splitter objects to perform KFold splits.
53
+ Supports regression stratification via the `bin` and `n_bins` argument.
54
+
55
+ Parameters
56
+ ----------
57
+ splitter_cls, default None
58
+ The class to use for splitting.
59
+ If None, will automatically be determined based off of `stratify`, `groups`, and `n_repeats`.
60
+ n_splits : int, default 5
61
+ The number of splits to perform.
62
+ Ignored if `groups` is specified.
63
+ n_repeats: int, default 1
64
+ The number of repeated splits to perform.
65
+ Ignored if `groups` is specified.
66
+ random_state : int, default 0
67
+ The seed to use when splitting the data.
68
+ stratify : bool, default False
69
+ If True, will stratify the splits on `y`.
70
+ bin : bool, default False
71
+ If True and `stratify` is True, will bin `y` into `n_bins` bins for stratification.
72
+ Should only be used for regression and quantile tasks.
73
+ n_bins : int, default None
74
+ The number of bins to use when `bin` is True.
75
+ If None, defaults to `np.floor(n_samples / n_splits)`.
76
+ groups : pd.Series, default None
77
+ If specified, splitter_cls will default to LeaveOneGroupOut.
78
+
79
+ """
39
80
  self.n_splits = n_splits
40
81
  self.n_repeats = n_repeats
41
82
  self.random_state = random_state
42
- self.stratified = stratified
83
+ self.stratify = stratify
84
+ self.bin = bin
85
+ self.n_bins = n_bins
43
86
  self.groups = groups
44
87
  if splitter_cls is None:
45
88
  splitter_cls = self._get_splitter_cls()
@@ -53,13 +96,13 @@ class CVSplitter:
53
96
  self.n_splits = num_groups
54
97
  splitter_cls = LeaveOneGroupOut
55
98
  # pass
56
- elif self.stratified:
99
+ elif self.stratify:
57
100
  splitter_cls = RepeatedStratifiedKFold
58
101
  else:
59
102
  splitter_cls = RepeatedKFold
60
103
  return splitter_cls
61
104
 
62
- def _get_splitter(self, splitter_cls):
105
+ def _get_splitter(self, splitter_cls) -> BaseCrossValidator:
63
106
  if splitter_cls == LeaveOneGroupOut:
64
107
  return splitter_cls()
65
108
  elif splitter_cls in [RepeatedKFold, RepeatedStratifiedKFold]:
@@ -67,19 +110,38 @@ class CVSplitter:
67
110
  else:
68
111
  raise AssertionError(f"{splitter_cls} is not supported as a valid `splitter_cls` input to CVSplitter.")
69
112
 
70
- def split(self, X, y):
71
- if isinstance(self._splitter, RepeatedStratifiedKFold):
113
+ def split(self, X: pd.DataFrame, y: pd.Series) -> list[tuple[np.ndarray, np.ndarray]]:
114
+ splitter = self._splitter
115
+ if isinstance(splitter, RepeatedStratifiedKFold):
116
+ if self.bin:
117
+ if self.n_bins is None:
118
+ n_splits = splitter.get_n_splits()
119
+ n_samples = len(y)
120
+
121
+ # ensure at least n_splits samples per bin
122
+ n_bins = int(np.floor(n_samples / n_splits))
123
+ else:
124
+ n_bins = self.n_bins
125
+
126
+ if n_bins > 1:
127
+ k_bins_discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', random_state=self.random_state)
128
+ y_bin = k_bins_discretizer.fit_transform(y.to_frame())[:, 0]
129
+ y = pd.Series(data=y_bin, index=y.index, name=y.name)
130
+ else:
131
+ # Don't stratify, can't bin!
132
+ splitter = self._get_splitter(splitter_cls=RepeatedKFold)
133
+
72
134
  # FIXME: There is a bug in sklearn that causes an incorrect ValueError if performing stratification and all classes have fewer than n_splits samples.
73
135
  # This is hacked by adding a dummy class with n_splits samples, performing the kfold split, then removing the dummy samples from all resulting indices.
74
136
  # This is very inefficient and complicated and ideally should be fixed in sklearn.
75
137
  with warning_filter():
76
138
  try:
77
- out = [[train_index, test_index] for train_index, test_index in self._splitter.split(X, y)]
139
+ out = [[train_index, test_index] for train_index, test_index in splitter.split(X, y)]
78
140
  except:
79
141
  y_dummy = pd.concat([y, pd.Series([-1] * self.n_splits)], ignore_index=True)
80
142
  X_dummy = pd.concat([X, X.head(self.n_splits)], ignore_index=True)
81
143
  invalid_index = set(list(y_dummy.tail(self.n_splits).index))
82
- out = [[train_index, test_index] for train_index, test_index in self._splitter.split(X_dummy, y_dummy)]
144
+ out = [[train_index, test_index] for train_index, test_index in splitter.split(X_dummy, y_dummy)]
83
145
  len_out = len(out)
84
146
  for i in range(len_out):
85
147
  train_index, test_index = out[i]
@@ -87,7 +149,7 @@ class CVSplitter:
87
149
  out[i][1] = [index for index in test_index if index not in invalid_index]
88
150
  return out
89
151
  else:
90
- return [[train_index, test_index] for train_index, test_index in self._splitter.split(X, y, groups=self.groups)]
152
+ return [[train_index, test_index] for train_index, test_index in splitter.split(X, y, groups=self.groups)]
91
153
 
92
154
 
93
155
  def setup_compute(nthreads_per_trial, ngpus_per_trial):
autogluon/core/version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """This is the autogluon version file."""
2
- __version__ = '1.2.1b20250113'
2
+ __version__ = '1.2.1b20250114'
3
3
  __lite__ = False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: autogluon.core
3
- Version: 1.2.1b20250113
3
+ Version: 1.2.1b20250114
4
4
  Summary: Fast and Accurate ML in 3 Lines of Code
5
5
  Home-page: https://github.com/autogluon/autogluon
6
6
  Author: AutoGluon Community
@@ -43,12 +43,12 @@ Requires-Dist: tqdm<5,>=4.38
43
43
  Requires-Dist: requests
44
44
  Requires-Dist: matplotlib<3.11,>=3.7.0
45
45
  Requires-Dist: boto3<2,>=1.10
46
- Requires-Dist: autogluon.common==1.2.1b20250113
46
+ Requires-Dist: autogluon.common==1.2.1b20250114
47
47
  Provides-Extra: all
48
48
  Requires-Dist: ray[default,tune]<2.41,>=2.10.0; extra == "all"
49
- Requires-Dist: hyperopt<0.2.8,>=0.2.7; extra == "all"
50
- Requires-Dist: pyarrow>=15.0.0; extra == "all"
51
49
  Requires-Dist: ray[default]<2.41,>=2.10.0; extra == "all"
50
+ Requires-Dist: pyarrow>=15.0.0; extra == "all"
51
+ Requires-Dist: hyperopt<0.2.8,>=0.2.7; extra == "all"
52
52
  Provides-Extra: ray
53
53
  Requires-Dist: ray[default]<2.41,>=2.10.0; extra == "ray"
54
54
  Provides-Extra: raytune
@@ -56,11 +56,11 @@ Requires-Dist: pyarrow>=15.0.0; extra == "raytune"
56
56
  Requires-Dist: ray[default,tune]<2.41,>=2.10.0; extra == "raytune"
57
57
  Requires-Dist: hyperopt<0.2.8,>=0.2.7; extra == "raytune"
58
58
  Provides-Extra: tests
59
- Requires-Dist: types-requests; extra == "tests"
60
- Requires-Dist: pytest-mypy; extra == "tests"
59
+ Requires-Dist: flake8; extra == "tests"
61
60
  Requires-Dist: pytest; extra == "tests"
61
+ Requires-Dist: pytest-mypy; extra == "tests"
62
+ Requires-Dist: types-requests; extra == "tests"
62
63
  Requires-Dist: types-setuptools; extra == "tests"
63
- Requires-Dist: flake8; extra == "tests"
64
64
 
65
65
 
66
66
 
@@ -1,9 +1,9 @@
1
- autogluon.core-1.2.1b20250113-py3.8-nspkg.pth,sha256=cQGwpuGPqg1GXscIwt-7PmME1OnSpD-7ixkikJ31WAY,554
1
+ autogluon.core-1.2.1b20250114-py3.8-nspkg.pth,sha256=cQGwpuGPqg1GXscIwt-7PmME1OnSpD-7ixkikJ31WAY,554
2
2
  autogluon/core/__init__.py,sha256=8KfvvHzXX3a4q6z43Dw1yE7VtbAoiSMaglVpKDy6Xeg,245
3
3
  autogluon/core/_setup_utils.py,sha256=NqlGK6So0KG5M0LbBJNT1TI3iAmG93kd_6Brih6y2gQ,6935
4
4
  autogluon/core/constants.py,sha256=nEVLdSFJ-5O-tz3jUD3qPX65RMp7g8qOR38XlurbP4Y,3403
5
5
  autogluon/core/problem_type.py,sha256=XJmMgeNBgS7u43pDK-spTivatPyh_INOXveEXwQt-Rw,2993
6
- autogluon/core/version.py,sha256=xp_3RhUZoc8du55mJV_5b3-ffuLY-6zGvseBIwCs1Z0,90
6
+ autogluon/core/version.py,sha256=Um-j8InMDDRIJ5WoL607BMYYvkzsVWU8cxXJ2aXfjbQ,90
7
7
  autogluon/core/augmentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  autogluon/core/augmentation/distill_utils.py,sha256=JBlp2WOMNKoJv8aKVwJVRQSalSk8jx36HM7-k_VvkhY,9404
9
9
  autogluon/core/calibrate/__init__.py,sha256=eU6qLj7DKUhaz2HHNHDrfroRaLM-mhuSncK_v1UP4F8,62
@@ -47,7 +47,7 @@ autogluon/core/models/dummy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
47
47
  autogluon/core/models/dummy/_dummy_quantile_regressor.py,sha256=i-ZW2flJ60jsMfMK24IP39Xwc55-UlBDvHmqanIf29Q,664
48
48
  autogluon/core/models/dummy/dummy_model.py,sha256=at2FZSM2_LuAQ78E2YrRCRt3UaKMyyOnc6p2rtZgA2w,1414
49
49
  autogluon/core/models/ensemble/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
- autogluon/core/models/ensemble/bagged_ensemble_model.py,sha256=HuyRqdtsdN2z_t9Fa9qWN3U5dz3O7MGYn2qe4BKU9Go,71600
50
+ autogluon/core/models/ensemble/bagged_ensemble_model.py,sha256=p3lkm0Cweyu_maOeboirGmQIjIaaiDHXS-XHDc-wYYw,72610
51
51
  autogluon/core/models/ensemble/fold_fitting_strategy.py,sha256=01vzNVvE4FIFgD6YqbhK63XoUlSztnVFsrDdsoqm75U,47021
52
52
  autogluon/core/models/ensemble/ray_parallel_fold_fitting_strategy.py,sha256=8RASa-eV6n9kUgbqQHNt7k4IrvuB9NdrunIMLYOLwgA,2068
53
53
  autogluon/core/models/ensemble/stacker_ensemble_model.py,sha256=DuDXgozvG9JYYkRvGACA7EXDAtj3Tz_uAjXTfxu5tFg,18041
@@ -85,15 +85,15 @@ autogluon/core/utils/infer_utils.py,sha256=In-u47ELeMLol-O02-bbo79Ak2TYo6iuQ7SJk
85
85
  autogluon/core/utils/miscs.py,sha256=Kh5tXxyGHWfEEb36t3iBpdhJI7V4Pgih6xN_bpiQwFg,369
86
86
  autogluon/core/utils/plots.py,sha256=ambgOSY7Hp--CDU6raghZx6O8m4a49spdHRG3zEMypk,12238
87
87
  autogluon/core/utils/time.py,sha256=WaTonKWCSO5BoSIPo_hdgreGiQxoYE2dYWKO3aPFHj4,3841
88
- autogluon/core/utils/utils.py,sha256=K05ewQuGauLnVaYwccNDk1moUDg2EEzdSlq8gsw6JVM,55071
88
+ autogluon/core/utils/utils.py,sha256=FMa9kIUAxA3IIBbATmBnNEVObSAivehZ2_zCy3PRR-c,57660
89
89
  autogluon/core/utils/version_utils.py,sha256=5-r8hLRKTaZbj5qo2uzE_2E4casH49Ye3WyeHlgHuz4,3252
90
90
  autogluon/core/utils/loaders/__init__.py,sha256=W5FAdQvpDcn_uisqJrlSAObWVta-YjJLKGN3NCbEgIo,109
91
91
  autogluon/core/utils/savers/__init__.py,sha256=bGWciSxAkj6u06vOC4pTvr22f_1ey0glgvmjCMEOm78,89
92
- autogluon.core-1.2.1b20250113.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
93
- autogluon.core-1.2.1b20250113.dist-info/METADATA,sha256=j-_5gd4PPTvZsdGC2daWT2OV-6A0Pdf4C20Qq-8_oC4,12328
94
- autogluon.core-1.2.1b20250113.dist-info/NOTICE,sha256=7nPQuj8Kp-uXsU0S5so3-2dNU5EctS5hDXvvzzehd7E,114
95
- autogluon.core-1.2.1b20250113.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
96
- autogluon.core-1.2.1b20250113.dist-info/namespace_packages.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
97
- autogluon.core-1.2.1b20250113.dist-info/top_level.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
98
- autogluon.core-1.2.1b20250113.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
99
- autogluon.core-1.2.1b20250113.dist-info/RECORD,,
92
+ autogluon.core-1.2.1b20250114.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
93
+ autogluon.core-1.2.1b20250114.dist-info/METADATA,sha256=qZ8UNbAq_mHz_lqcg-_xSY0z9dXa8fNDuHOncGl7ICg,12328
94
+ autogluon.core-1.2.1b20250114.dist-info/NOTICE,sha256=7nPQuj8Kp-uXsU0S5so3-2dNU5EctS5hDXvvzzehd7E,114
95
+ autogluon.core-1.2.1b20250114.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
96
+ autogluon.core-1.2.1b20250114.dist-info/namespace_packages.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
97
+ autogluon.core-1.2.1b20250114.dist-info/top_level.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
98
+ autogluon.core-1.2.1b20250114.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
99
+ autogluon.core-1.2.1b20250114.dist-info/RECORD,,