autogluon.tabular 1.4.0__py3-none-any.whl → 1.4.1b20251128__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Files changed (40)
  1. autogluon/tabular/configs/pipeline_presets.py +130 -0
  2. autogluon/tabular/configs/presets_configs.py +0 -3
  3. autogluon/tabular/models/__init__.py +1 -0
  4. autogluon/tabular/models/catboost/catboost_model.py +4 -1
  5. autogluon/tabular/models/ebm/__init__.py +0 -0
  6. autogluon/tabular/models/ebm/ebm_model.py +259 -0
  7. autogluon/tabular/models/ebm/hyperparameters/__init__.py +0 -0
  8. autogluon/tabular/models/ebm/hyperparameters/parameters.py +39 -0
  9. autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +72 -0
  10. autogluon/tabular/models/fastainn/tabular_nn_fastai.py +4 -2
  11. autogluon/tabular/models/knn/knn_model.py +7 -3
  12. autogluon/tabular/models/lgb/lgb_model.py +56 -18
  13. autogluon/tabular/models/lr/lr_model.py +6 -1
  14. autogluon/tabular/models/lr/lr_preprocessing_utils.py +6 -7
  15. autogluon/tabular/models/mitra/_internal/models/tab2d.py +10 -10
  16. autogluon/tabular/models/mitra/mitra_model.py +43 -3
  17. autogluon/tabular/models/mitra/sklearn_interface.py +8 -21
  18. autogluon/tabular/models/realmlp/realmlp_model.py +1 -3
  19. autogluon/tabular/models/rf/rf_model.py +5 -1
  20. autogluon/tabular/models/tabicl/tabicl_model.py +1 -7
  21. autogluon/tabular/models/tabm/tabm_model.py +76 -6
  22. autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +6 -4
  23. autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +1 -7
  24. autogluon/tabular/models/tabular_nn/hyperparameters/parameters.py +1 -3
  25. autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +2 -1
  26. autogluon/tabular/models/xgboost/xgboost_model.py +8 -1
  27. autogluon/tabular/predictor/predictor.py +63 -55
  28. autogluon/tabular/registry/_ag_model_registry.py +2 -0
  29. autogluon/tabular/testing/fit_helper.py +28 -0
  30. autogluon/tabular/version.py +1 -1
  31. autogluon.tabular-1.4.1b20251128-py3.11-nspkg.pth +1 -0
  32. {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/METADATA +87 -71
  33. {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/RECORD +39 -33
  34. {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/WHEEL +1 -1
  35. autogluon.tabular-1.4.0-py3.9-nspkg.pth +0 -1
  36. {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info/licenses}/LICENSE +0 -0
  37. {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info/licenses}/NOTICE +0 -0
  38. {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/namespace_packages.txt +0 -0
  39. {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/top_level.txt +0 -0
  40. {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/zip-safe +0 -0
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+import math
+
+from autogluon.core.constants import BINARY, PROBLEM_TYPES
+from autogluon.core.utils.utils import default_holdout_frac
+
+USE_BAG_HOLDOUT_AUTO_THRESHOLD = 1_000_000
+
+
+def _get_validation_preset(num_train_rows: int, hpo_enabled: bool) -> dict[str, int | float]:
+    """Recommended validation preset manually defined by the AutoGluon developers."""
+
+    # -- Default recommendation
+    # max 8 due to 8 cores per CPU being very common.
+    # down to 5 folds for small datasets to have enough samples for a representative validation set.
+    num_bag_folds = min(8, max(5, math.floor(num_train_rows / 10)))
+
+    num_bag_sets = 1  # More repeats do not seem to help due to overfitting on val data.
+    use_bag_holdout = num_train_rows >= USE_BAG_HOLDOUT_AUTO_THRESHOLD
+    holdout_frac = round(default_holdout_frac(num_train_rows=num_train_rows, hyperparameter_tune=hpo_enabled), 4)
+
+    return dict(
+        num_bag_sets=num_bag_sets,
+        num_bag_folds=num_bag_folds,
+        use_bag_holdout=use_bag_holdout,
+        holdout_frac=holdout_frac,
+    )
+
+
+# TODO(refactor): use a data class for the config of the validation method.
+# TODO(improvement): Implement a more sophisticated solution.
+#   Could also use more metadata such as num_features, num_models,
+#   or time_limit for a heuristic.
+#       num_features: The number of features in the dataset.
+#       num_models: The number of models in the portfolio to fit.
+#       time_limit: The time limit for fitting models.
+#   Pointer for non-heuristic approach:
+#       -> meta-learning like Auto-Sklearn 2.0, needs a lot of metadata
+def get_validation_and_stacking_method(
+    # Validation parameters
+    num_bag_folds: int | None,
+    num_bag_sets: int | None,
+    use_bag_holdout: bool | None,
+    holdout_frac: float | None,
+    # Stacking/Pipeline parameters
+    auto_stack: bool,
+    num_stack_levels: int | None,
+    dynamic_stacking: bool | None,
+    refit_full: bool | None,
+    # Metadata
+    num_train_rows: int,
+    problem_type: PROBLEM_TYPES,
+    hpo_enabled: bool,
+) -> tuple[int, int, int, bool, bool, float, bool]:
+    """Get the validation method for AutoGluon via a heuristic.
+
+    Input variables are `None` if they were not specified by the user or have an explicit default.
+
+    Parameters
+    ----------
+    num_bag_folds: int | None
+        The number of folds for cross-validation.
+    num_bag_sets: int | None
+        The number of repeats for cross-validation.
+    use_bag_holdout: bool | None
+        Whether to use (additional) holdout validation.
+    holdout_frac: float | None
+        The fraction of data to holdout for validation.
+    auto_stack: bool
+        Whether to automatically determine the stacking method.
+    num_stack_levels: int | None
+        The number of stacking levels.
+    dynamic_stacking: bool | None
+        Whether to use dynamic stacking.
+    refit_full: bool
+        Whether to refit the full training dataset.
+    num_train_rows: int
+        The number of rows in the training dataset.
+    problem_type: PROBLEM_TYPES
+        The type of problem to solve.
+    hpo_enabled: bool
+        If True, HPO is enabled during the run of AutoGluon.
+
+    Returns:
+    --------
+    Returns all variables needed to define the validation method.
+    """
+
+    cv_preset = _get_validation_preset(num_train_rows=num_train_rows, hpo_enabled=hpo_enabled)
+
+    # Independent of `auto_stack`
+    if use_bag_holdout is None:
+        use_bag_holdout = cv_preset["use_bag_holdout"]
+    if holdout_frac is None:
+        holdout_frac = cv_preset["holdout_frac"]
+    if dynamic_stacking is None:
+        dynamic_stacking = not use_bag_holdout
+    if refit_full is None:
+        refit_full = False
+
+    # Changed by `auto_stack`
+    if num_bag_folds is None:
+        # `num_bag_folds == 0` -> only use holdout validation
+        num_bag_folds = cv_preset["num_bag_folds"] if auto_stack else 0
+    if num_bag_sets is None:
+        # `num_bag_sets == 1` -> no repeats
+        num_bag_sets = cv_preset["num_bag_sets"] if auto_stack else 1
+    if num_stack_levels is None:
+        # Disable multi-layer stacking by default
+        num_stack_levels = 0
+
+    # Activate multi-layer stacking for `auto_stack` if
+    if auto_stack and (
+        dynamic_stacking  # -> We use dynamic stacking
+        or
+        # -> We have holdout validation or a non-binary problem with more than 750 training rows
+        ((use_bag_holdout or (problem_type != BINARY)) and (num_train_rows >= 750))
+    ):
+        num_stack_levels = 1
+
+    return (
+        num_bag_folds,
+        num_bag_sets,
+        num_stack_levels,
+        dynamic_stacking,
+        use_bag_holdout,
+        holdout_frac,
+        refit_full,
+    )
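For orientation, here is a small hypothetical sketch (not part of the diff) of how this heuristic resolves pipeline settings that were left unspecified; the argument values below are illustrative only.

from autogluon.tabular.configs.pipeline_presets import get_validation_and_stacking_method

# All validation/stacking arguments are left as None so the heuristic fills them in.
(
    num_bag_folds,      # -> 8, from min(8, max(5, floor(50_000 / 10)))
    num_bag_sets,       # -> 1, no repeated bagging
    num_stack_levels,   # -> 1, auto_stack with dynamic stacking enabled
    dynamic_stacking,   # -> True, because use_bag_holdout is False below 1,000,000 rows
    use_bag_holdout,    # -> False
    holdout_frac,       # rounded output of default_holdout_frac(...)
    refit_full,         # -> False
) = get_validation_and_stacking_method(
    num_bag_folds=None,
    num_bag_sets=None,
    use_bag_holdout=None,
    holdout_frac=None,
    auto_stack=True,
    num_stack_levels=None,
    dynamic_stacking=None,
    refit_full=None,
    num_train_rows=50_000,   # illustrative dataset size
    problem_type="binary",
    hpo_enabled=False,
)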
@@ -6,7 +6,6 @@ tabular_presets_dict = dict(
     best_quality={
         "auto_stack": True,
         "dynamic_stacking": "auto",
-        "num_bag_sets": 1,
         "hyperparameters": "zeroshot",
         "time_limit": 3600,
     },
@@ -16,7 +15,6 @@ tabular_presets_dict = dict(
     high_quality={
         "auto_stack": True,
         "dynamic_stacking": "auto",
-        "num_bag_sets": 1,
         "hyperparameters": "zeroshot",
         "time_limit": 3600,
         "refit_full": True,
@@ -29,7 +27,6 @@ tabular_presets_dict = dict(
     good_quality={
         "auto_stack": True,
         "dynamic_stacking": "auto",
-        "num_bag_sets": 1,
         "hyperparameters": "light",
         "time_limit": 3600,
         "refit_full": True,
@@ -3,6 +3,7 @@ from autogluon.core.models.abstract.abstract_model import AbstractModel
 from .automm.automm_model import MultiModalPredictorModel
 from .automm.ft_transformer import FTTransformerModel
 from .catboost.catboost_model import CatBoostModel
+from .ebm.ebm_model import EBMModel
 from .fastainn.tabular_nn_fastai import NNFastAiTabularModel
 from .fasttext.fasttext_model import FastTextModel
 from .image_prediction.image_predictor import ImagePredictorModel
@@ -39,6 +39,7 @@ class CatBoostModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         SOFTCLASS: 60
     })
+    seed_name = "random_seed"
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -48,7 +49,6 @@ class CatBoostModel(AbstractModel):
         default_params = get_param_baseline(problem_type=self.problem_type)
         for param, val in default_params.items():
             self._set_default_param_value(param, val)
-        self._set_default_param_value("random_seed", 0)  # Remove randomness for reproducibility
         # Set 'allow_writing_files' to True in order to keep log files created by catboost during training (these will be saved in the directory where AutoGluon stores this model)
         self._set_default_param_value("allow_writing_files", False)  # Disables creation of catboost logging files during training by default
         if self.problem_type != SOFTCLASS:  # TODO: remove this after catboost 0.24
@@ -126,6 +126,7 @@ class CatBoostModel(AbstractModel):
 
         ag_params = self._get_ag_params()
         params = self._get_model_params()
+
         params["thread_count"] = num_cpus
         if self.problem_type == SOFTCLASS:
             # FIXME: This is extremely slow due to unoptimized metric / objective sent to CatBoost
@@ -310,6 +311,8 @@ class CatBoostModel(AbstractModel):
         max_memory_iters = math.floor(available_mem * max_memory_proportion / mem_usage_per_iter)
 
         final_iters = min(default_iters, min(max_memory_iters, estimated_iters_in_time))
+        if final_iters < 1:
+            raise TimeLimitExceeded
         return final_iters
 
     def _predict_proba(self, X, **kwargs):
File without changes
@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+import time
+import warnings
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+from autogluon.core.models import AbstractModel
+
+from .hyperparameters.parameters import get_param_baseline
+from .hyperparameters.searchspaces import get_default_searchspace
+
+if TYPE_CHECKING:
+    from autogluon.core.metrics import Scorer
+
+
+class EbmCallback:
+    """Time limit callback for EBM."""
+
+    def __init__(self, seconds: float):
+        self.seconds = seconds
+        self.end_time: float | None = None
+
+    def __call__(self, *args, **kwargs):
+        if self.end_time is None:
+            self.end_time = time.monotonic() + self.seconds
+            return False
+        return time.monotonic() > self.end_time
+
+
+class EBMModel(AbstractModel):
+    """
+    The Explainable Boosting Machine (EBM) is a glass-box generalized additive model
+    with automatic interaction detection (https://interpret.ml/docs). EBMs are
+    designed to be highly interpretable while achieving accuracy comparable to
+    black-box models on a wide range of tabular datasets.
+
+    Requires the 'interpret' or 'interpret-core' package. Install via:
+
+        pip install interpret
+
+
+    Paper: InterpretML: A Unified Framework for Machine Learning Interpretability
+
+    Authors: H. Nori, S. Jenkins, P. Koch, and R. Caruana 2019
+
+    Codebase: https://github.com/interpretml/interpret
+
+    License: MIT
+
+    .. versionadded:: 1.5.0
+    """
+
+    ag_key = "EBM"
+    ag_name = "EBM"
+    ag_priority = 35
+    seed_name = "random_state"
+
+    def _fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        X_val: pd.DataFrame | None = None,
+        y_val: pd.Series | None = None,
+        time_limit: float | None = None,
+        sample_weight: np.ndarray | None = None,
+        sample_weight_val: np.ndarray | None = None,
+        num_cpus: int | str = "auto",
+        **kwargs,
+    ):
+        # Preprocess data.
+        X = self.preprocess(X)
+        if X_val is not None:
+            X_val = self.preprocess(X_val)
+
+        features = self._features
+        if features is None:
+            features = X.columns
+
+        params = construct_ebm_params(
+            self.problem_type,
+            self._get_model_params(),
+            features,
+            self.stopping_metric,
+            num_cpus,
+            time_limit,
+        )
+
+        # Init Class
+        model_cls = get_class_from_problem_type(self.problem_type)
+        self.model = model_cls(**params)
+
+        # Handle validation data format for EBM
+        fit_X = X
+        fit_y = y
+        fit_sample_weight = sample_weight
+        bags = None
+        if X_val is not None:
+            fit_X = pd.concat([X, X_val], ignore_index=True)
+            fit_y = pd.concat([y, y_val], ignore_index=True)
+            if sample_weight is not None:
+                fit_sample_weight = np.hstack([sample_weight, sample_weight_val])
+            bags = np.full((len(fit_X), 1), 1, np.int8)
+            bags[len(X) :, 0] = -1
+
+        with warnings.catch_warnings():  # try to filter joblib warnings
+            warnings.filterwarnings(
+                "ignore",
+                category=UserWarning,
+                message=".*resource_tracker: process died.*",
+            )
+            self.model.fit(fit_X, fit_y, sample_weight=fit_sample_weight, bags=bags)
+
+    def _set_default_params(self):
+        default_params = get_param_baseline(problem_type=self.problem_type, num_classes=self.num_classes)
+        for param, val in default_params.items():
+            self._set_default_param_value(param, val)
+
+    def _get_default_searchspace(self):
+        return get_default_searchspace(problem_type=self.problem_type, num_classes=self.num_classes)
+
+    def _get_default_auxiliary_params(self) -> dict:
+        default_auxiliary_params = super()._get_default_auxiliary_params()
+        extra_auxiliary_params = {
+            "valid_raw_types": ["int", "float", "category"],
+        }
+        default_auxiliary_params.update(extra_auxiliary_params)
+        return default_auxiliary_params
+
+    @classmethod
+    def supported_problem_types(cls) -> list[str] | None:
+        return ["binary", "multiclass", "regression"]
+
+    @classmethod
+    def _class_tags(cls) -> dict:
+        return {"can_estimate_memory_usage_static": True}
+
+    def _more_tags(self) -> dict:
+        """EBMs support refit full."""
+        return {"can_refit_full": True}
+
+    def _estimate_memory_usage(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs) -> int:
+        return self.estimate_memory_usage_static(
+            X=X,
+            y=y,
+            hyperparameters=self._get_model_params(),
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            features=self._features,
+            **kwargs,
+        )
+
+    @classmethod
+    def _estimate_memory_usage_static(
+        cls,
+        *,
+        X: pd.DataFrame,
+        y: pd.Series | None = None,
+        hyperparameters: dict | None = None,
+        problem_type: str = "infer",
+        num_classes: int = 1,
+        features=None,
+        **kwargs,
+    ) -> int:
+        """Returns the expected peak memory usage in bytes of the EBM model during fit."""
+        # TODO: we can improve the memory estimate slightly by using num_classes if y is None
+
+        if features is None:
+            features = X.columns
+
+        model_cls = get_class_from_problem_type(problem_type)
+        params = construct_ebm_params(problem_type, hyperparameters, features)
+        baseline_memory_bytes = 400_000_000  # 400 MB baseline memory
+
+        # assuming we call pd.concat([X, X_val], ignore_index=True), then X size will be doubled
+        return baseline_memory_bytes + model_cls(**params).estimate_mem(
+            X, y, data_multiplier=2.0
+        )
+
+    def _validate_fit_memory_usage(self, mem_error_threshold: float = 1, **kwargs):
+        # Given the good mem estimates with overhead, we set the threshold to 1.
+        return super()._validate_fit_memory_usage(
+            mem_error_threshold=mem_error_threshold, **kwargs
+        )
+
+
+def construct_ebm_params(
+    problem_type,
+    hyperparameters=None,
+    features=None,
+    stopping_metric=None,
+    num_cpus=-1,
+    time_limit=None,
+):
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    hyperparameters = hyperparameters.copy()  # we pop values below, so copy.
+
+    # The user can specify nominal and continuous columns.
+    continuous_columns = hyperparameters.pop("continuous_columns", [])
+    nominal_columns = hyperparameters.pop("nominal_columns", [])
+
+    feature_types = None
+    if features is not None:
+        feature_types = []
+        for c in features:
+            if c in continuous_columns:
+                f_type = "continuous"
+            elif c in nominal_columns:
+                f_type = "nominal"
+            else:
+                f_type = "auto"
+            feature_types.append(f_type)
+
+    # Default parameters for EBM
+    params = {
+        "outer_bags": 1,  # AutoGluon ensemble creates outer bags, no need for this overhead.
+        "n_jobs": 1,  # EBM only parallelizes across outer bags currently, so ignore num_cpus
+        "feature_names": features,
+        "feature_types": feature_types,
+    }
+    if stopping_metric is not None:
+        params["objective"] = get_metric_from_ag_metric(
+            metric=stopping_metric, problem_type=problem_type
+        )
+    if time_limit is not None:
+        params["callback"] = EbmCallback(time_limit)
+
+    params.update(hyperparameters)
+    return params
+
+
+def get_class_from_problem_type(problem_type: str):
+    if problem_type in [BINARY, MULTICLASS]:
+        from interpret.glassbox import ExplainableBoostingClassifier
+
+        model_cls = ExplainableBoostingClassifier
+    elif problem_type == REGRESSION:
+        from interpret.glassbox import ExplainableBoostingRegressor
+
+        model_cls = ExplainableBoostingRegressor
+    else:
+        raise ValueError(f"Unsupported problem type: {problem_type}")
+    return model_cls
+
+
+def get_metric_from_ag_metric(*, metric: Scorer, problem_type: str):
+    """Map AutoGluon metric to EBM metric for early stopping."""
+    if problem_type in [BINARY, MULTICLASS]:
+        metric_class = "log_loss"
+    elif problem_type == REGRESSION:
+        metric_class = "rmse"
+    else:
+        raise AssertionError(f"EBM does not support {problem_type} problem type.")
+
+    return metric_class
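As a rough usage sketch (not part of the diff): once this model class is registered, it should be selectable through its key "EBM". The dataset URL and label column below are the ones used in the AutoGluon tutorials and are illustrative only.

from autogluon.tabular import TabularDataset, TabularPredictor

# Illustrative data; any tabular DataFrame with a label column works.
train_data = TabularDataset("https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv")

predictor = TabularPredictor(label="class").fit(
    train_data,
    hyperparameters={"EBM": {}},  # fit only the Explainable Boosting Machine
    time_limit=300,               # forwarded to the model and enforced via EbmCallback
)
print(predictor.leaderboard())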
@@ -0,0 +1,39 @@
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION, SOFTCLASS
+
+def get_param_baseline(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_param_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == SOFTCLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_param_regression_baseline()
+    else:
+        return get_param_binary_baseline()
+
+
+def get_base_params():
+    base_params = {}
+    return base_params
+
+
+def get_param_binary_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_multiclass_baseline(num_classes):
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_regression_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
@@ -0,0 +1,72 @@
+"""Default hyperparameter search spaces used in EBM model"""
+
+from autogluon.common import space
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+
+def get_default_searchspace(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_searchspace_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_searchspace_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_searchspace_regression_baseline()
+    else:
+        return get_searchspace_binary_baseline()
+
+
+def get_base_searchspace():
+    base_params = {
+        "max_leaves": space.Int(2, 3, default=2),
+        "smoothing_rounds": space.Int(0, 1000, default=200),
+        "learning_rate": space.Real(0.0025, 0.2, default=0.02, log=True),
+        "interactions": space.Categorical(
+            0,
+            "0.5x",
+            "1x",
+            "1.5x",
+            "2x",
+            "2.5x",
+            "3x",
+            "3.5x",
+            "4x",
+            "4.5x",
+            "5x",
+            "6x",
+            "7x",
+            "8x",
+            "9x",
+            "10x",
+            "15x",
+            "20x",
+            "25x",
+        ),
+        "interaction_smoothing_rounds": space.Int(0, 200, default=90),
+        "min_hessian": space.Real(1e-10, 1e-2, default=1e-4, log=True),
+        "min_samples_leaf": space.Int(2, 20, default=4),
+        "gain_scale": space.Real(0.5, 5.0, default=5.0, log=True),
+        "min_cat_samples": space.Int(5, 20, default=10),
+        "cat_smooth": space.Real(5.0, 100.0, default=10.0, log=True),
+        "missing": space.Categorical("separate", "low", "high", "gain"),
+    }
+    return base_params
+
+
+def get_searchspace_multiclass_baseline(num_classes):
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_binary_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_regression_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
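A hedged sketch of how this default search space might be exercised (not part of the diff): it assumes that enabling hyperparameter tuning makes AutoGluon draw EBM trials from get_default_searchspace(); the dataset URL, label column, and trial count are illustrative.

from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset("https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv")

predictor = TabularPredictor(label="class").fit(
    train_data,
    hyperparameters={"EBM": {}},
    # Each trial samples max_leaves, learning_rate, interactions, etc. from the space above.
    hyperparameter_tune_kwargs={"num_trials": 20, "scheduler": "local", "searcher": "random"},
    time_limit=600,
)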
@@ -103,6 +103,7 @@ class NNFastAiTabularModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         MULTICLASS: 95,
     })
+    seed_name = "random_seed"
 
     model_internals_file_name = "model-internals.pkl"
 
@@ -322,8 +323,9 @@ class NNFastAiTabularModel(AbstractModel):
         # Make deterministic
         from fastai.torch_core import set_seed
 
-        set_seed(0, True)
-        dls.rng.seed(0)
+        random_seed = params.pop(self.seed_name, self.default_random_seed)
+        set_seed(random_seed, True)
+        dls.rng.seed(random_seed)
 
         if self.problem_type == QUANTILE:
             dls.c = len(self.quantile_levels)
@@ -214,7 +214,7 @@ class KNNModel(AbstractModel):
         def sample_func(chunk, frac):
             # Guarantee at least 1 sample (otherwise log_loss would crash or model would return different column counts in pred_proba)
             n = max(math.ceil(len(chunk) * frac), 1)
-            return chunk.sample(n=n, replace=False, random_state=0)
+            return chunk.sample(n=n, replace=False, random_state=self.random_seed)
 
         if self.problem_type != REGRESSION:
             y_df = y.to_frame(name="label").reset_index(drop=True)
@@ -255,9 +255,13 @@
         self._X_unused_index = [i for i in range(num_rows_max) if i not in idx]
         return self.model
 
-    def _get_maximum_resources(self) -> Dict[str, Union[int, float]]:
+    def _get_maximum_resources(self) -> dict[str, int | float]:
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020
-        return {"num_cpus": 32}
+        # no GPU support
+        return {
+            "num_cpus": 32,
+            "num_gpus": 0,
+        }
 
     def _get_default_resources(self):
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020