autogluon.tabular 1.3.2b20250610__py3-none-any.whl → 1.4.1b20251214__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. autogluon/tabular/configs/config_helper.py +1 -1
  2. autogluon/tabular/configs/hyperparameter_configs.py +2 -265
  3. autogluon/tabular/configs/pipeline_presets.py +130 -0
  4. autogluon/tabular/configs/presets_configs.py +51 -26
  5. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +0 -1
  6. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +310 -0
  7. autogluon/tabular/models/__init__.py +6 -1
  8. autogluon/tabular/models/_utils/rapids_utils.py +1 -1
  9. autogluon/tabular/models/automm/automm_model.py +2 -0
  10. autogluon/tabular/models/automm/ft_transformer.py +4 -1
  11. autogluon/tabular/models/catboost/callbacks.py +3 -2
  12. autogluon/tabular/models/catboost/catboost_model.py +15 -9
  13. autogluon/tabular/models/catboost/catboost_utils.py +17 -3
  14. autogluon/tabular/models/ebm/__init__.py +0 -0
  15. autogluon/tabular/models/ebm/ebm_model.py +259 -0
  16. autogluon/tabular/models/ebm/hyperparameters/__init__.py +0 -0
  17. autogluon/tabular/models/ebm/hyperparameters/parameters.py +39 -0
  18. autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +72 -0
  19. autogluon/tabular/models/fastainn/tabular_nn_fastai.py +7 -5
  20. autogluon/tabular/models/knn/knn_model.py +7 -3
  21. autogluon/tabular/models/lgb/lgb_model.py +60 -21
  22. autogluon/tabular/models/lr/lr_model.py +6 -1
  23. autogluon/tabular/models/lr/lr_preprocessing_utils.py +6 -7
  24. autogluon/tabular/models/lr/lr_rapids_model.py +45 -5
  25. autogluon/tabular/models/mitra/__init__.py +0 -0
  26. autogluon/tabular/models/mitra/_internal/__init__.py +1 -0
  27. autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -0
  28. autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +190 -0
  29. autogluon/tabular/models/mitra/_internal/config/config_run.py +32 -0
  30. autogluon/tabular/models/mitra/_internal/config/enums.py +162 -0
  31. autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -0
  32. autogluon/tabular/models/mitra/_internal/core/callbacks.py +94 -0
  33. autogluon/tabular/models/mitra/_internal/core/get_loss.py +54 -0
  34. autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +108 -0
  35. autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +67 -0
  36. autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +132 -0
  37. autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +373 -0
  38. autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -0
  39. autogluon/tabular/models/mitra/_internal/data/collator.py +46 -0
  40. autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +136 -0
  41. autogluon/tabular/models/mitra/_internal/data/dataset_split.py +57 -0
  42. autogluon/tabular/models/mitra/_internal/data/preprocessor.py +420 -0
  43. autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -0
  44. autogluon/tabular/models/mitra/_internal/models/base.py +21 -0
  45. autogluon/tabular/models/mitra/_internal/models/embedding.py +182 -0
  46. autogluon/tabular/models/mitra/_internal/models/tab2d.py +667 -0
  47. autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -0
  48. autogluon/tabular/models/mitra/_internal/utils/set_seed.py +15 -0
  49. autogluon/tabular/models/mitra/mitra_model.py +380 -0
  50. autogluon/tabular/models/mitra/sklearn_interface.py +494 -0
  51. autogluon/tabular/models/realmlp/__init__.py +0 -0
  52. autogluon/tabular/models/realmlp/realmlp_model.py +360 -0
  53. autogluon/tabular/models/rf/rf_model.py +11 -6
  54. autogluon/tabular/models/tabicl/__init__.py +0 -0
  55. autogluon/tabular/models/tabicl/tabicl_model.py +179 -0
  56. autogluon/tabular/models/tabm/__init__.py +0 -0
  57. autogluon/tabular/models/tabm/_tabm_internal.py +545 -0
  58. autogluon/tabular/models/tabm/rtdl_num_embeddings.py +810 -0
  59. autogluon/tabular/models/tabm/tabm_model.py +356 -0
  60. autogluon/tabular/models/tabm/tabm_reference.py +631 -0
  61. autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +13 -7
  62. autogluon/tabular/models/tabpfnv2/__init__.py +0 -0
  63. autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +20 -0
  64. autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +40 -0
  65. autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +201 -0
  66. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +1464 -0
  67. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +747 -0
  68. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +863 -0
  69. autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +106 -0
  70. autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +388 -0
  71. autogluon/tabular/models/tabular_nn/hyperparameters/parameters.py +1 -3
  72. autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +5 -5
  73. autogluon/tabular/models/xgboost/xgboost_model.py +10 -3
  74. autogluon/tabular/predictor/predictor.py +147 -84
  75. autogluon/tabular/registry/_ag_model_registry.py +12 -2
  76. autogluon/tabular/testing/fit_helper.py +57 -27
  77. autogluon/tabular/testing/generate_datasets.py +7 -0
  78. autogluon/tabular/trainer/abstract_trainer.py +3 -1
  79. autogluon/tabular/trainer/model_presets/presets.py +10 -1
  80. autogluon/tabular/version.py +1 -1
  81. autogluon.tabular-1.4.1b20251214-py3.11-nspkg.pth +1 -0
  82. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/METADATA +112 -57
  83. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/RECORD +89 -40
  84. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/WHEEL +1 -1
  85. autogluon/tabular/models/tabpfn/__init__.py +0 -1
  86. autogluon/tabular/models/tabpfn/tabpfn_model.py +0 -153
  87. autogluon.tabular-1.3.2b20250610-py3.9-nspkg.pth +0 -1
  88. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info/licenses}/LICENSE +0 -0
  89. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info/licenses}/NOTICE +0 -0
  90. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/namespace_packages.txt +0 -0
  91. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/top_level.txt +0 -0
  92. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/zip-safe +0 -0
@@ -0,0 +1,360 @@
1
+ """
2
+ Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/realmlp/realmlp_model.py
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ import math
9
+ import time
10
+ from contextlib import contextmanager
11
+ from typing import Literal
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ from sklearn.impute import SimpleImputer
16
+
17
+ from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
18
+ from autogluon.common.utils.resource_utils import ResourceManager
19
+ from autogluon.core.models import AbstractModel
20
+ from autogluon.tabular import __version__
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@contextmanager
def set_logger_level(logger_name: str, level: int):
    """
    Temporarily override the level of a named logger.

    The logger's current level is recorded on entry, replaced by ``level`` for the
    duration of the ``with`` body, and restored on exit, including when the body raises.

    Parameters
    ----------
    logger_name : str
        Name passed to ``logging.getLogger`` to look up the logger.
    level : int
        Logging level to apply inside the context (e.g. ``logging.WARNING``).
    """
    target_logger = logging.getLogger(logger_name)
    previous_level = target_logger.level
    target_logger.setLevel(level)
    try:
        yield
    finally:
        # Always restore the level that was set explicitly on this logger before entry.
        target_logger.setLevel(previous_level)
34
+
35
+
36
+ # pip install pytabkit
37
class RealMLPModel(AbstractModel):
    """
    RealMLP is an improved multilayer perceptron (MLP) model
    through a bag of tricks and better default hyperparameters.

    RealMLP is the top performing method overall on TabArena-v0.1: https://tabarena.ai

    Paper: Better by Default: Strong Pre-Tuned MLPs and Boosted Trees on Tabular Data
    Authors: David Holzmüller, Léo Grinsztajn, Ingo Steinwart
    Codebase: https://github.com/dholzmueller/pytabkit
    License: Apache-2.0

    .. versionadded:: 1.4.0
    """
    ag_key = "REALMLP"
    ag_name = "RealMLP"
    ag_priority = 75
    seed_name = "random_state"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Preprocessing state; populated by `_preprocess(..., is_train=True)`.
        self._imputer = None
        self._features_to_impute = None
        self._features_to_keep = None
        self._indicator_columns = None
        self._features_bool = None
        self._bool_to_cat = None

    def get_model_cls(self, default_hyperparameters: Literal["td", "td_s"] = "td"):
        """
        Return the pytabkit estimator class matching the problem type and default set.

        Parameters
        ----------
        default_hyperparameters : {"td", "td_s"}, default "td"
            Selects the `RealMLP_TD_*` ("td") or `RealMLP_TD_S_*` ("td_s") estimator family.

        Returns
        -------
        The classifier class for "binary"/"multiclass" problem types, otherwise the regressor class.
        """
        from pytabkit import (
            RealMLP_TD_Classifier,
            RealMLP_TD_Regressor,
            RealMLP_TD_S_Classifier,
            RealMLP_TD_S_Regressor,
        )

        assert default_hyperparameters in ["td", "td_s"]
        if self.problem_type in ['binary', 'multiclass']:
            if default_hyperparameters == "td":
                model_cls = RealMLP_TD_Classifier
            else:
                model_cls = RealMLP_TD_S_Classifier
        else:
            if default_hyperparameters == "td":
                model_cls = RealMLP_TD_Regressor
            else:
                model_cls = RealMLP_TD_S_Regressor
        return model_cls

    def _fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        X_val: pd.DataFrame = None,
        y_val: pd.Series = None,
        time_limit: float = None,
        num_cpus: int = 1,
        num_gpus: float = 0,
        verbosity: int = 2,
        **kwargs,
    ):
        """
        Build the pytabkit RealMLP estimator from the model hyperparameters and fit it.

        Parameters
        ----------
        X, y : training features and labels.
        X_val, y_val : optional validation data; when `X_val` is None, early stopping
            and the internal validation split are disabled.
        time_limit : optional time budget in seconds; the time already spent in this
            method is subtracted before it is handed to the estimator.
        num_cpus : forwarded to the estimator as `n_threads`.
        num_gpus : 0 selects the "cpu" device, anything else selects "cuda:0".
        verbosity : controls the level applied to the "lightning.pytorch" logger during fit.

        Raises
        ------
        ImportError
            If pytabkit or torch cannot be imported.
        AssertionError
            If a GPU was requested but CUDA is not available.
        """
        start_time = time.time()

        try:
            import pytabkit
            import torch
        except ImportError as err:
            logger.log(
                40,
                f"\tFailed to import pytabkit/torch! To use the ReaLMLP model, "
                f"do: `pip install autogluon.tabular[realmlp]=={__version__}`.",
            )
            raise err

        if verbosity == 0:
            _lightning_log_level = logging.ERROR
        elif verbosity <= 2:
            _lightning_log_level = logging.WARNING
        else:
            _lightning_log_level = logging.INFO

        # FIXME: code assume we only see one GPU in the fit process.
        device = "cpu" if num_gpus == 0 else "cuda:0"
        if (device == "cuda:0") and (not torch.cuda.is_available()):
            raise AssertionError(
                "Fit specified to use GPU, but CUDA is not available on this machine. "
                "Please switch to CPU usage instead.",
            )

        hyp = self._get_model_params()

        default_hyperparameters = hyp.pop("default_hyperparameters", "td")

        model_cls = self.get_model_cls(default_hyperparameters=default_hyperparameters)

        # Maps AutoGluon metric names to RealMLP validation metric names.
        metric_map = {
            "roc_auc": "1-auc_ovr_alt",
            "accuracy": "class_error",
            "balanced_accuracy": "1-balanced_accuracy",
            "log_loss": "cross_entropy",
            "rmse": "rmse",
            "root_mean_squared_error": "rmse",
            "r2": "rmse",
            "mae": "mae",
            # AutoGluon's MAE scorer is named "mean_absolute_error"; without this key
            # the MAE stopping metric was never forwarded to RealMLP.
            "mean_absolute_error": "mae",
            # Legacy misspelled key, kept so any existing reliance on it keeps working.
            "mean_average_error": "mae",
        }

        val_metric_name = metric_map.get(self.stopping_metric.name, None)

        init_kwargs = dict()

        if val_metric_name is not None:
            init_kwargs["val_metric_name"] = val_metric_name

        # TODO: Make this smarter? Maybe use `eval_metric.needs_pred`
        use_ls = hyp["use_ls"]
        if isinstance(use_ls, str) and use_ls == "auto":
            if val_metric_name is None:
                hyp["use_ls"] = False
            elif val_metric_name in ["cross_entropy", "1-auc_ovr_alt"]:
                hyp["use_ls"] = False
            else:
                # None defers the label-smoothing choice to the estimator's own default.
                hyp["use_ls"] = None

        if X_val is None:
            hyp["use_early_stopping"] = False
            hyp["val_fraction"] = 0

        # These keys configure AutoGluon-side preprocessing and must not reach the estimator.
        bool_to_cat = hyp.pop("bool_to_cat", True)
        impute_bool = hyp.pop("impute_bool", True)
        name_categories = hyp.pop("name_categories", True)

        n_features = len(X.columns)
        predict_batch_size = hyp.get("predict_batch_size", None)
        if isinstance(predict_batch_size, str) and predict_batch_size == "auto":
            # simple heuristic to avoid OOM during inference time
            # note: this isn't fool-proof, and ignores the actual memory availability of the machine.
            # note: this is based on an assumption of 32 GB of memory available on the instance
            # default is 1024
            hyp["predict_batch_size"] = max(min(int(8192 * 200 / n_features), 8192), 64)

        self.model = model_cls(
            n_threads=num_cpus,
            device=device,
            **init_kwargs,
            **hyp,
        )

        X = self.preprocess(X, is_train=True, bool_to_cat=bool_to_cat, impute_bool=impute_bool)

        # FIXME: In rare cases can cause exceptions if name_categories=False, unknown why
        extra_fit_kwargs = {}
        if name_categories:
            cat_col_names = X.select_dtypes(include='category').columns.tolist()
            extra_fit_kwargs["cat_col_names"] = cat_col_names

        if X_val is not None:
            X_val = self.preprocess(X_val)

        with set_logger_level("lightning.pytorch", _lightning_log_level):
            self.model = self.model.fit(
                X=X,
                y=y,
                X_val=X_val,
                y_val=y_val,
                time_to_fit_in_seconds=time_limit - (time.time() - start_time) if time_limit is not None else None,
                **extra_fit_kwargs,
            )

    def _predict_proba(self, X, **kwargs) -> np.ndarray:
        """Run the parent prediction with the "lightning.pytorch" logger limited to WARNING."""
        with set_logger_level("lightning.pytorch", logging.WARNING):
            # Unpack kwargs; the previous `kwargs=kwargs` passed the whole dict as a
            # single keyword argument literally named "kwargs".
            return super()._predict_proba(X=X, **kwargs)

    # TODO: Move missing indicator + mean fill to a generic preprocess flag available to all models
    # FIXME: bool_to_cat is a hack: Maybe move to abstract model?
    def _preprocess(self, X: pd.DataFrame, is_train: bool = False, bool_to_cat: bool = False, impute_bool: bool = True, **kwargs) -> pd.DataFrame:
        """
        Imputes missing values via the mean and adds indicator columns for numerical features.
        Converts indicator columns to categorical features to avoid them being treated as numerical by RealMLP.

        Parameters
        ----------
        X : features to transform.
        is_train : when True, the imputer and the feature lists are (re)computed from `X`
            and stored on the instance; otherwise the stored state is reused.
        bool_to_cat : only read when `is_train=True`; if True, bool features are cast to category.
        impute_bool : only read when `is_train=True`; if False, bool features are excluded
            from the imputed set and kept as-is.

        Returns
        -------
        The transformed copy of `X`.
        """
        X = super()._preprocess(X, **kwargs)

        # FIXME: is copy needed?
        X = X.copy(deep=True)
        if is_train:
            self._bool_to_cat = bool_to_cat
            self._features_bool = self._feature_metadata.get_features(required_special_types=["bool"])
            if impute_bool:  # Technically this should do nothing useful because bools will never have NaN
                self._features_to_impute = self._feature_metadata.get_features(valid_raw_types=["int", "float"])
                self._features_to_keep = self._feature_metadata.get_features(invalid_raw_types=["int", "float"])
            else:
                self._features_to_impute = self._feature_metadata.get_features(valid_raw_types=["int", "float"], invalid_special_types=["bool"])
                self._features_to_keep = [f for f in self._feature_metadata.get_features() if f not in self._features_to_impute]
            if self._features_to_impute:
                self._imputer = SimpleImputer(strategy="mean", add_indicator=True)
                self._imputer.fit(X=X[self._features_to_impute])
                # Columns the imputer adds beyond the inputs are the missing-value indicators.
                self._indicator_columns = [c for c in self._imputer.get_feature_names_out() if c not in self._features_to_impute]
        if self._imputer is not None:
            X_impute = self._imputer.transform(X=X[self._features_to_impute])
            X_impute = pd.DataFrame(X_impute, index=X.index, columns=self._imputer.get_feature_names_out())
            if self._indicator_columns:
                # FIXME: Use CategoryFeatureGenerator? Or tell the model which is category
                # TODO: Add to features_bool?
                X_impute[self._indicator_columns] = X_impute[self._indicator_columns].astype("category")
            X = pd.concat([X[self._features_to_keep], X_impute], axis=1)
        if self._bool_to_cat and self._features_bool:
            # FIXME: Use CategoryFeatureGenerator? Or tell the model which is category
            X[self._features_bool] = X[self._features_bool].astype("category")
        return X

    def _set_default_params(self):
        """Register the default hyperparameter values for this model."""
        default_params = dict(
            # Don't use early stopping by default, seems to work well without
            use_early_stopping=False,
            early_stopping_additive_patience=40,
            early_stopping_multiplicative_patience=3,

            # verdict: use_ls="auto" is much better than None.
            use_ls="auto",

            # verdict: no impact, but makes more sense to be False.
            impute_bool=False,

            # verdict: name_categories=True avoids random exceptions being raised in rare cases
            name_categories=True,

            # verdict: bool_to_cat=True is equivalent to False in terms of quality, but can be slightly faster in training time
            #  and slightly slower in inference time
            bool_to_cat=True,

            # verdict: "td" is better than "td_s"
            default_hyperparameters="td",  # options ["td", "td_s"]

            predict_batch_size="auto",  # if auto, uses AutoGluon's heuristic to set a value between 8192 and 64.
        )
        for param, val in default_params.items():
            self._set_default_param_value(param, val)

    @classmethod
    def supported_problem_types(cls) -> list[str] | None:
        """Return the problem types this model can be fit on."""
        return ["binary", "multiclass", "regression"]

    def _get_default_stopping_metric(self):
        """Use the evaluation metric as the stopping metric."""
        return self.eval_metric

    def _get_default_resources(self) -> tuple[int, int]:
        """Return the default `(num_cpus, num_gpus)`: all physical cores and at most one CUDA GPU."""
        # Use only physical cores for better performance based on benchmarks
        num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)

        num_gpus = min(1, ResourceManager.get_gpu_count_torch(cuda_only=True))

        return num_cpus, num_gpus

    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
        """Estimate fit memory usage for `X` using this instance's hyperparameters."""
        hyperparameters = self._get_model_params()
        return self.estimate_memory_usage_static(X=X, problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=hyperparameters, **kwargs)

    @classmethod
    def _estimate_memory_usage_static(
        cls,
        *,
        X: pd.DataFrame,
        hyperparameters: dict = None,
        **kwargs,
    ) -> int:
        """
        Heuristic memory estimate that correlates strongly with RealMLP's more sophisticated method

        The estimate is the sum of a per-column term (scaled by `plr_hidden_1`, `plr_hidden_2`
        and `hidden_width`), 5x the approximate DataFrame size, and a fixed 300 MB overhead.

        More comprehensive memory estimate logic:

        ```python
        from typing import Any

        from pytabkit.models.alg_interfaces.nn_interfaces import NNAlgInterface
        from pytabkit.models.data.data import DictDataset, TensorInfo
        from pytabkit.models.sklearn.default_params import DefaultParams

        def estimate_realmlp_cpu_ram_gb(hparams: dict[str, Any], n_numerical: int, cat_sizes: list[int], n_classes: int,
                                        n_samples: int):
            params = copy.copy(DefaultParams.RealMLP_TD_CLASS if n_classes > 0 else DefaultParams.RealMLP_TD_REG)
            params.update(hparams)

            ds = DictDataset(tensors=None, tensor_infos=dict(x_cont=TensorInfo(feat_shape=[n_numerical]),
                                                             x_cat=TensorInfo(cat_sizes=cat_sizes),
                                                             y=TensorInfo(cat_sizes=[n_classes])), device='cpu',
                             n_samples=n_samples)

            alg_interface = NNAlgInterface(**params)
            res = alg_interface.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=1, split_seeds=[0], n_train=n_samples)
            return res.cpu_ram_gb
        ```

        """
        if hyperparameters is None:
            hyperparameters = {}
        plr_hidden_1 = hyperparameters.get("plr_hidden_1", 16)
        plr_hidden_2 = hyperparameters.get("plr_hidden_2", 4)
        hidden_width = hyperparameters.get("hidden_width", 256)

        num_features = len(X.columns)
        columns_mem_est = num_features * 8e5

        hidden_1_weight = 0.13
        hidden_2_weight = 0.42
        width_factor = math.sqrt(hidden_width / 256 + 0.6)

        columns_mem_est_hidden_1 = columns_mem_est * hidden_1_weight * plr_hidden_1 / 16 * width_factor
        columns_mem_est_hidden_2 = columns_mem_est * hidden_2_weight * plr_hidden_2 / 16 * width_factor
        columns_mem_est = columns_mem_est_hidden_1 + columns_mem_est_hidden_2

        dataset_size_mem_est = 5 * get_approximate_df_mem_usage(X).sum()  # roughly 5x DataFrame memory size
        baseline_overhead_mem_est = 3e8  # 300 MB generic overhead

        mem_estimate = dataset_size_mem_est + columns_mem_est + baseline_overhead_mem_est

        return mem_estimate

    @classmethod
    def _class_tags(cls) -> dict:
        """Declare that the static memory estimate is implemented for this class."""
        return {"can_estimate_memory_usage_static": True}

    def _more_tags(self) -> dict:
        """Declare that this model does not support refit_full."""
        # TODO: Need to add train params support, track best epoch
        #  How to mirror RealMLP learning rate scheduler while forcing stopping at a specific epoch?
        tags = {"can_refit_full": False}
        return tags
@@ -30,6 +30,7 @@ class RFModel(AbstractModel):
30
30
  ag_key = "RF"
31
31
  ag_name = "RandomForest"
32
32
  ag_priority = 80
33
+ seed_name = "random_state"
33
34
 
34
35
  def __init__(self, **kwargs):
35
36
  super().__init__(**kwargs)
@@ -97,7 +98,6 @@ class RFModel(AbstractModel):
97
98
  # This size scales linearly with number of rows.
98
99
  "max_leaf_nodes": 15000,
99
100
  "n_jobs": -1,
100
- "random_state": 0,
101
101
  "bootstrap": True, # Required for OOB estimates, setting to False will raise exception if bagging.
102
102
  # TODO: min_samples_leaf=5 is too large on most problems, however on some datasets it helps a lot (airlines likes >40 min_samples_leaf, adult likes 2 much better than 1)
103
103
  # This value would need to be tuned per dataset, likely very worthwhile.
@@ -151,13 +151,13 @@ class RFModel(AbstractModel):
151
151
  hyperparameters = {}
152
152
  n_estimators_final = hyperparameters.get("n_estimators", 300)
153
153
  if isinstance(n_estimators_final, int):
154
- n_estimators_minimum = min(40, n_estimators_final)
154
+ n_estimators = n_estimators_final
155
155
  else: # if search space
156
- n_estimators_minimum = 40
156
+ n_estimators = 40
157
157
  num_trees_per_estimator = cls._get_num_trees_per_estimator_static(problem_type=problem_type, num_classes=num_classes)
158
158
  bytes_per_estimator = num_trees_per_estimator * len(X) / 60000 * 1e6 # Underestimates by 3x on ExtraTrees
159
- expected_min_memory_usage = int(bytes_per_estimator * n_estimators_minimum)
160
- return expected_min_memory_usage
159
+ expected_memory_usage = int(bytes_per_estimator * n_estimators)
160
+ return expected_memory_usage
161
161
 
162
162
  def _validate_fit_memory_usage(self, mem_error_threshold: float = 0.5, mem_warning_threshold: float = 0.4, mem_size_threshold: int = 1e7, **kwargs):
163
163
  return super()._validate_fit_memory_usage(
@@ -309,8 +309,9 @@ class RFModel(AbstractModel):
309
309
  if self.model.n_outputs_ == 1:
310
310
  self.model.n_classes_ = [self.model.n_classes_]
311
311
  from sklearn.tree._tree import DOUBLE, DTYPE
312
+ from sklearn.utils.validation import check_X_y
312
313
 
313
- X, y = self.model._validate_data(X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE)
314
+ X, y = check_X_y(X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE)
314
315
  if y.ndim == 1:
315
316
  # reshape is necessary to preserve the data contiguity against vs
316
317
  # [:, np.newaxis] that does not.
@@ -367,6 +368,10 @@ class RFModel(AbstractModel):
367
368
 
368
369
  return self._convert_proba_to_unified_form(y_oof_pred_proba)
369
370
 
371
+ def _get_maximum_resources(self) -> dict[str, int | float]:
372
+ # no GPU support
373
+ return {"num_gpus": 0}
374
+
370
375
  def _get_default_auxiliary_params(self) -> dict:
371
376
  default_auxiliary_params = super()._get_default_auxiliary_params()
372
377
  extra_auxiliary_params = dict(
File without changes
@@ -0,0 +1,179 @@
1
+ """
2
+ Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/tabicl/tabicl_model.py
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+
9
+ import pandas as pd
10
+
11
+ from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
12
+ from autogluon.common.utils.resource_utils import ResourceManager
13
+ from autogluon.core.models import AbstractModel
14
+ from autogluon.tabular import __version__
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ # TODO: Verify if crashes when weights are not yet downloaded and fit in parallel
20
class TabICLModel(AbstractModel):
    """
    TabICL is a foundation model for tabular data using in-context learning
    that is scalable to larger datasets than TabPFNv2. It is pretrained purely on synthetic data.
    TabICL currently only supports classification tasks.

    TabICL is one of the top performing methods overall on TabArena-v0.1: https://tabarena.ai

    Paper: TabICL: A Tabular Foundation Model for In-Context Learning on Large Data
    Authors: Jingang Qu, David Holzmüller, Gaël Varoquaux, Marine Le Morvan
    Codebase: https://github.com/soda-inria/tabicl
    License: BSD-3-Clause

    .. versionadded:: 1.4.0
    """
    ag_key = "TABICL"
    ag_name = "TabICL"
    ag_priority = 65
    seed_name = "random_state"

    def get_model_cls(self):
        """
        Return the TabICL estimator class for the current problem type.

        Raises
        ------
        AssertionError
            If the problem type is neither "binary" nor "multiclass".
        """
        from tabicl import TabICLClassifier

        if self.problem_type not in ["binary", "multiclass"]:
            raise AssertionError(f"Unsupported problem_type: {self.problem_type}")
        return TabICLClassifier

    @staticmethod
    def _get_batch_size(n_cells: int):
        """Pick a batch size from the number of table cells: 8 up to 4M cells, 4 up to 6M, otherwise 2."""
        if n_cells <= 4_000_000:
            return 8
        if n_cells <= 6_000_000:
            return 4
        return 2

    def _fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        num_cpus: int = 1,
        num_gpus: int = 0,
        **kwargs,
    ):
        """
        Construct the TabICL estimator from the model hyperparameters and fit it on `X`, `y`.

        `num_cpus` is forwarded as `n_jobs`; a non-zero `num_gpus` selects the "cuda" device.
        When no `batch_size` hyperparameter is given, one is derived from the number of
        cells in `X` before preprocessing.

        Raises
        ------
        ImportError
            If tabicl cannot be imported.
        AssertionError
            If a GPU was requested but CUDA is not available.
        """
        try:
            import tabicl
        except ImportError as err:
            logger.log(
                40,
                f"\tFailed to import tabicl! To use the TabICL model, "
                f"do: `pip install autogluon.tabular[tabicl]=={__version__}`.",
            )
            raise err

        from torch.cuda import is_available

        if num_gpus != 0:
            device = "cuda"
        else:
            device = "cpu"
        if device == "cuda" and not is_available():
            # FIXME: warn instead and switch to CPU.
            raise AssertionError(
                "Fit specified to use GPU, but CUDA is not available on this machine. "
                "Please switch to CPU usage instead.",
            )

        estimator_cls = self.get_model_cls()
        hyp = self._get_model_params()
        if "batch_size" not in hyp:
            n_rows, n_cols = X.shape[0], X.shape[1]
            hyp["batch_size"] = self._get_batch_size(n_rows * n_cols)
        self.model = estimator_cls(
            **hyp,
            device=device,
            n_jobs=num_cpus,
        )
        X = self.preprocess(X)
        self.model = self.model.fit(
            X=X,
            y=y,
        )

    def _get_default_auxiliary_params(self) -> dict:
        """Extend the parent auxiliary params with row and feature limits for this model."""
        aux_params = super()._get_default_auxiliary_params()
        aux_params.update(
            {
                "max_rows": 30000,
                "max_features": 2000,
            }
        )
        return aux_params

    @classmethod
    def supported_problem_types(cls) -> list[str] | None:
        """Return the problem types this model can be fit on."""
        return ["binary", "multiclass"]

    def _get_default_resources(self) -> tuple[int, int]:
        """Return the default `(num_cpus, num_gpus)`: all physical cores and at most one CUDA GPU."""
        # Use only physical cores for better performance based on benchmarks
        cpu_count = ResourceManager.get_cpu_count(only_physical_cores=True)

        gpu_count = min(1, ResourceManager.get_gpu_count_torch(cuda_only=True))
        return cpu_count, gpu_count

    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
        """Estimate fit memory usage for `X` using this instance's hyperparameters."""
        model_params = self._get_model_params()
        return self.estimate_memory_usage_static(
            X=X,
            problem_type=self.problem_type,
            num_classes=self.num_classes,
            hyperparameters=model_params,
            **kwargs,
        )

    @classmethod
    def _estimate_memory_usage_static(
        cls,
        *,
        X: pd.DataFrame,
        hyperparameters: dict = None,
        **kwargs,
    ) -> int:
        """
        Heuristic memory estimate that is very primitive.
        Can be vastly improved.

        The estimate adds three terms: a model term scaled by batch size, rows and features
        (with a 30% buffer and a further 2x factor), 3x the approximate DataFrame size,
        and a fixed 1 GB overhead.
        """
        if hyperparameters is None:
            hyperparameters = {}

        dataset_size_mem_est = 3 * get_approximate_df_mem_usage(X).sum()  # roughly 3x DataFrame memory size
        baseline_overhead_mem_est = 1e9  # 1 GB generic overhead

        n_rows = X.shape[0]
        n_features = X.shape[1]
        if "batch_size" in hyperparameters:
            batch_size = hyperparameters["batch_size"]
        else:
            batch_size = cls._get_batch_size(n_rows * n_features)
        embedding_dim = 128
        bytes_per_float = 4
        model_mem_estimate = 2 * batch_size * embedding_dim * bytes_per_float * (4 + n_rows) * n_features

        model_mem_estimate = model_mem_estimate * 1.3  # add 30% buffer

        # TODO: Observed memory spikes above expected values on large datasets, increasing mem estimate to compensate
        model_mem_estimate = model_mem_estimate * 2.0  # Note: 1.5 is not large enough, still gets OOM

        return model_mem_estimate + dataset_size_mem_est + baseline_overhead_mem_est

    @classmethod
    def _get_default_ag_args_ensemble(cls, **kwargs) -> dict:
        """
        Set fold_fitting_strategy to sequential_local,
        as parallel folding crashes if model weights aren't pre-downloaded.
        """
        ensemble_args = super()._get_default_ag_args_ensemble(**kwargs)
        # FIXME: If parallel, uses way more memory, seems to behave incorrectly, so we force sequential.
        ensemble_args["fold_fitting_strategy"] = "sequential_local"
        # Better to refit the model for faster inference and similar quality as the bag.
        ensemble_args["refit_folds"] = True
        return ensemble_args

    @classmethod
    def _class_tags(cls) -> dict:
        """Declare that the static memory estimate is implemented for this class."""
        return {"can_estimate_memory_usage_static": True}

    def _more_tags(self) -> dict:
        """Declare that this model supports refit_full."""
        return {"can_refit_full": True}
File without changes