autogluon.tabular 1.3.2b20250723__py3-none-any.whl → 1.4.0b20250725__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of autogluon.tabular might be problematic. Click here for more details.
- autogluon/tabular/configs/hyperparameter_configs.py +2 -265
- autogluon/tabular/configs/presets_configs.py +51 -23
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +0 -1
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +309 -0
- autogluon/tabular/models/automm/automm_model.py +2 -0
- autogluon/tabular/models/automm/ft_transformer.py +4 -1
- autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +18 -6
- autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +8 -4
- autogluon/tabular/models/mitra/_internal/data/dataset_split.py +5 -1
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +3 -0
- autogluon/tabular/models/mitra/mitra_model.py +85 -21
- autogluon/tabular/models/mitra/sklearn_interface.py +15 -13
- autogluon/tabular/models/realmlp/realmlp_model.py +13 -6
- autogluon/tabular/models/tabicl/tabicl_model.py +17 -8
- autogluon/tabular/models/tabm/rtdl_num_embeddings.py +3 -0
- autogluon/tabular/models/tabm/tabm_model.py +14 -6
- autogluon/tabular/models/tabm/tabm_reference.py +2 -0
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +4 -0
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +29 -12
- autogluon/tabular/predictor/predictor.py +45 -5
- autogluon/tabular/trainer/abstract_trainer.py +2 -0
- autogluon/tabular/version.py +1 -1
- {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/METADATA +40 -18
- {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/RECORD +31 -30
- /autogluon.tabular-1.3.2b20250723-py3.9-nspkg.pth → /autogluon.tabular-1.4.0b20250725-py3.9-nspkg.pth +0 -0
- {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/LICENSE +0 -0
- {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/NOTICE +0 -0
- {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/WHEEL +0 -0
- {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/zip-safe +0 -0
|
@@ -1,49 +1,56 @@
|
|
|
1
|
-
|
|
2
|
-
# and os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'. The CUBLAS environment variable configures
|
|
3
|
-
# the workspace size for certain CUBLAS operations to ensure reproducibility when using CUDA >= 10.2.
|
|
4
|
-
# Both settings are required to ensure deterministic behavior in operations such as matrix multiplications.
|
|
5
|
-
import os
|
|
6
|
-
|
|
7
|
-
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
|
|
1
|
+
from __future__ import annotations
|
|
8
2
|
|
|
3
|
+
import logging
|
|
9
4
|
import os
|
|
10
5
|
from typing import List, Optional
|
|
11
6
|
|
|
12
7
|
import pandas as pd
|
|
13
|
-
import torch
|
|
14
|
-
import logging
|
|
15
8
|
|
|
16
9
|
from autogluon.common.utils.resource_utils import ResourceManager
|
|
17
10
|
from autogluon.core.models import AbstractModel
|
|
11
|
+
from autogluon.features.generators import LabelEncoderFeatureGenerator
|
|
12
|
+
from autogluon.tabular import __version__
|
|
18
13
|
|
|
19
14
|
logger = logging.getLogger(__name__)
|
|
20
15
|
|
|
21
16
|
|
|
22
|
-
# TODO: Needs memory usage estimate method
|
|
23
17
|
class MitraModel(AbstractModel):
|
|
18
|
+
"""
|
|
19
|
+
Mitra is a tabular foundation model pre-trained purely on synthetic data with the goal
|
|
20
|
+
of optimizing fine-tuning performance over in-context learning performance.
|
|
21
|
+
Mitra was developed by the AutoGluon team @ AWS AI.
|
|
22
|
+
|
|
23
|
+
Mitra's default hyperparameters outperform all methods for small datasets on TabArena-v0.1 (excluding ensembling): https://tabarena.ai
|
|
24
|
+
|
|
25
|
+
Authors: Xiyuan Zhang, Danielle C. Maddix, Junming Yin, Nick Erickson, Abdul Fatir Ansari, Boran Han, Shuai Zhang, Leman Akoglu, Christos Faloutsos, Michael W. Mahoney, Cuixiong Hu, Huzefa Rangwala, George Karypis, Bernie Wang
|
|
26
|
+
Blog Post: https://www.amazon.science/blog/mitra-mixed-synthetic-priors-for-enhancing-tabular-foundation-models
|
|
27
|
+
License: Apache-2.0
|
|
28
|
+
|
|
29
|
+
.. versionadded:: 1.4.0
|
|
30
|
+
"""
|
|
24
31
|
ag_key = "MITRA"
|
|
25
32
|
ag_name = "Mitra"
|
|
26
33
|
weights_file_name = "model.pt"
|
|
27
34
|
ag_priority = 55
|
|
28
35
|
|
|
29
|
-
def __init__(self,
|
|
36
|
+
def __init__(self, **kwargs):
|
|
30
37
|
super().__init__(**kwargs)
|
|
31
|
-
self.problem_type = problem_type
|
|
32
38
|
self._weights_saved = False
|
|
39
|
+
self._feature_generator = None
|
|
33
40
|
|
|
34
41
|
@staticmethod
|
|
35
42
|
def _get_default_device():
|
|
36
43
|
"""Get the best available device for the current system."""
|
|
37
44
|
if ResourceManager.get_gpu_count_torch(cuda_only=True) > 0:
|
|
38
|
-
logger.
|
|
45
|
+
logger.log(15, "Using CUDA GPU")
|
|
39
46
|
return "cuda"
|
|
40
47
|
else:
|
|
41
48
|
return "cpu"
|
|
42
49
|
|
|
43
50
|
def get_model_cls(self):
|
|
44
|
-
from .sklearn_interface import MitraClassifier
|
|
45
|
-
|
|
46
51
|
if self.problem_type in ["binary", "multiclass"]:
|
|
52
|
+
from .sklearn_interface import MitraClassifier
|
|
53
|
+
|
|
47
54
|
model_cls = MitraClassifier
|
|
48
55
|
elif self.problem_type == "regression":
|
|
49
56
|
from .sklearn_interface import MitraRegressor
|
|
@@ -53,6 +60,23 @@ class MitraModel(AbstractModel):
|
|
|
53
60
|
raise AssertionError(f"Unsupported problem_type: {self.problem_type}")
|
|
54
61
|
return model_cls
|
|
55
62
|
|
|
63
|
+
def _preprocess(self, X: pd.DataFrame, is_train: bool = False, **kwargs) -> pd.DataFrame:
|
|
64
|
+
X = super()._preprocess(X, **kwargs)
|
|
65
|
+
|
|
66
|
+
if is_train:
|
|
67
|
+
# X will be the training data.
|
|
68
|
+
self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
|
|
69
|
+
self._feature_generator.fit(X=X)
|
|
70
|
+
|
|
71
|
+
# This converts categorical features to numeric via stateful label encoding.
|
|
72
|
+
if self._feature_generator.features_in:
|
|
73
|
+
X = X.copy()
|
|
74
|
+
X[self._feature_generator.features_in] = self._feature_generator.transform(
|
|
75
|
+
X=X
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
return X
|
|
79
|
+
|
|
56
80
|
def _fit(
|
|
57
81
|
self,
|
|
58
82
|
X: pd.DataFrame,
|
|
@@ -61,11 +85,25 @@ class MitraModel(AbstractModel):
|
|
|
61
85
|
y_val: pd.Series = None,
|
|
62
86
|
time_limit: float = None,
|
|
63
87
|
num_cpus: int = 1,
|
|
88
|
+
num_gpus: float = 0,
|
|
89
|
+
verbosity: int = 2,
|
|
64
90
|
**kwargs,
|
|
65
91
|
):
|
|
66
92
|
# TODO: Reset the number of threads based on the specified num_cpus
|
|
67
93
|
need_to_reset_torch_threads = False
|
|
68
94
|
torch_threads_og = None
|
|
95
|
+
|
|
96
|
+
try:
|
|
97
|
+
model_cls = self.get_model_cls()
|
|
98
|
+
import torch
|
|
99
|
+
except ImportError as err:
|
|
100
|
+
logger.log(
|
|
101
|
+
40,
|
|
102
|
+
f"\tFailed to import Mitra! To use the Mitra model, "
|
|
103
|
+
f"do: `pip install autogluon.tabular[mitra]=={__version__}`.",
|
|
104
|
+
)
|
|
105
|
+
raise err
|
|
106
|
+
|
|
69
107
|
if num_cpus is not None and isinstance(num_cpus, (int, float)):
|
|
70
108
|
torch_threads_og = torch.get_num_threads()
|
|
71
109
|
if torch_threads_og != num_cpus:
|
|
@@ -73,9 +111,14 @@ class MitraModel(AbstractModel):
|
|
|
73
111
|
torch.set_num_threads(num_cpus)
|
|
74
112
|
need_to_reset_torch_threads = True
|
|
75
113
|
|
|
76
|
-
model_cls = self.get_model_cls()
|
|
77
|
-
|
|
78
114
|
hyp = self._get_model_params()
|
|
115
|
+
|
|
116
|
+
if hyp.get("device", None) is None:
|
|
117
|
+
if num_gpus == 0:
|
|
118
|
+
hyp["device"] = "cpu"
|
|
119
|
+
else:
|
|
120
|
+
hyp["device"] = self._get_default_device()
|
|
121
|
+
|
|
79
122
|
if "state_dict_classification" in hyp:
|
|
80
123
|
state_dict_classification = hyp.pop("state_dict_classification")
|
|
81
124
|
if self.problem_type in ["binary", "multiclass"]:
|
|
@@ -85,11 +128,14 @@ class MitraModel(AbstractModel):
|
|
|
85
128
|
if self.problem_type in ["regression"]:
|
|
86
129
|
hyp["state_dict"] = state_dict_regression
|
|
87
130
|
|
|
131
|
+
if "verbose" not in hyp:
|
|
132
|
+
hyp["verbose"] = verbosity >= 3
|
|
133
|
+
|
|
88
134
|
self.model = model_cls(
|
|
89
135
|
**hyp,
|
|
90
136
|
)
|
|
91
137
|
|
|
92
|
-
X = self.preprocess(X)
|
|
138
|
+
X = self.preprocess(X, is_train=True)
|
|
93
139
|
if X_val is not None:
|
|
94
140
|
X_val = self.preprocess(X_val)
|
|
95
141
|
|
|
@@ -106,7 +152,6 @@ class MitraModel(AbstractModel):
|
|
|
106
152
|
|
|
107
153
|
def _set_default_params(self):
|
|
108
154
|
default_params = {
|
|
109
|
-
"device": self._get_default_device(),
|
|
110
155
|
"n_estimators": 1,
|
|
111
156
|
}
|
|
112
157
|
for param, val in default_params.items():
|
|
@@ -184,6 +229,24 @@ class MitraModel(AbstractModel):
|
|
|
184
229
|
|
|
185
230
|
return num_cpus, num_gpus
|
|
186
231
|
|
|
232
|
+
def get_minimum_resources(self, is_gpu_available: bool = False) -> dict[str, int | float]:
|
|
233
|
+
"""
|
|
234
|
+
Parameters
|
|
235
|
+
----------
|
|
236
|
+
is_gpu_available : bool, default = False
|
|
237
|
+
Whether gpu is available in the system.
|
|
238
|
+
Models that can be trained on both CPU and GPU can decide the minimum resources based on this.
|
|
239
|
+
|
|
240
|
+
Returns a dictionary of minimum resource requirements to fit the model.
|
|
241
|
+
Subclass should consider overriding this method if it requires more resources to train.
|
|
242
|
+
If a resource is not part of the output dictionary, it is considered unnecessary.
|
|
243
|
+
Valid keys: 'num_cpus', 'num_gpus'.
|
|
244
|
+
"""
|
|
245
|
+
return {
|
|
246
|
+
"num_cpus": 1,
|
|
247
|
+
"num_gpus": 0.5,
|
|
248
|
+
}
|
|
249
|
+
|
|
187
250
|
def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
|
|
188
251
|
return self.estimate_memory_usage_static(
|
|
189
252
|
X=X, problem_type=self.problem_type, num_classes=self.num_classes, **kwargs
|
|
@@ -196,12 +259,13 @@ class MitraModel(AbstractModel):
|
|
|
196
259
|
X: pd.DataFrame,
|
|
197
260
|
**kwargs,
|
|
198
261
|
) -> int:
|
|
199
|
-
|
|
262
|
+
# Multiply by 0.9 as currently this is overly safe
|
|
263
|
+
return int(0.9 * max(
|
|
200
264
|
cls._estimate_memory_usage_static_cpu_icl(X=X, **kwargs),
|
|
201
265
|
cls._estimate_memory_usage_static_cpu_ft_icl(X=X, **kwargs),
|
|
202
266
|
cls._estimate_memory_usage_static_gpu_cpu(X=X, **kwargs),
|
|
203
267
|
cls._estimate_memory_usage_static_gpu_gpu(X=X, **kwargs),
|
|
204
|
-
)
|
|
268
|
+
))
|
|
205
269
|
|
|
206
270
|
@classmethod
|
|
207
271
|
def _estimate_memory_usage_static_cpu_icl(
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
1
4
|
import time
|
|
2
5
|
from pathlib import Path
|
|
3
6
|
import contextlib
|
|
@@ -76,6 +79,7 @@ class MitraBase(BaseEstimator):
|
|
|
76
79
|
random_mirror_regression=RANDOM_MIRROR_REGRESSION,
|
|
77
80
|
random_mirror_x=RANDOM_MIRROR_X,
|
|
78
81
|
seed=SEED,
|
|
82
|
+
verbose=True,
|
|
79
83
|
):
|
|
80
84
|
"""
|
|
81
85
|
Initialize the base Mitra model.
|
|
@@ -114,8 +118,11 @@ class MitraBase(BaseEstimator):
|
|
|
114
118
|
self.trainers = []
|
|
115
119
|
self.train_time = 0
|
|
116
120
|
self.seed = seed
|
|
121
|
+
self.verbose = verbose
|
|
117
122
|
|
|
118
|
-
set_seed
|
|
123
|
+
# FIXME: set_seed was removed in v1.4 as quality and speed reduction was observed when setting seed.
|
|
124
|
+
# This should be investigated and fixed for v1.5
|
|
125
|
+
# set_seed(self.seed)
|
|
119
126
|
|
|
120
127
|
def _create_config(self, task, dim_output, time_limit=None):
|
|
121
128
|
cfg = ConfigRun(
|
|
@@ -183,6 +190,7 @@ class MitraBase(BaseEstimator):
|
|
|
183
190
|
"""Train the ensemble of models."""
|
|
184
191
|
|
|
185
192
|
cfg, Tab2D = self._create_config(task, dim_output, time_limit)
|
|
193
|
+
rng = np.random.RandomState(cfg.seed)
|
|
186
194
|
|
|
187
195
|
success = False
|
|
188
196
|
while not (success and cfg.hyperparams["max_samples_support"] > 0 and cfg.hyperparams["max_samples_query"] > 0):
|
|
@@ -217,7 +225,7 @@ class MitraBase(BaseEstimator):
|
|
|
217
225
|
path_to_weights=Path(self.state_dict),
|
|
218
226
|
device=self.device,
|
|
219
227
|
)
|
|
220
|
-
trainer = TrainerFinetune(cfg, model, n_classes=n_classes, device=self.device)
|
|
228
|
+
trainer = TrainerFinetune(cfg, model, n_classes=n_classes, device=self.device, rng=rng, verbose=self.verbose)
|
|
221
229
|
|
|
222
230
|
start_time = time.time()
|
|
223
231
|
trainer.train(X_train, y_train, X_valid, y_valid)
|
|
@@ -275,6 +283,7 @@ class MitraClassifier(MitraBase, ClassifierMixin):
|
|
|
275
283
|
random_mirror_regression=RANDOM_MIRROR_REGRESSION,
|
|
276
284
|
random_mirror_x=RANDOM_MIRROR_X,
|
|
277
285
|
seed=SEED,
|
|
286
|
+
verbose=True,
|
|
278
287
|
):
|
|
279
288
|
"""Initialize the classifier."""
|
|
280
289
|
super().__init__(
|
|
@@ -294,6 +303,7 @@ class MitraClassifier(MitraBase, ClassifierMixin):
|
|
|
294
303
|
random_mirror_regression=random_mirror_regression,
|
|
295
304
|
random_mirror_x=random_mirror_x,
|
|
296
305
|
seed=seed,
|
|
306
|
+
verbose=verbose,
|
|
297
307
|
)
|
|
298
308
|
self.task = 'classification'
|
|
299
309
|
|
|
@@ -403,6 +413,7 @@ class MitraRegressor(MitraBase, RegressorMixin):
|
|
|
403
413
|
random_mirror_regression=RANDOM_MIRROR_REGRESSION,
|
|
404
414
|
random_mirror_x=RANDOM_MIRROR_X,
|
|
405
415
|
seed=SEED,
|
|
416
|
+
verbose=True,
|
|
406
417
|
):
|
|
407
418
|
"""Initialize the regressor."""
|
|
408
419
|
super().__init__(
|
|
@@ -422,6 +433,7 @@ class MitraRegressor(MitraBase, RegressorMixin):
|
|
|
422
433
|
random_mirror_regression=random_mirror_regression,
|
|
423
434
|
random_mirror_x=random_mirror_x,
|
|
424
435
|
seed=seed,
|
|
436
|
+
verbose=verbose,
|
|
425
437
|
)
|
|
426
438
|
self.task = 'regression'
|
|
427
439
|
|
|
@@ -492,14 +504,4 @@ class MitraRegressor(MitraBase, RegressorMixin):
|
|
|
492
504
|
@contextlib.contextmanager
|
|
493
505
|
def mitra_deterministic_context():
|
|
494
506
|
"""Context manager to set deterministic settings only for Mitra operations."""
|
|
495
|
-
|
|
496
|
-
original_deterministic_algorithms_set = False
|
|
497
|
-
|
|
498
|
-
try:
|
|
499
|
-
torch.use_deterministic_algorithms(True)
|
|
500
|
-
original_deterministic_algorithms_set = True
|
|
501
|
-
yield
|
|
502
|
-
|
|
503
|
-
finally:
|
|
504
|
-
if original_deterministic_algorithms_set:
|
|
505
|
-
torch.use_deterministic_algorithms(False)
|
|
507
|
+
yield
|
|
@@ -1,11 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/realmlp/realmlp_model.py
|
|
3
|
-
|
|
4
|
-
Model: RealMLP
|
|
5
|
-
Paper: Better by Default: Strong Pre-Tuned MLPs and Boosted Trees on Tabular Data
|
|
6
|
-
Authors: David Holzmüller, Léo Grinsztajn, Ingo Steinwart
|
|
7
|
-
Codebase: https://github.com/dholzmueller/pytabkit
|
|
8
|
-
License: Apache-2.0
|
|
9
3
|
"""
|
|
10
4
|
|
|
11
5
|
from __future__ import annotations
|
|
@@ -41,6 +35,19 @@ def set_logger_level(logger_name: str, level: int):
|
|
|
41
35
|
|
|
42
36
|
# pip install pytabkit
|
|
43
37
|
class RealMLPModel(AbstractModel):
|
|
38
|
+
"""
|
|
39
|
+
RealMLP is an improved multilayer perceptron (MLP) model
|
|
40
|
+
through a bag of tricks and better default hyperparameters.
|
|
41
|
+
|
|
42
|
+
RealMLP is the top performing method overall on TabArena-v0.1: https://tabarena.ai
|
|
43
|
+
|
|
44
|
+
Paper: Better by Default: Strong Pre-Tuned MLPs and Boosted Trees on Tabular Data
|
|
45
|
+
Authors: David Holzmüller, Léo Grinsztajn, Ingo Steinwart
|
|
46
|
+
Codebase: https://github.com/dholzmueller/pytabkit
|
|
47
|
+
License: Apache-2.0
|
|
48
|
+
|
|
49
|
+
.. versionadded:: 1.4.0
|
|
50
|
+
"""
|
|
44
51
|
ag_key = "REALMLP"
|
|
45
52
|
ag_name = "RealMLP"
|
|
46
53
|
ag_priority = 75
|
|
@@ -1,10 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/tabicl/tabicl_model.py
|
|
3
|
-
Model: TabICL
|
|
4
|
-
Paper: TabICL: A Tabular Foundation Model for In-Context Learning on Large Data
|
|
5
|
-
Authors: Jingang Qu, David Holzmüller, Gaël Varoquaux, Marine Le Morvan
|
|
6
|
-
Codebase: https://github.com/soda-inria/tabicl
|
|
7
|
-
License: BSD-3-Clause
|
|
8
3
|
"""
|
|
9
4
|
|
|
10
5
|
from __future__ import annotations
|
|
@@ -23,6 +18,20 @@ logger = logging.getLogger(__name__)
|
|
|
23
18
|
|
|
24
19
|
# TODO: Verify if crashes when weights are not yet downloaded and fit in parallel
|
|
25
20
|
class TabICLModel(AbstractModel):
|
|
21
|
+
"""
|
|
22
|
+
TabICL is a foundation model for tabular data using in-context learning
|
|
23
|
+
that is scalable to larger datasets than TabPFNv2. It is pretrained purely on synthetic data.
|
|
24
|
+
TabICL currently only supports classification tasks.
|
|
25
|
+
|
|
26
|
+
TabICL is one of the top performing methods overall on TabArena-v0.1: https://tabarena.ai
|
|
27
|
+
|
|
28
|
+
Paper: TabICL: A Tabular Foundation Model for In-Context Learning on Large Data
|
|
29
|
+
Authors: Jingang Qu, David Holzmüller, Gaël Varoquaux, Marine Le Morvan
|
|
30
|
+
Codebase: https://github.com/soda-inria/tabicl
|
|
31
|
+
License: BSD-3-Clause
|
|
32
|
+
|
|
33
|
+
.. versionadded:: 1.4.0
|
|
34
|
+
"""
|
|
26
35
|
ag_key = "TABICL"
|
|
27
36
|
ag_name = "TabICL"
|
|
28
37
|
ag_priority = 65
|
|
@@ -98,8 +107,8 @@ class TabICLModel(AbstractModel):
|
|
|
98
107
|
default_auxiliary_params = super()._get_default_auxiliary_params()
|
|
99
108
|
default_auxiliary_params.update(
|
|
100
109
|
{
|
|
101
|
-
"max_rows":
|
|
102
|
-
"max_features":
|
|
110
|
+
"max_rows": 30000,
|
|
111
|
+
"max_features": 2000,
|
|
103
112
|
}
|
|
104
113
|
)
|
|
105
114
|
return default_auxiliary_params
|
|
@@ -147,7 +156,7 @@ class TabICLModel(AbstractModel):
|
|
|
147
156
|
model_mem_estimate *= 1.3 # add 30% buffer
|
|
148
157
|
|
|
149
158
|
# TODO: Observed memory spikes above expected values on large datasets, increasing mem estimate to compensate
|
|
150
|
-
model_mem_estimate *= 1.5
|
|
159
|
+
model_mem_estimate *= 2.0 # Note: 1.5 is not large enough, still gets OOM
|
|
151
160
|
|
|
152
161
|
mem_estimate = model_mem_estimate + dataset_size_mem_est + baseline_overhead_mem_est
|
|
153
162
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# taken from https://github.com/yandex-research/rtdl-num-embeddings/blob/main/package/rtdl_num_embeddings.py
|
|
2
2
|
"""On Embeddings for Numerical Features in Tabular Deep Learning."""
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
__version__ = '0.0.12'
|
|
5
7
|
|
|
6
8
|
__all__ = [
|
|
@@ -12,6 +14,7 @@ __all__ = [
|
|
|
12
14
|
'compute_bins',
|
|
13
15
|
]
|
|
14
16
|
|
|
17
|
+
|
|
15
18
|
import math
|
|
16
19
|
import warnings
|
|
17
20
|
from typing import Any, Literal, Optional, Union
|
|
@@ -4,12 +4,6 @@ Note: This is a custom implementation of TabM based on TabArena. Because the Aut
|
|
|
4
4
|
the same time as TabM became available on PyPi, we chose to use TabArena's implementation
|
|
5
5
|
for the AutoGluon 1.4 release as it has already been benchmarked.
|
|
6
6
|
|
|
7
|
-
Model: TabM
|
|
8
|
-
Paper: TabM: Advancing Tabular Deep Learning with Parameter-Efficient Ensembling
|
|
9
|
-
Authors: Yury Gorishniy, Akim Kotelnikov, Artem Babenko
|
|
10
|
-
Codebase: https://github.com/yandex-research/tabm
|
|
11
|
-
License: Apache-2.0
|
|
12
|
-
|
|
13
7
|
Partially adapted from pytabkit's TabM implementation.
|
|
14
8
|
"""
|
|
15
9
|
|
|
@@ -28,6 +22,20 @@ logger = logging.getLogger(__name__)
|
|
|
28
22
|
|
|
29
23
|
|
|
30
24
|
class TabMModel(AbstractModel):
|
|
25
|
+
"""
|
|
26
|
+
TabM is an efficient ensemble of MLPs that is trained simultaneously with mostly shared parameters.
|
|
27
|
+
|
|
28
|
+
TabM is one of the top performing methods overall on TabArena-v0.1: https://tabarena.ai
|
|
29
|
+
|
|
30
|
+
Paper: TabM: Advancing Tabular Deep Learning with Parameter-Efficient Ensembling
|
|
31
|
+
Authors: Yury Gorishniy, Akim Kotelnikov, Artem Babenko
|
|
32
|
+
Codebase: https://github.com/yandex-research/tabm
|
|
33
|
+
License: Apache-2.0
|
|
34
|
+
|
|
35
|
+
Partially adapted from pytabkit's TabM implementation.
|
|
36
|
+
|
|
37
|
+
.. versionadded:: 1.4.0
|
|
38
|
+
"""
|
|
31
39
|
ag_key = "TABM"
|
|
32
40
|
ag_name = "TabM"
|
|
33
41
|
ag_priority = 85
|
|
@@ -26,6 +26,8 @@ class TabPFNMixModel(AbstractModel):
|
|
|
26
26
|
|
|
27
27
|
TabPFNMix is based off of the TabPFN and TabForestPFN models.
|
|
28
28
|
|
|
29
|
+
We recommend using Mitra instead, as it is an improved version of TabPFNMix.
|
|
30
|
+
|
|
29
31
|
It is a tabular transformer model pre-trained on purely synthetic data.
|
|
30
32
|
|
|
31
33
|
It currently has several limitations:
|
|
@@ -34,6 +36,8 @@ class TabPFNMixModel(AbstractModel):
|
|
|
34
36
|
3. Does not support GPU
|
|
35
37
|
|
|
36
38
|
For more information, refer to the `./_internals/README.md` file.
|
|
39
|
+
|
|
40
|
+
.. versionadded:: 1.2.0
|
|
37
41
|
"""
|
|
38
42
|
ag_key = "TABPFNMIX"
|
|
39
43
|
ag_name = "TabPFNMix"
|
|
@@ -1,11 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/tabpfnv2/tabpfnv2_model.py
|
|
3
|
-
|
|
4
|
-
Model: TabPFNv2
|
|
5
|
-
Paper: Accurate predictions on small data with a tabular foundation model
|
|
6
|
-
Authors: Noah Hollmann, Samuel Müller, Lennart Purucker, Arjun Krishnakumar, Max Körfer, Shi Bin Hoo, Robin Tibor Schirrmeister & Frank Hutter
|
|
7
|
-
Codebase: https://github.com/PriorLabs/TabPFN
|
|
8
|
-
License: https://github.com/PriorLabs/TabPFN/blob/main/LICENSE
|
|
9
3
|
"""
|
|
10
4
|
|
|
11
5
|
from __future__ import annotations
|
|
@@ -111,6 +105,20 @@ class FixedSafePowerTransformer(PowerTransformer):
|
|
|
111
105
|
|
|
112
106
|
|
|
113
107
|
class TabPFNV2Model(AbstractModel):
|
|
108
|
+
"""
|
|
109
|
+
TabPFNv2 is a tabular foundation model pre-trained purely on synthetic data that achieves
|
|
110
|
+
state-of-the-art results with in-context learning on small datasets with <=10000 samples and <=500 features.
|
|
111
|
+
TabPFNv2 is developed and maintained by PriorLabs: https://priorlabs.ai/
|
|
112
|
+
|
|
113
|
+
TabPFNv2 is the top performing method for small datasets on TabArena-v0.1: https://tabarena.ai
|
|
114
|
+
|
|
115
|
+
Paper: Accurate predictions on small data with a tabular foundation model
|
|
116
|
+
Authors: Noah Hollmann, Samuel Müller, Lennart Purucker, Arjun Krishnakumar, Max Körfer, Shi Bin Hoo, Robin Tibor Schirrmeister & Frank Hutter
|
|
117
|
+
Codebase: https://github.com/PriorLabs/TabPFN
|
|
118
|
+
License: https://github.com/PriorLabs/TabPFN/blob/main/LICENSE
|
|
119
|
+
|
|
120
|
+
.. versionadded:: 1.4.0
|
|
121
|
+
"""
|
|
114
122
|
ag_key = "TABPFNV2"
|
|
115
123
|
ag_name = "TabPFNv2"
|
|
116
124
|
ag_priority = 105
|
|
@@ -119,12 +127,14 @@ class TabPFNV2Model(AbstractModel):
|
|
|
119
127
|
super().__init__(**kwargs)
|
|
120
128
|
self._feature_generator = None
|
|
121
129
|
self._cat_features = None
|
|
130
|
+
self._cat_indices = None
|
|
122
131
|
|
|
123
132
|
def _preprocess(self, X: pd.DataFrame, is_train=False, **kwargs) -> pd.DataFrame:
|
|
124
133
|
X = super()._preprocess(X, **kwargs)
|
|
125
|
-
self._cat_indices = []
|
|
126
134
|
|
|
127
135
|
if is_train:
|
|
136
|
+
self._cat_indices = []
|
|
137
|
+
|
|
128
138
|
# X will be the training data.
|
|
129
139
|
self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
|
|
130
140
|
self._feature_generator.fit(X=X)
|
|
@@ -136,10 +146,11 @@ class TabPFNV2Model(AbstractModel):
|
|
|
136
146
|
X=X
|
|
137
147
|
)
|
|
138
148
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
self._cat_features
|
|
142
|
-
|
|
149
|
+
if is_train:
|
|
150
|
+
# Detect/set cat features and indices
|
|
151
|
+
if self._cat_features is None:
|
|
152
|
+
self._cat_features = self._feature_generator.features_in[:]
|
|
153
|
+
self._cat_indices = [X.columns.get_loc(col) for col in self._cat_features]
|
|
143
154
|
|
|
144
155
|
return X
|
|
145
156
|
|
|
@@ -187,6 +198,12 @@ class TabPFNV2Model(AbstractModel):
|
|
|
187
198
|
# logs "Built with PriorLabs-TabPFN"
|
|
188
199
|
self._log_license(device=device)
|
|
189
200
|
|
|
201
|
+
if num_gpus == 0:
|
|
202
|
+
logger.log(
|
|
203
|
+
30,
|
|
204
|
+
f"\tWARNING: Running TabPFNv2 on CPU. This can be very slow. We recommend using a GPU instead."
|
|
205
|
+
)
|
|
206
|
+
|
|
190
207
|
X = self.preprocess(X, is_train=True)
|
|
191
208
|
|
|
192
209
|
hps = self._get_model_params()
|
|
@@ -366,7 +383,7 @@ class TabPFNV2Model(AbstractModel):
|
|
|
366
383
|
|
|
367
384
|
# Add some buffer to each term + 1 GB overhead to be safe
|
|
368
385
|
return int(
|
|
369
|
-
model_mem + 4 * X_mem +
|
|
386
|
+
model_mem + 4 * X_mem + 2 * activation_mem + baseline_overhead_mem_est
|
|
370
387
|
)
|
|
371
388
|
|
|
372
389
|
@classmethod
|
|
@@ -1068,11 +1068,11 @@ class TabularPredictor:
|
|
|
1068
1068
|
20,
|
|
1069
1069
|
"No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...\n"
|
|
1070
1070
|
"\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n"
|
|
1071
|
-
"\tpresets='
|
|
1072
|
-
"\tpresets='best'
|
|
1073
|
-
"\tpresets='high'
|
|
1074
|
-
"\tpresets='good'
|
|
1075
|
-
"\tpresets='medium'
|
|
1071
|
+
"\tpresets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.\n"
|
|
1072
|
+
"\tpresets='best' : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.\n"
|
|
1073
|
+
"\tpresets='high' : Strong accuracy with fast inference speed.\n"
|
|
1074
|
+
"\tpresets='good' : Good accuracy with very fast inference speed.\n"
|
|
1075
|
+
"\tpresets='medium' : Fast training time, ideal for initial prototyping.",
|
|
1076
1076
|
)
|
|
1077
1077
|
|
|
1078
1078
|
kwargs_orig = kwargs.copy()
|
|
@@ -1127,10 +1127,48 @@ class TabularPredictor:
|
|
|
1127
1127
|
)
|
|
1128
1128
|
infer_limit, infer_limit_batch_size = self._validate_infer_limit(infer_limit=infer_limit, infer_limit_batch_size=infer_limit_batch_size)
|
|
1129
1129
|
|
|
1130
|
+
# TODO: Temporary for v1.4. Make this more extensible for v1.5 by letting users make their own dynamic hyperparameters.
|
|
1131
|
+
dynamic_hyperparameters = kwargs["_experimental_dynamic_hyperparameters"]
|
|
1132
|
+
if dynamic_hyperparameters:
|
|
1133
|
+
logger.log(20, f"`extreme` preset uses a dynamic portfolio based on dataset size...")
|
|
1134
|
+
assert hyperparameters is None, f"hyperparameters must be unspecified when `_experimental_dynamic_hyperparameters=True`."
|
|
1135
|
+
n_samples = len(train_data)
|
|
1136
|
+
if n_samples > 30000:
|
|
1137
|
+
data_size = "large"
|
|
1138
|
+
else:
|
|
1139
|
+
data_size = "small"
|
|
1140
|
+
assert data_size in ["large", "small"]
|
|
1141
|
+
if data_size == "large":
|
|
1142
|
+
logger.log(20, f"\tDetected data size: large (>30000 samples), using `zeroshot` portfolio (identical to 'best_quality' preset).")
|
|
1143
|
+
hyperparameters = "zeroshot"
|
|
1144
|
+
else:
|
|
1145
|
+
if "num_stack_levels" not in kwargs_orig:
|
|
1146
|
+
# disable stacking for tabfm portfolio
|
|
1147
|
+
num_stack_levels = 0
|
|
1148
|
+
kwargs["num_stack_levels"] = 0
|
|
1149
|
+
logger.log(
|
|
1150
|
+
20,
|
|
1151
|
+
f"\tDetected data size: small (<=30000 samples), using `zeroshot_2025_tabfm` portfolio."
|
|
1152
|
+
f"\n\t\tNote: `zeroshot_2025_tabfm` portfolio requires a CUDA compatible GPU for best performance."
|
|
1153
|
+
f"\n\t\tMake sure you have all the relevant dependencies installed: "
|
|
1154
|
+
f"`pip install autogluon.tabular[tabarena]`."
|
|
1155
|
+
f"\n\t\tIt is strongly recommended to use a machine with 64+ GB memory "
|
|
1156
|
+
f"and a CUDA compatible GPU with 32+ GB vRAM when using this preset. "
|
|
1157
|
+
f"\n\t\tThis portfolio will download foundation model weights from HuggingFace during training. "
|
|
1158
|
+
f"Ensure you have an internet connection or have pre-downloaded the weights to use these models."
|
|
1159
|
+
f"\n\t\tThis portfolio was meta-learned with TabArena: https://tabarena.ai"
|
|
1160
|
+
)
|
|
1161
|
+
hyperparameters = "zeroshot_2025_tabfm"
|
|
1162
|
+
|
|
1130
1163
|
if hyperparameters is None:
|
|
1131
1164
|
hyperparameters = "default"
|
|
1132
1165
|
if isinstance(hyperparameters, str):
|
|
1166
|
+
hyperparameters_str = hyperparameters
|
|
1133
1167
|
hyperparameters = get_hyperparameter_config(hyperparameters)
|
|
1168
|
+
logger.log(
|
|
1169
|
+
20,
|
|
1170
|
+
f"Using hyperparameters preset: hyperparameters='{hyperparameters_str}'",
|
|
1171
|
+
)
|
|
1134
1172
|
self._validate_hyperparameters(hyperparameters=hyperparameters)
|
|
1135
1173
|
self.fit_hyperparameters_ = hyperparameters
|
|
1136
1174
|
|
|
@@ -5042,6 +5080,8 @@ class TabularPredictor:
|
|
|
5042
5080
|
learning_curves=False,
|
|
5043
5081
|
test_data=None,
|
|
5044
5082
|
raise_on_model_failure=False,
|
|
5083
|
+
# experimental
|
|
5084
|
+
_experimental_dynamic_hyperparameters=False,
|
|
5045
5085
|
)
|
|
5046
5086
|
kwargs, ds_valid_keys = self._sanitize_dynamic_stacking_kwargs(kwargs)
|
|
5047
5087
|
kwargs = self._validate_fit_extra_kwargs(kwargs, extra_valid_keys=list(fit_kwargs_default.keys()) + ds_valid_keys)
|
|
@@ -2131,6 +2131,8 @@ class AbstractTabularTrainer(AbstractTrainer[AbstractModel]):
|
|
|
2131
2131
|
if isinstance(model, BaggedEnsembleModel) and not compute_score:
|
|
2132
2132
|
# Do not perform OOF predictions when we don't compute a score.
|
|
2133
2133
|
model_fit_kwargs["_skip_oof"] = True
|
|
2134
|
+
if not isinstance(model, BaggedEnsembleModel):
|
|
2135
|
+
model_fit_kwargs.setdefault("log_resources", True)
|
|
2134
2136
|
|
|
2135
2137
|
model_fit_kwargs = dict(
|
|
2136
2138
|
model=model,
|
autogluon/tabular/version.py
CHANGED