autogluon.tabular 1.3.2b20250723__py3-none-any.whl → 1.4.0b20250725__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (31)
  1. autogluon/tabular/configs/hyperparameter_configs.py +2 -265
  2. autogluon/tabular/configs/presets_configs.py +51 -23
  3. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +0 -1
  4. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +309 -0
  5. autogluon/tabular/models/automm/automm_model.py +2 -0
  6. autogluon/tabular/models/automm/ft_transformer.py +4 -1
  7. autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +18 -6
  8. autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +8 -4
  9. autogluon/tabular/models/mitra/_internal/data/dataset_split.py +5 -1
  10. autogluon/tabular/models/mitra/_internal/models/tab2d.py +3 -0
  11. autogluon/tabular/models/mitra/mitra_model.py +85 -21
  12. autogluon/tabular/models/mitra/sklearn_interface.py +15 -13
  13. autogluon/tabular/models/realmlp/realmlp_model.py +13 -6
  14. autogluon/tabular/models/tabicl/tabicl_model.py +17 -8
  15. autogluon/tabular/models/tabm/rtdl_num_embeddings.py +3 -0
  16. autogluon/tabular/models/tabm/tabm_model.py +14 -6
  17. autogluon/tabular/models/tabm/tabm_reference.py +2 -0
  18. autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +4 -0
  19. autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +29 -12
  20. autogluon/tabular/predictor/predictor.py +45 -5
  21. autogluon/tabular/trainer/abstract_trainer.py +2 -0
  22. autogluon/tabular/version.py +1 -1
  23. {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/METADATA +40 -18
  24. {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/RECORD +31 -30
  25. /autogluon.tabular-1.3.2b20250723-py3.9-nspkg.pth → /autogluon.tabular-1.4.0b20250725-py3.9-nspkg.pth +0 -0
  26. {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/LICENSE +0 -0
  27. {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/NOTICE +0 -0
  28. {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/WHEEL +0 -0
  29. {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/namespace_packages.txt +0 -0
  30. {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/top_level.txt +0 -0
  31. {autogluon.tabular-1.3.2b20250723.dist-info → autogluon.tabular-1.4.0b20250725.dist-info}/zip-safe +0 -0
autogluon/tabular/models/mitra/mitra_model.py

@@ -1,49 +1,56 @@
-# TODO: To ensure deterministic operations we need to set torch.use_deterministic_algorithms(True)
-# and os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'. The CUBLAS environment variable configures
-# the workspace size for certain CUBLAS operations to ensure reproducibility when using CUDA >= 10.2.
-# Both settings are required to ensure deterministic behavior in operations such as matrix multiplications.
-import os
-
-os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+from __future__ import annotations
 
+import logging
 import os
 from typing import List, Optional
 
 import pandas as pd
-import torch
-import logging
 
 from autogluon.common.utils.resource_utils import ResourceManager
 from autogluon.core.models import AbstractModel
+from autogluon.features.generators import LabelEncoderFeatureGenerator
+from autogluon.tabular import __version__
 
 logger = logging.getLogger(__name__)
 
 
-# TODO: Needs memory usage estimate method
 class MitraModel(AbstractModel):
+    """
+    Mitra is a tabular foundation model pre-trained purely on synthetic data with the goal
+    of optimizing fine-tuning performance over in-context learning performance.
+    Mitra was developed by the AutoGluon team @ AWS AI.
+
+    Mitra's default hyperparameters outperform all methods for small datasets on TabArena-v0.1 (excluding ensembling): https://tabarena.ai
+
+    Authors: Xiyuan Zhang, Danielle C. Maddix, Junming Yin, Nick Erickson, Abdul Fatir Ansari, Boran Han, Shuai Zhang, Leman Akoglu, Christos Faloutsos, Michael W. Mahoney, Cuixiong Hu, Huzefa Rangwala, George Karypis, Bernie Wang
+    Blog Post: https://www.amazon.science/blog/mitra-mixed-synthetic-priors-for-enhancing-tabular-foundation-models
+    License: Apache-2.0
+
+    .. versionadded:: 1.4.0
+    """
     ag_key = "MITRA"
     ag_name = "Mitra"
     weights_file_name = "model.pt"
     ag_priority = 55
 
-    def __init__(self, problem_type=None, **kwargs):
+    def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.problem_type = problem_type
         self._weights_saved = False
+        self._feature_generator = None
 
     @staticmethod
     def _get_default_device():
        """Get the best available device for the current system."""
        if ResourceManager.get_gpu_count_torch(cuda_only=True) > 0:
-            logger.info("Using CUDA GPU")
+            logger.log(15, "Using CUDA GPU")
            return "cuda"
        else:
            return "cpu"
 
    def get_model_cls(self):
-        from .sklearn_interface import MitraClassifier
-
        if self.problem_type in ["binary", "multiclass"]:
+            from .sklearn_interface import MitraClassifier
+
            model_cls = MitraClassifier
        elif self.problem_type == "regression":
            from .sklearn_interface import MitraRegressor
@@ -53,6 +60,23 @@ class MitraModel(AbstractModel):
            raise AssertionError(f"Unsupported problem_type: {self.problem_type}")
        return model_cls
 
+    def _preprocess(self, X: pd.DataFrame, is_train: bool = False, **kwargs) -> pd.DataFrame:
+        X = super()._preprocess(X, **kwargs)
+
+        if is_train:
+            # X will be the training data.
+            self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
+            self._feature_generator.fit(X=X)
+
+        # This converts categorical features to numeric via stateful label encoding.
+        if self._feature_generator.features_in:
+            X = X.copy()
+            X[self._feature_generator.features_in] = self._feature_generator.transform(
+                X=X
+            )
+
+        return X
+
    def _fit(
        self,
        X: pd.DataFrame,
@@ -61,11 +85,25 @@
        y_val: pd.Series = None,
        time_limit: float = None,
        num_cpus: int = 1,
+        num_gpus: float = 0,
+        verbosity: int = 2,
        **kwargs,
    ):
        # TODO: Reset the number of threads based on the specified num_cpus
        need_to_reset_torch_threads = False
        torch_threads_og = None
+
+        try:
+            model_cls = self.get_model_cls()
+            import torch
+        except ImportError as err:
+            logger.log(
+                40,
+                f"\tFailed to import Mitra! To use the Mitra model, "
+                f"do: `pip install autogluon.tabular[mitra]=={__version__}`.",
+            )
+            raise err
+
        if num_cpus is not None and isinstance(num_cpus, (int, float)):
            torch_threads_og = torch.get_num_threads()
            if torch_threads_og != num_cpus:
@@ -73,9 +111,14 @@
                torch.set_num_threads(num_cpus)
                need_to_reset_torch_threads = True
 
-        model_cls = self.get_model_cls()
-
        hyp = self._get_model_params()
+
+        if hyp.get("device", None) is None:
+            if num_gpus == 0:
+                hyp["device"] = "cpu"
+            else:
+                hyp["device"] = self._get_default_device()
+
        if "state_dict_classification" in hyp:
            state_dict_classification = hyp.pop("state_dict_classification")
            if self.problem_type in ["binary", "multiclass"]:
@@ -85,11 +128,14 @@
        if self.problem_type in ["regression"]:
            hyp["state_dict"] = state_dict_regression
 
+        if "verbose" not in hyp:
+            hyp["verbose"] = verbosity >= 3
+
        self.model = model_cls(
            **hyp,
        )
 
-        X = self.preprocess(X)
+        X = self.preprocess(X, is_train=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
 
@@ -106,7 +152,6 @@
 
    def _set_default_params(self):
        default_params = {
-            "device": self._get_default_device(),
            "n_estimators": 1,
        }
        for param, val in default_params.items():
@@ -184,6 +229,24 @@
 
        return num_cpus, num_gpus
 
+    def get_minimum_resources(self, is_gpu_available: bool = False) -> dict[str, int | float]:
+        """
+        Parameters
+        ----------
+        is_gpu_available : bool, default = False
+            Whether a GPU is available in the system.
+            A model that can be trained on both CPU and GPU can decide its minimum resources based on this.
+
+        Returns a dictionary of minimum resource requirements to fit the model.
+        Subclasses should consider overriding this method if they require more resources to train.
+        If a resource is not part of the output dictionary, it is considered unnecessary.
+        Valid keys: 'num_cpus', 'num_gpus'.
+        """
+        return {
+            "num_cpus": 1,
+            "num_gpus": 0.5,
+        }
+
    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
        return self.estimate_memory_usage_static(
            X=X, problem_type=self.problem_type, num_classes=self.num_classes, **kwargs
@@ -196,12 +259,13 @@
        X: pd.DataFrame,
        **kwargs,
    ) -> int:
-        return max(
+        # Multiply by 0.9, as the current estimate is overly conservative
+        return int(0.9 * max(
            cls._estimate_memory_usage_static_cpu_icl(X=X, **kwargs),
            cls._estimate_memory_usage_static_cpu_ft_icl(X=X, **kwargs),
            cls._estimate_memory_usage_static_gpu_cpu(X=X, **kwargs),
            cls._estimate_memory_usage_static_gpu_gpu(X=X, **kwargs),
-        )
+        ))
 
    @classmethod
    def _estimate_memory_usage_static_cpu_icl(
autogluon/tabular/models/mitra/sklearn_interface.py

@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+import os
 import time
 from pathlib import Path
 import contextlib
@@ -76,6 +79,7 @@ class MitraBase(BaseEstimator):
        random_mirror_regression=RANDOM_MIRROR_REGRESSION,
        random_mirror_x=RANDOM_MIRROR_X,
        seed=SEED,
+        verbose=True,
    ):
        """
        Initialize the base Mitra model.
@@ -114,8 +118,11 @@
        self.trainers = []
        self.train_time = 0
        self.seed = seed
+        self.verbose = verbose
 
-        set_seed(self.seed)
+        # FIXME: set_seed was removed in v1.4 because quality and speed regressions were observed when setting the seed.
+        # This should be investigated and fixed for v1.5.
+        # set_seed(self.seed)
 
    def _create_config(self, task, dim_output, time_limit=None):
        cfg = ConfigRun(
@@ -183,6 +190,7 @@
        """Train the ensemble of models."""
 
        cfg, Tab2D = self._create_config(task, dim_output, time_limit)
+        rng = np.random.RandomState(cfg.seed)
 
        success = False
        while not (success and cfg.hyperparams["max_samples_support"] > 0 and cfg.hyperparams["max_samples_query"] > 0):
@@ -217,7 +225,7 @@
                path_to_weights=Path(self.state_dict),
                device=self.device,
            )
-            trainer = TrainerFinetune(cfg, model, n_classes=n_classes, device=self.device)
+            trainer = TrainerFinetune(cfg, model, n_classes=n_classes, device=self.device, rng=rng, verbose=self.verbose)
 
            start_time = time.time()
            trainer.train(X_train, y_train, X_valid, y_valid)
@@ -275,6 +283,7 @@ class MitraClassifier(MitraBase, ClassifierMixin):
        random_mirror_regression=RANDOM_MIRROR_REGRESSION,
        random_mirror_x=RANDOM_MIRROR_X,
        seed=SEED,
+        verbose=True,
    ):
        """Initialize the classifier."""
        super().__init__(
@@ -294,6 +303,7 @@ class MitraClassifier(MitraBase, ClassifierMixin):
            random_mirror_regression=random_mirror_regression,
            random_mirror_x=random_mirror_x,
            seed=seed,
+            verbose=verbose,
        )
        self.task = 'classification'
 
@@ -403,6 +413,7 @@ class MitraRegressor(MitraBase, RegressorMixin):
        random_mirror_regression=RANDOM_MIRROR_REGRESSION,
        random_mirror_x=RANDOM_MIRROR_X,
        seed=SEED,
+        verbose=True,
    ):
        """Initialize the regressor."""
        super().__init__(
@@ -422,6 +433,7 @@
            random_mirror_regression=random_mirror_regression,
            random_mirror_x=random_mirror_x,
            seed=seed,
+            verbose=verbose,
        )
        self.task = 'regression'
 
@@ -492,14 +504,4 @@
 @contextlib.contextmanager
 def mitra_deterministic_context():
    """Context manager to set deterministic settings only for Mitra operations."""
-
-    original_deterministic_algorithms_set = False
-
-    try:
-        torch.use_deterministic_algorithms(True)
-        original_deterministic_algorithms_set = True
-        yield
-
-    finally:
-        if original_deterministic_algorithms_set:
-            torch.use_deterministic_algorithms(False)
+    yield
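
The `rng` and `verbose` arguments threaded into `TrainerFinetune` above replace the removed global `set_seed` call. A minimal sketch of the pattern with illustrative names: a single explicit `RandomState` keeps runs reproducible without mutating global state:

import numpy as np

def build_member_seeds(n_members, seed):
    rng = np.random.RandomState(seed)  # one explicit source of randomness
    # Consumers draw from the shared rng, so reruns with the same seed are
    # reproducible while global numpy/torch RNG state stays untouched.
    return [int(rng.randint(0, 2**31 - 1)) for _ in range(n_members)]

assert build_member_seeds(3, seed=0) == build_member_seeds(3, seed=0)
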
autogluon/tabular/models/realmlp/realmlp_model.py

@@ -1,11 +1,5 @@
 """
 Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/realmlp/realmlp_model.py
-
-Model: RealMLP
-Paper: Better by Default: Strong Pre-Tuned MLPs and Boosted Trees on Tabular Data
-Authors: David Holzmüller, Léo Grinsztajn, Ingo Steinwart
-Codebase: https://github.com/dholzmueller/pytabkit
-License: Apache-2.0
 """
 
 from __future__ import annotations
@@ -41,6 +35,19 @@ def set_logger_level(logger_name: str, level: int):
 
 # pip install pytabkit
 class RealMLPModel(AbstractModel):
+    """
+    RealMLP is an improved multilayer perceptron (MLP) model,
+    enhanced through a bag of tricks and better default hyperparameters.
+
+    RealMLP is the top performing method overall on TabArena-v0.1: https://tabarena.ai
+
+    Paper: Better by Default: Strong Pre-Tuned MLPs and Boosted Trees on Tabular Data
+    Authors: David Holzmüller, Léo Grinsztajn, Ingo Steinwart
+    Codebase: https://github.com/dholzmueller/pytabkit
+    License: Apache-2.0
+
+    .. versionadded:: 1.4.0
+    """
    ag_key = "REALMLP"
    ag_name = "RealMLP"
    ag_priority = 75
autogluon/tabular/models/tabicl/tabicl_model.py

@@ -1,10 +1,5 @@
 """
 Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/tabicl/tabicl_model.py
-Model: TabICL
-Paper: TabICL: A Tabular Foundation Model for In-Context Learning on Large Data
-Authors: Jingang Qu, David Holzmüller, Gaël Varoquaux, Marine Le Morvan
-Codebase: https://github.com/soda-inria/tabicl
-License: BSD-3-Clause
 """
 
 from __future__ import annotations
@@ -23,6 +18,20 @@ logger = logging.getLogger(__name__)
 
 # TODO: Verify if crashes when weights are not yet downloaded and fit in parallel
 class TabICLModel(AbstractModel):
+    """
+    TabICL is a foundation model for tabular data using in-context learning
+    that is scalable to larger datasets than TabPFNv2. It is pre-trained purely on synthetic data.
+    TabICL currently only supports classification tasks.
+
+    TabICL is one of the top performing methods overall on TabArena-v0.1: https://tabarena.ai
+
+    Paper: TabICL: A Tabular Foundation Model for In-Context Learning on Large Data
+    Authors: Jingang Qu, David Holzmüller, Gaël Varoquaux, Marine Le Morvan
+    Codebase: https://github.com/soda-inria/tabicl
+    License: BSD-3-Clause
+
+    .. versionadded:: 1.4.0
+    """
    ag_key = "TABICL"
    ag_name = "TabICL"
    ag_priority = 65
@@ -98,8 +107,8 @@
        default_auxiliary_params = super()._get_default_auxiliary_params()
        default_auxiliary_params.update(
            {
-                "max_rows": 100000,
-                "max_features": 500,
+                "max_rows": 30000,
+                "max_features": 2000,
            }
        )
        return default_auxiliary_params
@@ -147,7 +156,7 @@
        model_mem_estimate *= 1.3  # add 30% buffer
 
        # TODO: Observed memory spikes above expected values on large datasets, increasing mem estimate to compensate
-        model_mem_estimate *= 1.5
+        model_mem_estimate *= 2.0  # Note: 1.5 is not large enough, still gets OOM
 
        mem_estimate = model_mem_estimate + dataset_size_mem_est + baseline_overhead_mem_est
 
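To make the compounding buffers in the TabICL memory estimate concrete, here is the arithmetic with an illustrative base value (the 4 GiB figure is made up):

base_estimate = 4 * 1024**3         # hypothetical raw model memory estimate, in bytes
with_buffer = base_estimate * 1.3   # the +30% buffer applied first
final_estimate = with_buffer * 2.0  # spike factor, raised from 1.5 after observed OOMs
# Net effect: the final estimate is 2.6x the raw estimate, up from the previous 1.95x.
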
autogluon/tabular/models/tabm/rtdl_num_embeddings.py

@@ -1,6 +1,8 @@
 # taken from https://github.com/yandex-research/rtdl-num-embeddings/blob/main/package/rtdl_num_embeddings.py
 """On Embeddings for Numerical Features in Tabular Deep Learning."""
 
+from __future__ import annotations
+
 __version__ = '0.0.12'
 
 __all__ = [
@@ -12,6 +14,7 @@ __all__ = [
     'compute_bins',
 ]
 
+
 import math
 import warnings
 from typing import Any, Literal, Optional, Union
autogluon/tabular/models/tabm/tabm_model.py

@@ -4,12 +4,6 @@ Note: This is a custom implementation of TabM based on TabArena. Because the Aut
 the same time as TabM became available on PyPi, we chose to use TabArena's implementation
 for the AutoGluon 1.4 release as it has already been benchmarked.
 
-Model: TabM
-Paper: TabM: Advancing Tabular Deep Learning with Parameter-Efficient Ensembling
-Authors: Yury Gorishniy, Akim Kotelnikov, Artem Babenko
-Codebase: https://github.com/yandex-research/tabm
-License: Apache-2.0
-
 Partially adapted from pytabkit's TabM implementation.
 """
@@ -28,6 +22,20 @@ logger = logging.getLogger(__name__)
 
 
 class TabMModel(AbstractModel):
+    """
+    TabM is an efficient ensemble of MLPs that is trained simultaneously with mostly shared parameters.
+
+    TabM is one of the top performing methods overall on TabArena-v0.1: https://tabarena.ai
+
+    Paper: TabM: Advancing Tabular Deep Learning with Parameter-Efficient Ensembling
+    Authors: Yury Gorishniy, Akim Kotelnikov, Artem Babenko
+    Codebase: https://github.com/yandex-research/tabm
+    License: Apache-2.0
+
+    Partially adapted from pytabkit's TabM implementation.
+
+    .. versionadded:: 1.4.0
+    """
    ag_key = "TABM"
    ag_name = "TabM"
    ag_priority = 85
autogluon/tabular/models/tabm/tabm_reference.py

@@ -3,6 +3,8 @@
 # NOTE
 # The minimum required versions of the dependencies are specified in README.md.
 
+from __future__ import annotations
+
 import itertools
 from typing import Any, Literal, Union
autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py

@@ -26,6 +26,8 @@ class TabPFNMixModel(AbstractModel):
 
    TabPFNMix is based off of the TabPFN and TabForestPFN models.
 
+    We recommend using Mitra instead, as it is an improved version of TabPFNMix.
+
    It is a tabular transformer model pre-trained on purely synthetic data.
 
    It currently has several limitations:
@@ -34,6 +36,8 @@ class TabPFNMixModel(AbstractModel):
    3. Does not support GPU
 
    For more information, refer to the `./_internals/README.md` file.
+
+    .. versionadded:: 1.2.0
    """
    ag_key = "TABPFNMIX"
    ag_name = "TabPFNMix"
autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py

@@ -1,11 +1,5 @@
 """
 Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/tabpfnv2/tabpfnv2_model.py
-
-Model: TabPFNv2
-Paper: Accurate predictions on small data with a tabular foundation model
-Authors: Noah Hollmann, Samuel Müller, Lennart Purucker, Arjun Krishnakumar, Max Körfer, Shi Bin Hoo, Robin Tibor Schirrmeister & Frank Hutter
-Codebase: https://github.com/PriorLabs/TabPFN
-License: https://github.com/PriorLabs/TabPFN/blob/main/LICENSE
 """
 
 from __future__ import annotations
@@ -111,6 +105,20 @@ class FixedSafePowerTransformer(PowerTransformer):
 
 
 class TabPFNV2Model(AbstractModel):
+    """
+    TabPFNv2 is a tabular foundation model pre-trained purely on synthetic data that achieves
+    state-of-the-art results with in-context learning on small datasets with <=10000 samples and <=500 features.
+    TabPFNv2 is developed and maintained by PriorLabs: https://priorlabs.ai/
+
+    TabPFNv2 is the top performing method for small datasets on TabArena-v0.1: https://tabarena.ai
+
+    Paper: Accurate predictions on small data with a tabular foundation model
+    Authors: Noah Hollmann, Samuel Müller, Lennart Purucker, Arjun Krishnakumar, Max Körfer, Shi Bin Hoo, Robin Tibor Schirrmeister & Frank Hutter
+    Codebase: https://github.com/PriorLabs/TabPFN
+    License: https://github.com/PriorLabs/TabPFN/blob/main/LICENSE
+
+    .. versionadded:: 1.4.0
+    """
    ag_key = "TABPFNV2"
    ag_name = "TabPFNv2"
    ag_priority = 105
@@ -119,12 +127,14 @@ class TabPFNV2Model(AbstractModel):
        super().__init__(**kwargs)
        self._feature_generator = None
        self._cat_features = None
+        self._cat_indices = None
 
    def _preprocess(self, X: pd.DataFrame, is_train=False, **kwargs) -> pd.DataFrame:
        X = super()._preprocess(X, **kwargs)
-        self._cat_indices = []
 
        if is_train:
+            self._cat_indices = []
+
            # X will be the training data.
            self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
            self._feature_generator.fit(X=X)
@@ -136,10 +146,11 @@
                X=X
            )
 
-        # Detect/set cat features and indices
-        if self._cat_features is None:
-            self._cat_features = self._feature_generator.features_in[:]
-        self._cat_indices = [X.columns.get_loc(col) for col in self._cat_features]
+        if is_train:
+            # Detect/set cat features and indices
+            if self._cat_features is None:
+                self._cat_features = self._feature_generator.features_in[:]
+            self._cat_indices = [X.columns.get_loc(col) for col in self._cat_features]
 
        return X
 
@@ -187,6 +198,12 @@
        # logs "Built with PriorLabs-TabPFN"
        self._log_license(device=device)
 
+        if num_gpus == 0:
+            logger.log(
+                30,
+                f"\tWARNING: Running TabPFNv2 on CPU. This can be very slow. We recommend using a GPU instead."
+            )
+
        X = self.preprocess(X, is_train=True)
 
        hps = self._get_model_params()
@@ -366,7 +383,7 @@
 
        # Add some buffer to each term + 1 GB overhead to be safe
        return int(
-            model_mem + 4 * X_mem + 1.5 * activation_mem + baseline_overhead_mem_est
+            model_mem + 4 * X_mem + 2 * activation_mem + baseline_overhead_mem_est
        )
 
    @classmethod
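
The `_preprocess` fix above computes `_cat_indices` only when `is_train=True` and reuses them at inference. A self-contained sketch of that fit-time pattern, using the same `LabelEncoderFeatureGenerator` calls as the diff (the toy frame is illustrative):

import pandas as pd
from autogluon.features.generators import LabelEncoderFeatureGenerator

X_train = pd.DataFrame({"color": ["red", "blue", "red"], "x": [1.0, 2.0, 3.0]}).astype({"color": "category"})
gen = LabelEncoderFeatureGenerator(verbosity=0)
gen.fit(X=X_train)

cat_features = gen.features_in[:]  # resolved once, at fit time
cat_indices = [X_train.columns.get_loc(col) for col in cat_features]

X_encoded = X_train.copy()
X_encoded[cat_features] = gen.transform(X=X_train)  # the same mapping is reused for inference frames
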
autogluon/tabular/predictor/predictor.py

@@ -1068,11 +1068,11 @@
            20,
            "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...\n"
            "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n"
-            "\tpresets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.\n"
-            "\tpresets='best' : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.\n"
-            "\tpresets='high' : Strong accuracy with fast inference speed.\n"
-            "\tpresets='good' : Good accuracy with very fast inference speed.\n"
-            "\tpresets='medium' : Fast training time, ideal for initial prototyping.",
+            "\tpresets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.\n"
+            "\tpresets='best' : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.\n"
+            "\tpresets='high' : Strong accuracy with fast inference speed.\n"
+            "\tpresets='good' : Good accuracy with very fast inference speed.\n"
+            "\tpresets='medium' : Fast training time, ideal for initial prototyping.",
        )
 
        kwargs_orig = kwargs.copy()
@@ -1127,10 +1127,48 @@
        )
        infer_limit, infer_limit_batch_size = self._validate_infer_limit(infer_limit=infer_limit, infer_limit_batch_size=infer_limit_batch_size)
 
+        # TODO: Temporary for v1.4. Make this more extensible for v1.5 by letting users make their own dynamic hyperparameters.
+        dynamic_hyperparameters = kwargs["_experimental_dynamic_hyperparameters"]
+        if dynamic_hyperparameters:
+            logger.log(20, f"`extreme` preset uses a dynamic portfolio based on dataset size...")
+            assert hyperparameters is None, f"hyperparameters must be unspecified when `_experimental_dynamic_hyperparameters=True`."
+            n_samples = len(train_data)
+            if n_samples > 30000:
+                data_size = "large"
+            else:
+                data_size = "small"
+            assert data_size in ["large", "small"]
+            if data_size == "large":
+                logger.log(20, f"\tDetected data size: large (>30000 samples), using `zeroshot` portfolio (identical to 'best_quality' preset).")
+                hyperparameters = "zeroshot"
+            else:
+                if "num_stack_levels" not in kwargs_orig:
+                    # disable stacking for tabfm portfolio
+                    num_stack_levels = 0
+                    kwargs["num_stack_levels"] = 0
+                logger.log(
+                    20,
+                    f"\tDetected data size: small (<=30000 samples), using `zeroshot_2025_tabfm` portfolio."
+                    f"\n\t\tNote: `zeroshot_2025_tabfm` portfolio requires a CUDA compatible GPU for best performance."
+                    f"\n\t\tMake sure you have all the relevant dependencies installed: "
+                    f"`pip install autogluon.tabular[tabarena]`."
+                    f"\n\t\tIt is strongly recommended to use a machine with 64+ GB memory "
+                    f"and a CUDA compatible GPU with 32+ GB vRAM when using this preset. "
+                    f"\n\t\tThis portfolio will download foundation model weights from HuggingFace during training. "
+                    f"Ensure you have an internet connection or have pre-downloaded the weights to use these models."
+                    f"\n\t\tThis portfolio was meta-learned with TabArena: https://tabarena.ai"
+                )
+                hyperparameters = "zeroshot_2025_tabfm"
+
        if hyperparameters is None:
            hyperparameters = "default"
        if isinstance(hyperparameters, str):
+            hyperparameters_str = hyperparameters
            hyperparameters = get_hyperparameter_config(hyperparameters)
+            logger.log(
+                20,
+                f"Using hyperparameters preset: hyperparameters='{hyperparameters_str}'",
+            )
        self._validate_hyperparameters(hyperparameters=hyperparameters)
        self.fit_hyperparameters_ = hyperparameters
 
@@ -5042,6 +5080,8 @@
            learning_curves=False,
            test_data=None,
            raise_on_model_failure=False,
+            # experimental
+            _experimental_dynamic_hyperparameters=False,
        )
        kwargs, ds_valid_keys = self._sanitize_dynamic_stacking_kwargs(kwargs)
        kwargs = self._validate_fit_extra_kwargs(kwargs, extra_valid_keys=list(fit_kwargs_default.keys()) + ds_valid_keys)
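
A hedged usage sketch of the new flow, assuming the `extreme` preset wires up `_experimental_dynamic_hyperparameters` as the log messages above suggest (path and label are placeholders):

from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset("train.csv")  # placeholder dataset
predictor = TabularPredictor(label="class").fit(
    train_data,
    presets="extreme",  # <=30000 rows routes to `zeroshot_2025_tabfm`, larger data to `zeroshot`
)
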
autogluon/tabular/trainer/abstract_trainer.py

@@ -2131,6 +2131,8 @@ class AbstractTabularTrainer(AbstractTrainer[AbstractModel]):
        if isinstance(model, BaggedEnsembleModel) and not compute_score:
            # Do not perform OOF predictions when we don't compute a score.
            model_fit_kwargs["_skip_oof"] = True
+        if not isinstance(model, BaggedEnsembleModel):
+            model_fit_kwargs.setdefault("log_resources", True)
 
        model_fit_kwargs = dict(
            model=model,
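
`dict.setdefault` only fills the key when it is absent, so an explicit caller-supplied `log_resources` still wins:

model_fit_kwargs = {"log_resources": False}         # hypothetical explicit override
model_fit_kwargs.setdefault("log_resources", True)  # no-op: key already present
assert model_fit_kwargs["log_resources"] is False
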
autogluon/tabular/version.py

@@ -1,4 +1,4 @@
 """This is the autogluon version file."""
 
-__version__ = "1.3.2b20250723"
+__version__ = "1.4.0b20250725"
 __lite__ = False