autogluon.tabular 1.3.2b20250709__py3-none-any.whl → 1.3.2b20250711__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. autogluon/tabular/models/__init__.py +3 -0
  2. autogluon/tabular/models/catboost/callbacks.py +3 -2
  3. autogluon/tabular/models/catboost/catboost_model.py +2 -2
  4. autogluon/tabular/models/catboost/catboost_utils.py +7 -3
  5. autogluon/tabular/models/fastainn/tabular_nn_fastai.py +3 -3
  6. autogluon/tabular/models/lgb/lgb_model.py +2 -2
  7. autogluon/tabular/models/realmlp/__init__.py +0 -0
  8. autogluon/tabular/models/realmlp/realmlp_model.py +347 -0
  9. autogluon/tabular/models/rf/rf_model.py +2 -1
  10. autogluon/tabular/models/tabicl/__init__.py +0 -0
  11. autogluon/tabular/models/tabicl/tabicl_model.py +174 -0
  12. autogluon/tabular/models/tabm/__init__.py +0 -0
  13. autogluon/tabular/models/tabm/_tabm_internal.py +544 -0
  14. autogluon/tabular/models/tabm/rtdl_num_embeddings.py +807 -0
  15. autogluon/tabular/models/tabm/tabm_model.py +275 -0
  16. autogluon/tabular/models/tabm/tabm_reference.py +627 -0
  17. autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +3 -3
  18. autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +3 -3
  19. autogluon/tabular/models/xgboost/xgboost_model.py +2 -2
  20. autogluon/tabular/predictor/predictor.py +5 -3
  21. autogluon/tabular/registry/_ag_model_registry.py +6 -0
  22. autogluon/tabular/testing/fit_helper.py +27 -25
  23. autogluon/tabular/testing/generate_datasets.py +7 -0
  24. autogluon/tabular/trainer/abstract_trainer.py +1 -1
  25. autogluon/tabular/trainer/model_presets/presets.py +10 -1
  26. autogluon/tabular/version.py +1 -1
  27. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/METADATA +21 -13
  28. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/RECORD +35 -26
  29. /autogluon.tabular-1.3.2b20250709-py3.9-nspkg.pth → /autogluon.tabular-1.3.2b20250711-py3.9-nspkg.pth +0 -0
  30. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/LICENSE +0 -0
  31. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/NOTICE +0 -0
  32. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/WHEEL +0 -0
  33. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/namespace_packages.txt +0 -0
  34. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/top_level.txt +0 -0
  35. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/zip-safe +0 -0
@@ -17,7 +17,10 @@ from .imodels.imodels_models import (
17
17
  from .knn.knn_model import KNNModel
18
18
  from .lgb.lgb_model import LGBModel
19
19
  from .lr.lr_model import LinearModel
20
+ from .realmlp.realmlp_model import RealMLPModel
20
21
  from .rf.rf_model import RFModel
22
+ from .tabicl.tabicl_model import TabICLModel
23
+ from .tabm.tabm_model import TabMModel
21
24
  from .tabpfn.tabpfn_model import TabPFNModel
22
25
  from .tabpfnmix.tabpfnmix_model import TabPFNMixModel
23
26
  from .tabular_nn.torch.tabular_nn_torch import TabularNeuralNetTorchModel
@@ -170,14 +170,15 @@ class EarlyStoppingCallback:
170
170
 
171
171
  self.eval_metric_name = eval_metric_name
172
172
  self.is_max_optimal = is_max_optimal
173
- self.is_quantile = self.eval_metric_name.startswith(CATBOOST_QUANTILE_PREFIX)
173
+ self.is_quantile = CATBOOST_QUANTILE_PREFIX in self.eval_metric_name
174
174
 
175
175
  def after_iteration(self, info):
176
176
  is_best_iter = False
177
177
  if self.is_quantile:
178
178
  # FIXME: CatBoost adds extra ',' in the metric name if quantile levels are not balanced
179
179
  # e.g., 'MultiQuantile:alpha=0.1,0.25,0.5,0.95' becomes 'MultiQuantile:alpha=0.1,,0.25,0.5,0.95'
180
- eval_metric_name = [k for k in info.metrics[self.compare_key] if k.startswith(CATBOOST_QUANTILE_PREFIX)][0]
180
+ # `'Quantile:' in k` catches both multiquantile (MultiQuantile:) and single-quantile mode (Quantile:)
181
+ eval_metric_name = [k for k in info.metrics[self.compare_key] if CATBOOST_QUANTILE_PREFIX in k][0]
181
182
  else:
182
183
  eval_metric_name = self.eval_metric_name
183
184
  cur_score = info.metrics[self.compare_key][eval_metric_name][-1]
@@ -350,8 +350,8 @@ class CatBoostModel(AbstractModel):
350
350
  return minimum_resources
351
351
 
352
352
  def _get_default_resources(self):
353
- # logical=False is faster in training
354
- num_cpus = ResourceManager.get_cpu_count_psutil(logical=False)
353
+ # only_physical_cores=True is faster in training
354
+ num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
355
355
  num_gpus = 0
356
356
  return num_cpus, num_gpus
357
357
 
@@ -5,7 +5,7 @@ from autogluon.core.constants import BINARY, MULTICLASS, QUANTILE, REGRESSION, S
5
5
  logger = logging.getLogger(__name__)
6
6
 
7
7
 
8
- CATBOOST_QUANTILE_PREFIX = "MultiQuantile:"
8
+ CATBOOST_QUANTILE_PREFIX = "Quantile:"
9
9
 
10
10
 
11
11
  # TODO: Add weight support?
@@ -74,8 +74,12 @@ def get_catboost_metric_from_ag_metric(metric, problem_type, quantile_levels=Non
74
74
  raise AssertionError(f"quantile_levels must be provided for problem_type = {problem_type}")
75
75
  if not all(0 < q < 1 for q in quantile_levels):
76
76
  raise AssertionError(f"quantile_levels must fulfill 0 < q < 1, provided quantile_levels: {quantile_levels}")
77
- quantile_string = ",".join(str(q) for q in quantile_levels)
78
- metric_class = f"{CATBOOST_QUANTILE_PREFIX}alpha={quantile_string}"
77
+ # Loss function MultiQuantile: can only be used if len(quantile_levels) >= 2, otherwise we must use Quantile:
78
+ if len(quantile_levels) == 1:
79
+ metric_class = f"{CATBOOST_QUANTILE_PREFIX}alpha={quantile_levels[0]}"
80
+ else:
81
+ quantile_string = ",".join(str(q) for q in quantile_levels)
82
+ metric_class = f"Multi{CATBOOST_QUANTILE_PREFIX}alpha={quantile_string}"
79
83
  else:
80
84
  raise AssertionError(f"CatBoost does not support {problem_type} problem type.")
81
85
 
@@ -584,8 +584,8 @@ class NNFastAiTabularModel(AbstractModel):
584
584
  return default_auxiliary_params
585
585
 
586
586
  def _get_default_resources(self):
587
- # logical=False is faster in training
588
- num_cpus = ResourceManager.get_cpu_count_psutil(logical=False)
587
+ # only_physical_cores=True is faster in training
588
+ num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
589
589
  num_gpus = 0
590
590
  return num_cpus, num_gpus
591
591
 
@@ -642,7 +642,7 @@ class NNFastAiTabularModel(AbstractModel):
642
642
 
643
643
  def _get_maximum_resources(self) -> dict[str, Union[int, float]]:
644
644
  # fastai model trains slower when utilizing virtual cores and this issue scale up when the number of cpu cores increases
645
- return {"num_cpus": ResourceManager.get_cpu_count_psutil(logical=False)}
645
+ return {"num_cpus": ResourceManager.get_cpu_count(only_physical_cores=True)}
646
646
 
647
647
  def get_minimum_resources(self, is_gpu_available=False):
648
648
  minimum_resources = {
@@ -532,8 +532,8 @@ class LGBModel(AbstractModel):
532
532
  return minimum_resources
533
533
 
534
534
  def _get_default_resources(self):
535
- # logical=False is faster in training
536
- num_cpus = ResourceManager.get_cpu_count_psutil(logical=False)
535
+ # only_physical_cores=True is faster in training
536
+ num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
537
537
  num_gpus = 0
538
538
  return num_cpus, num_gpus
539
539
 
File without changes
@@ -0,0 +1,347 @@
1
+ """
2
+ Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/realmlp/realmlp_model.py
3
+
4
+ Model: RealMLP
5
+ Paper: Better by Default: Strong Pre-Tuned MLPs and Boosted Trees on Tabular Data
6
+ Authors: David Holzmüller, Léo Grinsztajn, Ingo Steinwart
7
+ Codebase: https://github.com/dholzmueller/pytabkit
8
+ License: Apache-2.0
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import math
15
+ import time
16
+ from contextlib import contextmanager
17
+ from typing import Literal
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ from sklearn.impute import SimpleImputer
22
+
23
+ from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
24
+ from autogluon.common.utils.resource_utils import ResourceManager
25
+ from autogluon.core.models import AbstractModel
26
+ from autogluon.tabular import __version__
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
@contextmanager
def set_logger_level(logger_name: str, level: int):
    """
    Temporarily override the level of a named logger.

    The logger's current level is recorded on entry, replaced with ``level`` for the
    duration of the ``with`` body, and restored on exit, including when the body raises.

    Parameters
    ----------
    logger_name : str
        Name passed to ``logging.getLogger`` to look up the logger to adjust.
    level : int
        Logging level to apply while the context is active (e.g. ``logging.WARNING``).
    """
    target_logger = logging.getLogger(logger_name)
    previous_level = target_logger.level
    target_logger.setLevel(level)
    try:
        yield
    finally:
        # Always put the original level back, even if the wrapped code failed.
        target_logger.setLevel(previous_level)
40
+
41
+
42
+ # pip install pytabkit
43
class RealMLPModel(AbstractModel):
    """
    AutoGluon wrapper around the RealMLP estimators from ``pytabkit``.

    Depending on the problem type and the ``default_hyperparameters`` setting, one of
    ``RealMLP_TD_Classifier``, ``RealMLP_TD_S_Classifier``, ``RealMLP_TD_Regressor`` or
    ``RealMLP_TD_S_Regressor`` is fit. Before fitting, numeric features are mean-imputed
    with missing-value indicator columns, and boolean features can optionally be cast to
    categorical (see `_preprocess`).
    """

    ag_key = "REALMLP"
    ag_name = "RealMLP"
    ag_priority = 75

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Preprocessing state; populated by `_preprocess(..., is_train=True)`.
        self._imputer = None
        self._features_to_impute = None
        self._features_to_keep = None
        self._indicator_columns = None
        self._features_bool = None
        self._bool_to_cat = None

    def get_model_cls(self, default_hyperparameters: Literal["td", "td_s"] = "td"):
        """
        Return the pytabkit estimator class matching the problem type and hyperparameter preset.

        Parameters
        ----------
        default_hyperparameters : {"td", "td_s"}, default "td"
            Which pytabkit default preset to use. "td" selects the ``RealMLP_TD_*`` classes,
            "td_s" selects the ``RealMLP_TD_S_*`` classes.

        Returns
        -------
        The (uninstantiated) classifier class for binary/multiclass problems,
        otherwise the regressor class.
        """
        from pytabkit import RealMLP_TD_Classifier, RealMLP_TD_Regressor, RealMLP_TD_S_Classifier, RealMLP_TD_S_Regressor

        assert default_hyperparameters in ["td", "td_s"]
        if self.problem_type in ["binary", "multiclass"]:
            if default_hyperparameters == "td":
                model_cls = RealMLP_TD_Classifier
            else:
                model_cls = RealMLP_TD_S_Classifier
        else:
            if default_hyperparameters == "td":
                model_cls = RealMLP_TD_Regressor
            else:
                model_cls = RealMLP_TD_S_Regressor
        return model_cls

    def _fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        X_val: pd.DataFrame = None,
        y_val: pd.Series = None,
        time_limit: float = None,
        num_cpus: int = 1,
        num_gpus: float = 0,
        verbosity: int = 2,
        **kwargs,
    ):
        """
        Fit the RealMLP model.

        Parameters
        ----------
        X, y : training features and labels.
        X_val, y_val : optional validation data. When ``X_val`` is None, early stopping is
            disabled and ``val_fraction`` is set to 0.
        time_limit : optional time budget in seconds; the remaining budget (after setup)
            is forwarded to pytabkit as ``time_to_fit_in_seconds``.
        num_cpus : forwarded to pytabkit as ``n_threads``.
        num_gpus : 0 selects device "cpu", anything else selects "cuda:0".
        verbosity : controls the temporary log level of the "lightning.pytorch" logger during fit.

        Raises
        ------
        ImportError
            If ``pytabkit`` or ``torch`` cannot be imported.
        AssertionError
            If a GPU was requested but CUDA is not available.
        """
        start_time = time.time()

        try:
            # Imported only to verify the optional dependencies are installed.
            import pytabkit
            import torch
        except ImportError as err:
            logger.log(
                40,
                f"\tFailed to import pytabkit/torch! To use the RealMLP model, "
                f"do: `pip install autogluon.tabular[realmlp]=={__version__}`.",
            )
            raise err

        if verbosity == 0:
            _lightning_log_level = logging.ERROR
        elif verbosity <= 2:
            _lightning_log_level = logging.WARNING
        else:
            _lightning_log_level = logging.INFO

        # FIXME: code assume we only see one GPU in the fit process.
        device = "cpu" if num_gpus == 0 else "cuda:0"
        if (device == "cuda:0") and (not torch.cuda.is_available()):
            raise AssertionError(
                "Fit specified to use GPU, but CUDA is not available on this machine. "
                "Please switch to CPU usage instead.",
            )

        hyp = self._get_model_params()

        default_hyperparameters = hyp.pop("default_hyperparameters", "td")

        model_cls = self.get_model_cls(default_hyperparameters=default_hyperparameters)

        # Maps AutoGluon stopping-metric names to pytabkit validation-metric names.
        metric_map = {
            "roc_auc": "1-auc_ovr_alt",
            "accuracy": "class_error",
            "balanced_accuracy": "1-balanced_accuracy",
            "log_loss": "cross_entropy",
            "rmse": "rmse",
            "root_mean_squared_error": "rmse",
            "r2": "rmse",
            "mae": "mae",
            "mean_absolute_error": "mae",
            # Legacy misspelling kept so any caller relying on it keeps working.
            "mean_average_error": "mae",
        }

        val_metric_name = metric_map.get(self.stopping_metric.name, None)

        init_kwargs = dict()

        if val_metric_name is not None:
            init_kwargs["val_metric_name"] = val_metric_name

        # TODO: Make this smarter? Maybe use `eval_metric.needs_pred`
        use_ls = hyp.get("use_ls")
        if isinstance(use_ls, str) and use_ls == "auto":
            # Resolve "auto": disabled when the metric is unmapped or is cross_entropy / 1-auc_ovr_alt,
            # otherwise None (presumably defers to pytabkit's own default -- confirm).
            if val_metric_name is None:
                hyp["use_ls"] = False
            elif val_metric_name in ["cross_entropy", "1-auc_ovr_alt"]:
                hyp["use_ls"] = False
            else:
                hyp["use_ls"] = None

        if X_val is None:
            hyp["use_early_stopping"] = False
            hyp["val_fraction"] = 0

        # These keys steer AutoGluon-side preprocessing and must not reach the pytabkit constructor.
        bool_to_cat = hyp.pop("bool_to_cat", True)
        impute_bool = hyp.pop("impute_bool", True)
        name_categories = hyp.pop("name_categories", True)

        n_features = len(X.columns)
        if "predict_batch_size" in hyp and isinstance(hyp["predict_batch_size"], str) and hyp["predict_batch_size"] == "auto":
            # simple heuristic to avoid OOM during inference time
            # note: this isn't fool-proof, and ignores the actual memory availability of the machine.
            # note: this is based on an assumption of 32 GB of memory available on the instance
            # default is 1024
            # max(n_features, 1) guards against division by zero on a frame with no columns.
            hyp["predict_batch_size"] = max(min(int(8192 * 200 / max(n_features, 1)), 8192), 64)

        self.model = model_cls(
            n_threads=num_cpus,
            device=device,
            **init_kwargs,
            **hyp,
        )

        X = self.preprocess(X, is_train=True, bool_to_cat=bool_to_cat, impute_bool=impute_bool)

        # FIXME: In rare cases can cause exceptions if name_categories=False, unknown why
        extra_fit_kwargs = {}
        if name_categories:
            cat_col_names = X.select_dtypes(include="category").columns.tolist()
            extra_fit_kwargs["cat_col_names"] = cat_col_names

        if X_val is not None:
            X_val = self.preprocess(X_val)

        with set_logger_level("lightning.pytorch", _lightning_log_level):
            # Only the time left after the setup above is handed to pytabkit.
            time_to_fit = time_limit - (time.time() - start_time) if time_limit is not None else None
            self.model = self.model.fit(
                X=X,
                y=y,
                X_val=X_val,
                y_val=y_val,
                time_to_fit_in_seconds=time_to_fit,
                **extra_fit_kwargs,
            )

    def _predict_proba(self, X, **kwargs) -> np.ndarray:
        """Predict via the parent implementation while capping "lightning.pytorch" logging at WARNING."""
        with set_logger_level("lightning.pytorch", logging.WARNING):
            # Unpack kwargs so they reach the parent as real keyword arguments
            # rather than as a single keyword literally named "kwargs".
            return super()._predict_proba(X=X, **kwargs)

    # TODO: Move missing indicator + mean fill to a generic preprocess flag available to all models
    # FIXME: bool_to_cat is a hack: Maybe move to abstract model?
    def _preprocess(self, X: pd.DataFrame, is_train: bool = False, bool_to_cat: bool = False, impute_bool: bool = True, **kwargs) -> pd.DataFrame:
        """
        Imputes missing values via the mean and adds indicator columns for numerical features.
        Converts indicator columns to categorical features to avoid them being treated as numerical by RealMLP.

        Parameters
        ----------
        X : data to transform (after the parent class preprocessing).
        is_train : when True, the imputer and the feature lists are (re)computed from
            ``self._feature_metadata`` and ``X`` and stored on the instance; otherwise the stored state is reused.
        bool_to_cat : only read when ``is_train`` is True; if set, features with the "bool"
            special type are cast to category.
        impute_bool : only read when ``is_train`` is True; if False, "bool" features are
            excluded from imputation and passed through unchanged.

        Returns
        -------
        pd.DataFrame
            The kept (non-imputed) features followed by the imputed features and indicator columns.
        """
        X = super()._preprocess(X, **kwargs)

        # FIXME: is copy needed?
        X = X.copy(deep=True)
        if is_train:
            self._bool_to_cat = bool_to_cat
            self._features_bool = self._feature_metadata.get_features(required_special_types=["bool"])
            if impute_bool:  # Technically this should do nothing useful because bools will never have NaN
                self._features_to_impute = self._feature_metadata.get_features(valid_raw_types=["int", "float"])
                self._features_to_keep = self._feature_metadata.get_features(invalid_raw_types=["int", "float"])
            else:
                self._features_to_impute = self._feature_metadata.get_features(valid_raw_types=["int", "float"], invalid_special_types=["bool"])
                # Set lookup keeps this linear in the number of features; list order is preserved.
                features_to_impute_set = set(self._features_to_impute)
                self._features_to_keep = [f for f in self._feature_metadata.get_features() if f not in features_to_impute_set]
            if self._features_to_impute:
                self._imputer = SimpleImputer(strategy="mean", add_indicator=True)
                self._imputer.fit(X=X[self._features_to_impute])
                # Any output column that is not an original feature is a missing-value indicator.
                imputed_feature_set = set(self._features_to_impute)
                self._indicator_columns = [c for c in self._imputer.get_feature_names_out() if c not in imputed_feature_set]
        if self._imputer is not None:
            X_impute = self._imputer.transform(X=X[self._features_to_impute])
            X_impute = pd.DataFrame(X_impute, index=X.index, columns=self._imputer.get_feature_names_out())
            if self._indicator_columns:
                # FIXME: Use CategoryFeatureGenerator? Or tell the model which is category
                # TODO: Add to features_bool?
                X_impute[self._indicator_columns] = X_impute[self._indicator_columns].astype("category")
            X = pd.concat([X[self._features_to_keep], X_impute], axis=1)
        if self._bool_to_cat and self._features_bool:
            # FIXME: Use CategoryFeatureGenerator? Or tell the model which is category
            X[self._features_bool] = X[self._features_bool].astype("category")
        return X

    def _set_default_params(self):
        """Register the default hyperparameters (user-specified values take precedence via `_set_default_param_value`)."""
        default_params = dict(
            random_state=0,

            # Don't use early stopping by default, seems to work well without
            use_early_stopping=False,
            early_stopping_additive_patience=40,
            early_stopping_multiplicative_patience=3,

            # verdict: use_ls="auto" is much better than None.
            use_ls="auto",

            # verdict: no impact, but makes more sense to be False.
            impute_bool=False,

            # verdict: name_categories=True avoids random exceptions being raised in rare cases
            name_categories=True,

            # verdict: bool_to_cat=True is equivalent to False in terms of quality, but can be slightly faster in training time
            #  and slightly slower in inference time
            bool_to_cat=True,

            # verdict: "td" is better than "td_s"
            default_hyperparameters="td",  # options ["td", "td_s"]

            predict_batch_size="auto",  # if auto, uses AutoGluon's heuristic to set a value between 8192 and 64.
        )
        for param, val in default_params.items():
            self._set_default_param_value(param, val)

    @classmethod
    def supported_problem_types(cls) -> list[str] | None:
        """Problem types this model can be fit on."""
        return ["binary", "multiclass", "regression"]

    def _get_default_stopping_metric(self):
        """Use the evaluation metric as the stopping metric."""
        return self.eval_metric

    def _get_default_resources(self) -> tuple[int, int]:
        """Default to all physical CPU cores and at most one GPU (as reported via torch)."""
        # only_physical_cores=True is faster in training
        num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
        num_gpus = min(ResourceManager.get_gpu_count_torch(), 1)
        return num_cpus, num_gpus

    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
        """Estimate fit memory usage for ``X`` using this model's current hyperparameters."""
        hyperparameters = self._get_model_params()
        return self.estimate_memory_usage_static(X=X, problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=hyperparameters, **kwargs)

    @classmethod
    def _estimate_memory_usage_static(
        cls,
        *,
        X: pd.DataFrame,
        hyperparameters: dict = None,
        **kwargs,
    ) -> int:
        """
        Heuristic memory estimate that correlates strongly with RealMLP's more sophisticated method

        The estimate is the sum of roughly 5x the approximate DataFrame memory, a per-column
        term scaled by ``plr_hidden_1``, ``plr_hidden_2`` and ``hidden_width``, and a fixed
        300 MB overhead.

        More comprehensive memory estimate logic:

        ```python
        import copy
        from typing import Any

        from pytabkit.models.alg_interfaces.nn_interfaces import NNAlgInterface
        from pytabkit.models.data.data import DictDataset, TensorInfo
        from pytabkit.models.sklearn.default_params import DefaultParams

        def estimate_realmlp_cpu_ram_gb(hparams: dict[str, Any], n_numerical: int, cat_sizes: list[int], n_classes: int,
                                        n_samples: int):
            params = copy.copy(DefaultParams.RealMLP_TD_CLASS if n_classes > 0 else DefaultParams.RealMLP_TD_REG)
            params.update(hparams)

            ds = DictDataset(tensors=None, tensor_infos=dict(x_cont=TensorInfo(feat_shape=[n_numerical]),
                                                             x_cat=TensorInfo(cat_sizes=cat_sizes),
                                                             y=TensorInfo(cat_sizes=[n_classes])), device='cpu',
                             n_samples=n_samples)

            alg_interface = NNAlgInterface(**params)
            res = alg_interface.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=1, split_seeds=[0], n_train=n_samples)
            return res.cpu_ram_gb
        ```

        """
        if hyperparameters is None:
            hyperparameters = {}
        plr_hidden_1 = hyperparameters.get("plr_hidden_1", 16)
        plr_hidden_2 = hyperparameters.get("plr_hidden_2", 4)
        hidden_width = hyperparameters.get("hidden_width", 256)

        num_features = len(X.columns)
        columns_mem_est = num_features * 8e5

        hidden_1_weight = 0.13
        hidden_2_weight = 0.42
        width_factor = math.sqrt(hidden_width / 256 + 0.6)

        columns_mem_est_hidden_1 = columns_mem_est * hidden_1_weight * plr_hidden_1 / 16 * width_factor
        columns_mem_est_hidden_2 = columns_mem_est * hidden_2_weight * plr_hidden_2 / 16 * width_factor
        columns_mem_est = columns_mem_est_hidden_1 + columns_mem_est_hidden_2

        dataset_size_mem_est = 5 * get_approximate_df_mem_usage(X).sum()  # roughly 5x DataFrame memory size
        baseline_overhead_mem_est = 3e8  # 300 MB generic overhead

        mem_estimate = dataset_size_mem_est + columns_mem_est + baseline_overhead_mem_est

        return mem_estimate

    @classmethod
    def _class_tags(cls) -> dict:
        """Advertise that memory usage can be estimated without a fitted instance."""
        return {"can_estimate_memory_usage_static": True}

    def _more_tags(self) -> dict:
        """Instance-level tags; refit_full is disabled for this model."""
        # TODO: Need to add train params support, track best epoch
        #  How to mirror RealMLP learning rate scheduler while forcing stopping at a specific epoch?
        tags = {"can_refit_full": False}
        return tags
@@ -309,8 +309,9 @@ class RFModel(AbstractModel):
309
309
  if self.model.n_outputs_ == 1:
310
310
  self.model.n_classes_ = [self.model.n_classes_]
311
311
  from sklearn.tree._tree import DOUBLE, DTYPE
312
+ from sklearn.utils.validation import check_X_y
312
313
 
313
- X, y = self.model._validate_data(X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE)
314
+ X, y = check_X_y(X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE)
314
315
  if y.ndim == 1:
315
316
  # reshape is necessary to preserve the data contiguity against vs
316
317
  # [:, np.newaxis] that does not.
File without changes
@@ -0,0 +1,174 @@
1
+ """
2
+ Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/tabicl/tabicl_model.py
3
+ Model: TabICL
4
+ Paper: TabICL: A Tabular Foundation Model for In-Context Learning on Large Data
5
+ Authors: Jingang Qu, David Holzmüller, Gaël Varoquaux, Marine Le Morvan
6
+ Codebase: https://github.com/soda-inria/tabicl
7
+ License: BSD-3-Clause
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+
14
+ import pandas as pd
15
+
16
+ from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
17
+ from autogluon.common.utils.resource_utils import ResourceManager
18
+ from autogluon.core.models import AbstractModel
19
+ from autogluon.tabular import __version__
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ # TODO: Verify if crashes when weights are not yet downloaded and fit in parallel
25
class TabICLModel(AbstractModel):
    """
    AutoGluon wrapper around ``tabicl.TabICLClassifier``.

    Only binary and multiclass problems are supported. By default, bagged folds are fit
    sequentially and refit afterwards (see `_get_default_ag_args_ensemble`).
    """

    ag_key = "TABICL"
    ag_name = "TabICL"
    ag_priority = 65

    def get_model_cls(self):
        """
        Return the tabicl estimator class for the current problem type.

        Raises
        ------
        AssertionError
            If the problem type is neither "binary" nor "multiclass".
        """
        from tabicl import TabICLClassifier

        if self.problem_type not in ["binary", "multiclass"]:
            raise AssertionError(f"Unsupported problem_type: {self.problem_type}")
        return TabICLClassifier

    @staticmethod
    def _get_batch_size(n_cells: int):
        """Pick a batch size from the number of cells (rows * columns): 8, 4 or 2 as the data grows."""
        if n_cells <= 4_000_000:
            return 8
        if n_cells <= 6_000_000:
            return 4
        return 2

    def _fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        num_cpus: int = 1,
        num_gpus: int = 0,
        **kwargs,
    ):
        """
        Fit the TabICL classifier on ``X`` and ``y``.

        ``num_cpus`` is forwarded as ``n_jobs``; a non-zero ``num_gpus`` selects the "cuda"
        device. Unless the user provided ``batch_size``, it is derived from the number of
        cells in the raw (not yet preprocessed) ``X``.

        Raises
        ------
        ImportError
            If ``tabicl`` cannot be imported.
        AssertionError
            If a GPU was requested but CUDA is not available.
        """
        try:
            # Imported only to verify that the optional dependency is installed.
            import tabicl
        except ImportError as err:
            logger.log(
                40,
                f"\tFailed to import tabicl! To use the TabICL model, "
                f"do: `pip install autogluon.tabular[tabicl]=={__version__}`.",
            )
            raise err

        from torch.cuda import is_available

        gpu_requested = num_gpus != 0
        device = "cuda" if gpu_requested else "cpu"
        if gpu_requested and (not is_available()):
            # FIXME: warn instead and switch to CPU.
            raise AssertionError(
                "Fit specified to use GPU, but CUDA is not available on this machine. "
                "Please switch to CPU usage instead.",
            )

        estimator_cls = self.get_model_cls()
        params = self._get_model_params()
        n_cells = X.shape[0] * X.shape[1]
        # A user-specified batch_size wins; otherwise fall back to the size-based heuristic.
        params.setdefault("batch_size", self._get_batch_size(n_cells))
        self.model = estimator_cls(
            **params,
            device=device,
            n_jobs=num_cpus,
        )
        X = self.preprocess(X)
        self.model = self.model.fit(
            X=X,
            y=y,
        )

    def _set_default_params(self):
        """Register the default hyperparameters (currently only ``random_state``)."""
        self._set_default_param_value("random_state", 42)

    def _get_default_auxiliary_params(self) -> dict:
        """Extend the parent's auxiliary params with row and feature limits for this model."""
        aux_params = super()._get_default_auxiliary_params()
        aux_params["max_rows"] = 100000
        aux_params["max_features"] = 500
        return aux_params

    @classmethod
    def supported_problem_types(cls) -> list[str] | None:
        """Problem types this model can be fit on."""
        return ["binary", "multiclass"]

    def _get_default_resources(self) -> tuple[int, int]:
        """Default to all physical CPU cores and at most one GPU (as reported via torch)."""
        cpu_count = ResourceManager.get_cpu_count(only_physical_cores=True)
        gpu_count = min(ResourceManager.get_gpu_count_torch(), 1)
        return cpu_count, gpu_count

    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
        """Estimate fit memory usage for ``X`` using this model's current hyperparameters."""
        return self.estimate_memory_usage_static(
            X=X,
            problem_type=self.problem_type,
            num_classes=self.num_classes,
            hyperparameters=self._get_model_params(),
            **kwargs,
        )

    @classmethod
    def _estimate_memory_usage_static(
        cls,
        *,
        X: pd.DataFrame,
        hyperparameters: dict = None,
        **kwargs,
    ) -> int:
        """
        Heuristic memory estimate that is very primitive.
        Can be vastly improved.

        The estimate is the sum of a model term (driven by batch size, rows and features,
        then inflated by 1.3 and 1.5), roughly 3x the approximate DataFrame memory, and a
        fixed 1 GB overhead.
        """
        hyperparameters = {} if hyperparameters is None else hyperparameters

        n_rows, n_features = X.shape[0], X.shape[1]
        batch_size = hyperparameters.get("batch_size", cls._get_batch_size(n_rows * n_features))
        embedding_dim = 128
        bytes_per_float = 4

        model_mem_estimate = 2 * batch_size * embedding_dim * bytes_per_float * (4 + n_rows) * n_features
        model_mem_estimate *= 1.3  # add 30% buffer
        # TODO: Observed memory spikes above expected values on large datasets, increasing mem estimate to compensate
        model_mem_estimate *= 1.5

        dataset_size_mem_est = 3 * get_approximate_df_mem_usage(X).sum()  # roughly 3x DataFrame memory size
        baseline_overhead_mem_est = 1e9  # 1 GB generic overhead

        return model_mem_estimate + dataset_size_mem_est + baseline_overhead_mem_est

    @classmethod
    def _get_default_ag_args_ensemble(cls, **kwargs) -> dict:
        """
        Set fold_fitting_strategy to sequential_local,
        as parallel folding crashes if model weights aren't pre-downloaded.
        """
        ag_args_ensemble = super()._get_default_ag_args_ensemble(**kwargs)
        # FIXME: If parallel, uses way more memory, seems to behave incorrectly, so we force sequential.
        ag_args_ensemble["fold_fitting_strategy"] = "sequential_local"
        # Better to refit the model for faster inference and similar quality as the bag.
        ag_args_ensemble["refit_folds"] = True
        return ag_args_ensemble

    @classmethod
    def _class_tags(cls) -> dict:
        """Advertise that memory usage can be estimated without a fitted instance."""
        return {"can_estimate_memory_usage_static": True}

    def _more_tags(self) -> dict:
        """Instance-level tags; this model supports refit_full."""
        return {"can_refit_full": True}
File without changes