autogluon.tabular 1.3.2b20250709__py3-none-any.whl → 1.3.2b20250710__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. autogluon/tabular/models/__init__.py +3 -0
  2. autogluon/tabular/models/catboost/callbacks.py +3 -2
  3. autogluon/tabular/models/catboost/catboost_model.py +2 -2
  4. autogluon/tabular/models/catboost/catboost_utils.py +7 -3
  5. autogluon/tabular/models/fastainn/tabular_nn_fastai.py +3 -3
  6. autogluon/tabular/models/lgb/lgb_model.py +2 -2
  7. autogluon/tabular/models/realmlp/__init__.py +0 -0
  8. autogluon/tabular/models/realmlp/realmlp_model.py +347 -0
  9. autogluon/tabular/models/rf/rf_model.py +2 -1
  10. autogluon/tabular/models/tabicl/__init__.py +0 -0
  11. autogluon/tabular/models/tabicl/tabicl_model.py +174 -0
  12. autogluon/tabular/models/tabm/__init__.py +0 -0
  13. autogluon/tabular/models/tabm/_tabm_internal.py +544 -0
  14. autogluon/tabular/models/tabm/rtdl_num_embeddings.py +807 -0
  15. autogluon/tabular/models/tabm/tabm_model.py +275 -0
  16. autogluon/tabular/models/tabm/tabm_reference.py +627 -0
  17. autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +3 -3
  18. autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +3 -3
  19. autogluon/tabular/models/xgboost/xgboost_model.py +2 -2
  20. autogluon/tabular/predictor/predictor.py +5 -3
  21. autogluon/tabular/registry/_ag_model_registry.py +6 -0
  22. autogluon/tabular/testing/fit_helper.py +27 -25
  23. autogluon/tabular/testing/generate_datasets.py +7 -0
  24. autogluon/tabular/trainer/abstract_trainer.py +1 -1
  25. autogluon/tabular/trainer/model_presets/presets.py +10 -1
  26. autogluon/tabular/version.py +1 -1
  27. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/METADATA +21 -13
  28. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/RECORD +35 -26
  29. /autogluon.tabular-1.3.2b20250709-py3.9-nspkg.pth → /autogluon.tabular-1.3.2b20250710-py3.9-nspkg.pth +0 -0
  30. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/LICENSE +0 -0
  31. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/NOTICE +0 -0
  32. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/WHEEL +0 -0
  33. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/namespace_packages.txt +0 -0
  34. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/top_level.txt +0 -0
  35. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/zip-safe +0 -0
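
Of the changes listed above, the most substantial are three new model families (RealMLP, TabICL, and TabM) together with their registration in _ag_model_registry.py and the trainer presets; the diff reproduced below covers the new tabm_model.py. As an illustrative sketch only (not part of the diff), the new model would typically be requested through TabularPredictor's hyperparameters argument using the ag_key the file defines ("TABM"); train_data and the label name are placeholders:

from autogluon.tabular import TabularPredictor

# Illustrative only: select the newly added TabM model by its ag_key.
# Requires the optional extra:  pip install autogluon.tabular[tabm]
predictor = TabularPredictor(label="target").fit(
    train_data,                    # pandas DataFrame containing the "target" column (placeholder)
    hyperparameters={"TABM": {}},  # empty dict -> TabM with its default hyperparameters
)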
autogluon/tabular/models/tabm/tabm_model.py
@@ -0,0 +1,275 @@
+"""
+Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/tabm/tabm_model.py
+Note: This is a custom implementation of TabM based on TabArena. Because the AutoGluon 1.4 release occurred at nearly
+the same time as TabM became available on PyPI, we chose to use TabArena's implementation
+for the AutoGluon 1.4 release, as it had already been benchmarked.
+
+Model: TabM
+Paper: TabM: Advancing Tabular Deep Learning with Parameter-Efficient Ensembling
+Authors: Yury Gorishniy, Akim Kotelnikov, Artem Babenko
+Codebase: https://github.com/yandex-research/tabm
+License: Apache-2.0
+
+Partially adapted from pytabkit's TabM implementation.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+
+import pandas as pd
+from autogluon.common.utils.resource_utils import ResourceManager
+from autogluon.core.models import AbstractModel
+from autogluon.tabular import __version__
+
+logger = logging.getLogger(__name__)
+
+
+class TabMModel(AbstractModel):
+    ag_key = "TABM"
+    ag_name = "TabM"
+    ag_priority = 85
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._imputer = None
+        self._features_to_impute = None
+        self._features_to_keep = None
+        self._indicator_columns = None
+        self._features_bool = None
+        self._bool_to_cat = None
+
+    def _fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        X_val: pd.DataFrame = None,
+        y_val: pd.Series = None,
+        time_limit: float | None = None,
+        num_cpus: int = 1,
+        num_gpus: float = 0,
+        **kwargs,
+    ):
+        start_time = time.time()
+
+        try:
+            # imports various dependencies such as torch
+            from ._tabm_internal import TabMImplementation
+            from torch.cuda import is_available
+        except ImportError as err:
+            logger.log(
+                40,
+                f"\tFailed to import tabm! To use the TabM model, "
+                f"do: `pip install autogluon.tabular[tabm]=={__version__}`.",
+            )
+            raise err
+
+        device = "cpu" if num_gpus == 0 else "cuda"
+        if (device == "cuda") and (not is_available()):
+            # FIXME: warn instead and switch to CPU.
+            raise AssertionError(
+                "Fit specified to use GPU, but CUDA is not available on this machine. "
+                "Please switch to CPU usage instead.",
+            )
+
+        if X_val is None:
+            from autogluon.core.utils import generate_train_test_split
+
+            X, X_val, y, y_val = generate_train_test_split(
+                X=X,
+                y=y,
+                problem_type=self.problem_type,
+                test_size=0.2,
+                random_state=0,
+            )
+
+        hyp = self._get_model_params()
+        bool_to_cat = hyp.pop("bool_to_cat", True)
+
+        X = self.preprocess(X, is_train=True, bool_to_cat=bool_to_cat)
+        if X_val is not None:
+            X_val = self.preprocess(X_val)
+
+        self.model = TabMImplementation(
+            n_threads=num_cpus,
+            device=device,
+            problem_type=self.problem_type,
+            early_stopping_metric=self.stopping_metric,
+            **hyp,
+        )
+
+        self.model.fit(
+            X_train=X,
+            y_train=y,
+            X_val=X_val,
+            y_val=y_val,
+            cat_col_names=X.select_dtypes(include="category").columns.tolist(),
+            time_to_fit_in_seconds=time_limit - (time.time() - start_time) if time_limit is not None else None,
+        )
+
+    # FIXME: bool_to_cat is a hack: Maybe move to abstract model?
+    def _preprocess(
+        self,
+        X: pd.DataFrame,
+        is_train: bool = False,
+        bool_to_cat: bool = False,
+        **kwargs,
+    ) -> pd.DataFrame:
+        """Converts boolean features to the category dtype (when bool_to_cat is enabled)
+        so that TabM treats them as categorical rather than numerical features.
+        """
+        X = super()._preprocess(X, **kwargs)
+
+        if is_train:
+            self._bool_to_cat = bool_to_cat
+            self._features_bool = self._feature_metadata.get_features(required_special_types=["bool"])
+        if self._bool_to_cat and self._features_bool:
+            # FIXME: Use CategoryFeatureGenerator? Or tell the model which is category
+            X = X.copy(deep=True)
+            X[self._features_bool] = X[self._features_bool].astype("category")
+
+        return X
+
+    def _set_default_params(self):
+        default_params = dict(
+            random_state=0,
+        )
+        for param, val in default_params.items():
+            self._set_default_param_value(param, val)
+
+    @classmethod
+    def supported_problem_types(cls) -> list[str] | None:
+        return ["binary", "multiclass", "regression"]
+
+    def _get_default_stopping_metric(self):
+        return self.eval_metric
+
+    def _get_default_resources(self) -> tuple[int, int]:
+        # only_physical_cores=True is faster in training
+        num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
+        num_gpus = min(ResourceManager.get_gpu_count_torch(), 1)
+        return num_cpus, num_gpus
+
+    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
+        hyperparameters = self._get_model_params()
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )
+
+    @classmethod
+    def _estimate_memory_usage_static(
+        cls,
+        *,
+        X: pd.DataFrame,
+        hyperparameters: dict = None,
+        num_classes: int | None = 1,
+        **kwargs,
+    ) -> int:
+        """
+        Heuristic memory estimate for TabM, adapted from the RealMLP memory estimate.
+        """
+        if num_classes is None:
+            num_classes = 1
+        if hyperparameters is None:
+            hyperparameters = {}
+
+        cat_sizes = []
+        for col in X.select_dtypes(include=["category", "object"]):
+            if isinstance(X[col].dtype, pd.CategoricalDtype):
+                # Use .cat.codes for category dtype
+                unique_codes = X[col].cat.codes.unique()
+            else:
+                # For object dtype, treat unique strings as codes
+                unique_codes = X[col].astype("category").cat.codes.unique()
+            cat_sizes.append(len(unique_codes))
+
+        n_numerical = len(X.select_dtypes(include=["number"]).columns)
+
+        # TODO: This estimates very high memory usage,
+        # we probably need to adjust batch size automatically to compensate
+        mem_estimate_bytes = cls._estimate_tabm_ram(
+            hyperparameters=hyperparameters,
+            n_numerical=n_numerical,
+            cat_sizes=cat_sizes,
+            n_classes=num_classes,
+            n_samples=len(X),
+        )
+
+        return mem_estimate_bytes
+
+    @classmethod
+    def _estimate_tabm_ram(
+        cls,
+        hyperparameters: dict,
+        n_numerical: int,
+        cat_sizes: list[int],
+        n_classes: int,
+        n_samples: int,
+    ) -> int:
+        num_emb_n_bins = hyperparameters.get("num_emb_n_bins", 48)
+        d_embedding = hyperparameters.get("d_embedding", 16)
+        d_block = hyperparameters.get("d_block", 512)
+        # not completely sure if this is hidden blocks or all blocks, taking the safe option below
+        n_blocks = hyperparameters.get("n_blocks", "auto")
+        if isinstance(n_blocks, str) and n_blocks == "auto":
+            n_blocks = 3
+        batch_size = hyperparameters.get("batch_size", "auto")
+        if isinstance(batch_size, str) and batch_size == "auto":
+            batch_size = cls.get_tabm_auto_batch_size(n_samples=n_samples)
+        tabm_k = hyperparameters.get("tabm_k", 32)
+        predict_batch_size = hyperparameters.get("eval_batch_size", 1024)
+
+        # not completely sure
+        n_params_num_emb = n_numerical * (num_emb_n_bins + 1) * d_embedding
+        n_params_mlp = (n_numerical + sum(cat_sizes)) * d_embedding * (d_block + tabm_k) \
+            + (n_blocks - 1) * d_block ** 2 \
+            + n_blocks * d_block + d_block * (1 + max(1, n_classes))
+        # 4 bytes per float, up to 5 copies of parameters (1 standard, 1 .grad, 2 adam, 1 best_epoch)
+        mem_params = 4 * 5 * (n_params_num_emb + n_params_mlp)
+
+        # compute number of floats in forward pass (per batch element)
+        # todo: numerical embedding layer (not sure if this is entirely correct)
+        n_floats_forward = n_numerical * (num_emb_n_bins + d_embedding)
+        # before and after scale
+        n_floats_forward += 2 * (sum(cat_sizes) + n_numerical * d_embedding)
+        # 2 for pre-act, post-act
+        n_floats_forward += n_blocks * 2 * d_block + 2 * max(1, n_classes)
+        # 2 for forward and backward, 4 bytes per float
+        mem_forward_backward = 4 * max(batch_size * 2, predict_batch_size) * n_floats_forward * tabm_k
+        # * 8 is pessimistic for the long tensors in the forward pass, 4 would probably suffice
+
+        mem_ds = n_samples * (4 * n_numerical + 8 * len(cat_sizes))
+
+        # some safety constants and offsets (the 5 is probably excessive)
+        mem_total = 5 * mem_ds + 1.2 * mem_forward_backward + 1.2 * mem_params + 0.3 * (1024 ** 3)
+
+        return int(mem_total)
+
+    @classmethod
+    def get_tabm_auto_batch_size(cls, n_samples: int) -> int:
+        # by Yury Gorishniy, inferred from the choices in the TabM paper.
+        if n_samples < 2_800:
+            return 32
+        if n_samples < 4_500:
+            return 64
+        if n_samples < 6_400:
+            return 128
+        if n_samples < 32_000:
+            return 256
+        if n_samples < 108_000:
+            return 512
+        return 1024
+
+    @classmethod
+    def _class_tags(cls):
+        return {"can_estimate_memory_usage_static": True}
+
+    def _more_tags(self) -> dict:
+        # TODO: Need to add train params support, track best epoch
+        # How to force stopping at a specific epoch?
+        return {"can_refit_full": False}
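
For readers who want to sanity-check the memory heuristic above, here is a minimal sketch (not part of the diff). It assumes that the public AbstractModel classmethod estimate_memory_usage_static, which _estimate_memory_usage delegates to above, forwards these keyword arguments to _estimate_memory_usage_static; the synthetic DataFrame is purely illustrative:

import numpy as np
import pandas as pd

from autogluon.tabular.models.tabm.tabm_model import TabMModel

# Synthetic 10k-row frame with two numerical and one categorical column (illustrative only).
rng = np.random.default_rng(0)
X = pd.DataFrame(
    {
        "num_0": rng.normal(size=10_000),
        "num_1": rng.normal(size=10_000),
        "cat_0": pd.Categorical(rng.integers(0, 8, size=10_000)),
    }
)

# 10_000 samples fall in the [6_400, 32_000) bucket of get_tabm_auto_batch_size -> batch size 256.
assert TabMModel.get_tabm_auto_batch_size(n_samples=10_000) == 256

# Heuristic fit-memory estimate in bytes, using TabM's default hyperparameters
# (assumes estimate_memory_usage_static is callable as a classmethod with these arguments).
mem_bytes = TabMModel.estimate_memory_usage_static(
    X=X,
    problem_type="binary",
    num_classes=2,
    hyperparameters={},
)
print(f"estimated fit memory: {mem_bytes / 1024 ** 3:.2f} GiB")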