autogluon.tabular 1.3.2b20250709__py3-none-any.whl → 1.3.2b20250710__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autogluon/tabular/models/__init__.py +3 -0
- autogluon/tabular/models/catboost/callbacks.py +3 -2
- autogluon/tabular/models/catboost/catboost_model.py +2 -2
- autogluon/tabular/models/catboost/catboost_utils.py +7 -3
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +3 -3
- autogluon/tabular/models/lgb/lgb_model.py +2 -2
- autogluon/tabular/models/realmlp/__init__.py +0 -0
- autogluon/tabular/models/realmlp/realmlp_model.py +347 -0
- autogluon/tabular/models/rf/rf_model.py +2 -1
- autogluon/tabular/models/tabicl/__init__.py +0 -0
- autogluon/tabular/models/tabicl/tabicl_model.py +174 -0
- autogluon/tabular/models/tabm/__init__.py +0 -0
- autogluon/tabular/models/tabm/_tabm_internal.py +544 -0
- autogluon/tabular/models/tabm/rtdl_num_embeddings.py +807 -0
- autogluon/tabular/models/tabm/tabm_model.py +275 -0
- autogluon/tabular/models/tabm/tabm_reference.py +627 -0
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +3 -3
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +3 -3
- autogluon/tabular/models/xgboost/xgboost_model.py +2 -2
- autogluon/tabular/predictor/predictor.py +5 -3
- autogluon/tabular/registry/_ag_model_registry.py +6 -0
- autogluon/tabular/testing/fit_helper.py +27 -25
- autogluon/tabular/testing/generate_datasets.py +7 -0
- autogluon/tabular/trainer/abstract_trainer.py +1 -1
- autogluon/tabular/trainer/model_presets/presets.py +10 -1
- autogluon/tabular/version.py +1 -1
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/METADATA +21 -13
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/RECORD +35 -26
- /autogluon.tabular-1.3.2b20250709-py3.9-nspkg.pth → /autogluon.tabular-1.3.2b20250710-py3.9-nspkg.pth +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/LICENSE +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/NOTICE +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/WHEEL +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250710.dist-info}/zip-safe +0 -0
@@ -17,7 +17,10 @@ from .imodels.imodels_models import (
|
|
17
17
|
from .knn.knn_model import KNNModel
|
18
18
|
from .lgb.lgb_model import LGBModel
|
19
19
|
from .lr.lr_model import LinearModel
|
20
|
+
from .realmlp.realmlp_model import RealMLPModel
|
20
21
|
from .rf.rf_model import RFModel
|
22
|
+
from .tabicl.tabicl_model import TabICLModel
|
23
|
+
from .tabm.tabm_model import TabMModel
|
21
24
|
from .tabpfn.tabpfn_model import TabPFNModel
|
22
25
|
from .tabpfnmix.tabpfnmix_model import TabPFNMixModel
|
23
26
|
from .tabular_nn.torch.tabular_nn_torch import TabularNeuralNetTorchModel
|
@@ -170,14 +170,15 @@ class EarlyStoppingCallback:
|
|
170
170
|
|
171
171
|
self.eval_metric_name = eval_metric_name
|
172
172
|
self.is_max_optimal = is_max_optimal
|
173
|
-
self.is_quantile = self.eval_metric_name
|
173
|
+
self.is_quantile = CATBOOST_QUANTILE_PREFIX in self.eval_metric_name
|
174
174
|
|
175
175
|
def after_iteration(self, info):
|
176
176
|
is_best_iter = False
|
177
177
|
if self.is_quantile:
|
178
178
|
# FIXME: CatBoost adds extra ',' in the metric name if quantile levels are not balanced
|
179
179
|
# e.g., 'MultiQuantile:alpha=0.1,0.25,0.5,0.95' becomes 'MultiQuantile:alpha=0.1,,0.25,0.5,0.95'
|
180
|
-
|
180
|
+
# `'Quantile:' in k` catches both multiquantile (MultiQuantile:) and single-quantile mode (Quantile:)
|
181
|
+
eval_metric_name = [k for k in info.metrics[self.compare_key] if CATBOOST_QUANTILE_PREFIX in k][0]
|
181
182
|
else:
|
182
183
|
eval_metric_name = self.eval_metric_name
|
183
184
|
cur_score = info.metrics[self.compare_key][eval_metric_name][-1]
|
@@ -350,8 +350,8 @@ class CatBoostModel(AbstractModel):
|
|
350
350
|
return minimum_resources
|
351
351
|
|
352
352
|
def _get_default_resources(self):
|
353
|
-
#
|
354
|
-
num_cpus = ResourceManager.
|
353
|
+
# only_physical_cores=True is faster in training
|
354
|
+
num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
|
355
355
|
num_gpus = 0
|
356
356
|
return num_cpus, num_gpus
|
357
357
|
|
@@ -5,7 +5,7 @@ from autogluon.core.constants import BINARY, MULTICLASS, QUANTILE, REGRESSION, S
|
|
5
5
|
logger = logging.getLogger(__name__)
|
6
6
|
|
7
7
|
|
8
|
-
CATBOOST_QUANTILE_PREFIX = "
|
8
|
+
CATBOOST_QUANTILE_PREFIX = "Quantile:"
|
9
9
|
|
10
10
|
|
11
11
|
# TODO: Add weight support?
|
@@ -74,8 +74,12 @@ def get_catboost_metric_from_ag_metric(metric, problem_type, quantile_levels=Non
|
|
74
74
|
raise AssertionError(f"quantile_levels must be provided for problem_type = {problem_type}")
|
75
75
|
if not all(0 < q < 1 for q in quantile_levels):
|
76
76
|
raise AssertionError(f"quantile_levels must fulfill 0 < q < 1, provided quantile_levels: {quantile_levels}")
|
77
|
-
|
78
|
-
|
77
|
+
# Loss function MultiQuantile: can only be used if len(quantile_levels) >= 2, otherwise we must use Quantile:
|
78
|
+
if len(quantile_levels) == 1:
|
79
|
+
metric_class = f"{CATBOOST_QUANTILE_PREFIX}alpha={quantile_levels[0]}"
|
80
|
+
else:
|
81
|
+
quantile_string = ",".join(str(q) for q in quantile_levels)
|
82
|
+
metric_class = f"Multi{CATBOOST_QUANTILE_PREFIX}alpha={quantile_string}"
|
79
83
|
else:
|
80
84
|
raise AssertionError(f"CatBoost does not support {problem_type} problem type.")
|
81
85
|
|
@@ -584,8 +584,8 @@ class NNFastAiTabularModel(AbstractModel):
|
|
584
584
|
return default_auxiliary_params
|
585
585
|
|
586
586
|
def _get_default_resources(self):
|
587
|
-
#
|
588
|
-
num_cpus = ResourceManager.
|
587
|
+
# only_physical_cores=True is faster in training
|
588
|
+
num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
|
589
589
|
num_gpus = 0
|
590
590
|
return num_cpus, num_gpus
|
591
591
|
|
@@ -642,7 +642,7 @@ class NNFastAiTabularModel(AbstractModel):
|
|
642
642
|
|
643
643
|
def _get_maximum_resources(self) -> dict[str, Union[int, float]]:
|
644
644
|
# fastai model trains slower when utilizing virtual cores and this issue scale up when the number of cpu cores increases
|
645
|
-
return {"num_cpus": ResourceManager.
|
645
|
+
return {"num_cpus": ResourceManager.get_cpu_count(only_physical_cores=True)}
|
646
646
|
|
647
647
|
def get_minimum_resources(self, is_gpu_available=False):
|
648
648
|
minimum_resources = {
|
@@ -532,8 +532,8 @@ class LGBModel(AbstractModel):
|
|
532
532
|
return minimum_resources
|
533
533
|
|
534
534
|
def _get_default_resources(self):
|
535
|
-
#
|
536
|
-
num_cpus = ResourceManager.
|
535
|
+
# only_physical_cores=True is faster in training
|
536
|
+
num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
|
537
537
|
num_gpus = 0
|
538
538
|
return num_cpus, num_gpus
|
539
539
|
|
File without changes
|
@@ -0,0 +1,347 @@
|
|
1
|
+
"""
|
2
|
+
Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/realmlp/realmlp_model.py
|
3
|
+
|
4
|
+
Model: RealMLP
|
5
|
+
Paper: Better by Default: Strong Pre-Tuned MLPs and Boosted Trees on Tabular Data
|
6
|
+
Authors: David Holzmüller, Léo Grinsztajn, Ingo Steinwart
|
7
|
+
Codebase: https://github.com/dholzmueller/pytabkit
|
8
|
+
License: Apache-2.0
|
9
|
+
"""
|
10
|
+
|
11
|
+
from __future__ import annotations
|
12
|
+
|
13
|
+
import logging
|
14
|
+
import math
|
15
|
+
import time
|
16
|
+
from contextlib import contextmanager
|
17
|
+
from typing import Literal
|
18
|
+
|
19
|
+
import numpy as np
|
20
|
+
import pandas as pd
|
21
|
+
from sklearn.impute import SimpleImputer
|
22
|
+
|
23
|
+
from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
|
24
|
+
from autogluon.common.utils.resource_utils import ResourceManager
|
25
|
+
from autogluon.core.models import AbstractModel
|
26
|
+
from autogluon.tabular import __version__
|
27
|
+
|
28
|
+
logger = logging.getLogger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
@contextmanager
def set_logger_level(logger_name: str, level: int):
    """
    Temporarily override the level of the logger named `logger_name`.

    The level that was set on the logger when the context was entered is
    restored on exit, including when the managed body raises.
    """
    target_logger = logging.getLogger(logger_name)
    previous_level = target_logger.level
    target_logger.setLevel(level)
    try:
        yield
    finally:
        # Always put the original level back, even on error.
        target_logger.setLevel(previous_level)
|
40
|
+
|
41
|
+
|
42
|
+
# pip install pytabkit
|
43
|
+
class RealMLPModel(AbstractModel):
|
44
|
+
ag_key = "REALMLP"
|
45
|
+
ag_name = "RealMLP"
|
46
|
+
ag_priority = 75
|
47
|
+
|
48
|
+
def __init__(self, **kwargs):
    """
    Initialise the model and reset all preprocessing state to None.

    The state attributes are filled in by `_preprocess` when it is called with `is_train=True`.
    """
    super().__init__(**kwargs)
    preprocessing_state = (
        "_imputer",
        "_features_to_impute",
        "_features_to_keep",
        "_indicator_columns",
        "_features_bool",
        "_bool_to_cat",
    )
    for attr_name in preprocessing_state:
        setattr(self, attr_name, None)
|
56
|
+
|
57
|
+
def get_model_cls(self, default_hyperparameters: Literal["td", "td_s"] = "td"):
    """
    Return the pytabkit RealMLP estimator class to train.

    A classifier is chosen when `self.problem_type` is 'binary' or 'multiclass', a regressor otherwise.
    `default_hyperparameters` selects between the "td" and "td_s" variants of that estimator.
    """
    from pytabkit import RealMLP_TD_Classifier, RealMLP_TD_Regressor, RealMLP_TD_S_Classifier, RealMLP_TD_S_Regressor

    assert default_hyperparameters in ["td", "td_s"]
    is_classification = self.problem_type in ['binary', 'multiclass']
    # Lookup keyed by (is_classification, variant).
    estimator_by_task = {
        (True, "td"): RealMLP_TD_Classifier,
        (True, "td_s"): RealMLP_TD_S_Classifier,
        (False, "td"): RealMLP_TD_Regressor,
        (False, "td_s"): RealMLP_TD_S_Regressor,
    }
    return estimator_by_task[(is_classification, default_hyperparameters)]
|
72
|
+
|
73
|
+
def _fit(
    self,
    X: pd.DataFrame,
    y: pd.Series,
    X_val: pd.DataFrame = None,
    y_val: pd.Series = None,
    time_limit: float = None,
    num_cpus: int = 1,
    num_gpus: float = 0,
    verbosity: int = 2,
    **kwargs,
):
    """
    Fit a pytabkit RealMLP estimator on the preprocessed training data.

    Parameters
    ----------
    X, y
        Training features and labels.
    X_val, y_val
        Optional validation data. When `X_val` is None, early stopping and the
        estimator's internal validation split are disabled.
    time_limit
        Optional budget in seconds. The part not already spent inside this method is
        forwarded to the estimator as `time_to_fit_in_seconds`.
    num_cpus
        Forwarded to the estimator as `n_threads`.
    num_gpus
        0 trains on "cpu"; any other value trains on "cuda:0".
    verbosity
        Selects the level applied to the "lightning.pytorch" logger while fitting.

    Raises
    ------
    ImportError
        If pytabkit or torch cannot be imported.
    AssertionError
        If a GPU was requested but CUDA is not available.
    """
    start_time = time.time()

    try:
        import pytabkit  # noqa: F401 - imported only to verify the optional dependency is present
        import torch
    except ImportError:
        logger.log(
            40,
            f"\tFailed to import pytabkit/torch! To use the RealMLP model, "
            f"do: `pip install autogluon.tabular[realmlp]=={__version__}`.",
        )
        raise

    if verbosity == 0:
        _lightning_log_level = logging.ERROR
    elif verbosity <= 2:
        _lightning_log_level = logging.WARNING
    else:
        _lightning_log_level = logging.INFO

    # FIXME: code assumes we only see one GPU in the fit process.
    device = "cpu" if num_gpus == 0 else "cuda:0"
    if (device == "cuda:0") and (not torch.cuda.is_available()):
        raise AssertionError(
            "Fit specified to use GPU, but CUDA is not available on this machine. "
            "Please switch to CPU usage instead.",
        )

    hyp = self._get_model_params()

    default_hyperparameters = hyp.pop("default_hyperparameters", "td")

    model_cls = self.get_model_cls(default_hyperparameters=default_hyperparameters)

    # Maps the stopping metric's name to the validation metric name passed to the estimator.
    # The MAE entry uses "mean_absolute_error"; the previous "mean_average_error" key was a typo.
    metric_map = {
        "roc_auc": "1-auc_ovr_alt",
        "accuracy": "class_error",
        "balanced_accuracy": "1-balanced_accuracy",
        "log_loss": "cross_entropy",
        "rmse": "rmse",
        "root_mean_squared_error": "rmse",
        "r2": "rmse",
        "mae": "mae",
        "mean_absolute_error": "mae",
    }

    val_metric_name = metric_map.get(self.stopping_metric.name, None)

    init_kwargs = dict()

    if val_metric_name is not None:
        init_kwargs["val_metric_name"] = val_metric_name

    # TODO: Make this smarter? Maybe use `eval_metric.needs_pred`
    use_ls = hyp.get("use_ls")
    if isinstance(use_ls, str) and use_ls == "auto":
        if val_metric_name is None:
            hyp["use_ls"] = False
        elif val_metric_name in ["cross_entropy", "1-auc_ovr_alt"]:
            hyp["use_ls"] = False
        else:
            hyp["use_ls"] = None

    if X_val is None:
        hyp["use_early_stopping"] = False
        hyp["val_fraction"] = 0

    # These keys configure the preprocessing in this wrapper and must not reach the estimator.
    bool_to_cat = hyp.pop("bool_to_cat", True)
    impute_bool = hyp.pop("impute_bool", True)
    name_categories = hyp.pop("name_categories", True)

    predict_batch_size = hyp.get("predict_batch_size")
    if isinstance(predict_batch_size, str) and predict_batch_size == "auto":
        # simple heuristic to avoid OOM during inference time
        # note: this isn't fool-proof, and ignores the actual memory availability of the machine.
        # note: this is based on an assumption of 32 GB of memory available on the instance
        # default is 1024
        n_features = max(len(X.columns), 1)  # guard against division by zero on a frame without columns
        hyp["predict_batch_size"] = max(min(int(8192 * 200 / n_features), 8192), 64)

    self.model = model_cls(
        n_threads=num_cpus,
        device=device,
        **init_kwargs,
        **hyp,
    )

    X = self.preprocess(X, is_train=True, bool_to_cat=bool_to_cat, impute_bool=impute_bool)

    # FIXME: In rare cases can cause exceptions if name_categories=False, unknown why
    extra_fit_kwargs = {}
    if name_categories:
        cat_col_names = X.select_dtypes(include='category').columns.tolist()
        extra_fit_kwargs["cat_col_names"] = cat_col_names

    if X_val is not None:
        X_val = self.preprocess(X_val)

    with set_logger_level("lightning.pytorch", _lightning_log_level):
        self.model = self.model.fit(
            X=X,
            y=y,
            X_val=X_val,
            y_val=y_val,
            time_to_fit_in_seconds=time_limit - (time.time() - start_time) if time_limit is not None else None,
            **extra_fit_kwargs,
        )
|
190
|
+
|
191
|
+
def _predict_proba(self, X, **kwargs) -> np.ndarray:
    """
    Predict with the parent implementation while holding the "lightning.pytorch" logger at WARNING.

    Extra keyword arguments are forwarded unchanged to the parent `_predict_proba`.
    """
    with set_logger_level("lightning.pytorch", logging.WARNING):
        # Unpack with ** so the caller's keyword arguments arrive as real keyword arguments;
        # `kwargs=kwargs` would wrap them all under a single "kwargs" key.
        return super()._predict_proba(X=X, **kwargs)
|
194
|
+
|
195
|
+
# TODO: Move missing indicator + mean fill to a generic preprocess flag available to all models
|
196
|
+
# FIXME: bool_to_cat is a hack: Maybe move to abstract model?
|
197
|
+
def _preprocess(self, X: pd.DataFrame, is_train: bool = False, bool_to_cat: bool = False, impute_bool: bool = True, **kwargs) -> pd.DataFrame:
    """
    Mean-impute int/float features, append missing-value indicator columns, and cast them to category.

    When `is_train` is True the feature lists and the imputer are (re)computed from the feature
    metadata and stored on the instance; later calls reuse that stored state.
    With `impute_bool=False`, features with the "bool" special type are left out of imputation.
    When `bool_to_cat` was True at training time, bool features are cast to category as well,
    which avoids them being treated as numerical by RealMLP.
    """
    X = super()._preprocess(X, **kwargs)

    # FIXME: is copy needed?
    X = X.copy(deep=True)

    if is_train:
        self._bool_to_cat = bool_to_cat
        self._features_bool = self._feature_metadata.get_features(required_special_types=["bool"])
        if impute_bool:  # Technically this should do nothing useful because bools will never have NaN
            self._features_to_impute = self._feature_metadata.get_features(valid_raw_types=["int", "float"])
            self._features_to_keep = self._feature_metadata.get_features(invalid_raw_types=["int", "float"])
        else:
            self._features_to_impute = self._feature_metadata.get_features(valid_raw_types=["int", "float"], invalid_special_types=["bool"])
            self._features_to_keep = [
                feature for feature in self._feature_metadata.get_features() if feature not in self._features_to_impute
            ]
        if self._features_to_impute:
            self._imputer = SimpleImputer(strategy="mean", add_indicator=True)
            self._imputer.fit(X=X[self._features_to_impute])
            # Output names that are not input features are the indicator columns added by the imputer.
            self._indicator_columns = [
                out_name for out_name in self._imputer.get_feature_names_out() if out_name not in self._features_to_impute
            ]

    if self._imputer is not None:
        imputed_values = self._imputer.transform(X=X[self._features_to_impute])
        imputed_frame = pd.DataFrame(imputed_values, index=X.index, columns=self._imputer.get_feature_names_out())
        if self._indicator_columns:
            # FIXME: Use CategoryFeatureGenerator? Or tell the model which is category
            # TODO: Add to features_bool?
            imputed_frame[self._indicator_columns] = imputed_frame[self._indicator_columns].astype("category")
        untouched_frame = X[self._features_to_keep]
        X = pd.concat([untouched_frame, imputed_frame], axis=1)

    if self._bool_to_cat and self._features_bool:
        # FIXME: Use CategoryFeatureGenerator? Or tell the model which is category
        X[self._features_bool] = X[self._features_bool].astype("category")

    return X
|
231
|
+
|
232
|
+
def _set_default_params(self):
    """Register the default hyperparameters through `_set_default_param_value`, one call per entry."""
    defaults = {
        "random_state": 0,
        # Don't use early stopping by default, seems to work well without
        "use_early_stopping": False,
        "early_stopping_additive_patience": 40,
        "early_stopping_multiplicative_patience": 3,
        # verdict: use_ls="auto" is much better than None.
        "use_ls": "auto",
        # verdict: no impact, but makes more sense to be False.
        "impute_bool": False,
        # verdict: name_categories=True avoids random exceptions being raised in rare cases
        "name_categories": True,
        # verdict: bool_to_cat=True is equivalent to False in terms of quality, but can be slightly faster in training time
        #  and slightly slower in inference time
        "bool_to_cat": True,
        # verdict: "td" is better than "td_s"
        "default_hyperparameters": "td",  # options ["td", "td_s"]
        # if auto, uses AutoGluon's heuristic to set a value between 8192 and 64.
        "predict_batch_size": "auto",
    }
    for name in defaults:
        self._set_default_param_value(name, defaults[name])
|
261
|
+
|
262
|
+
@classmethod
def supported_problem_types(cls) -> list[str] | None:
    """Return the problem types this model declares support for."""
    supported = ["binary", "multiclass", "regression"]
    return supported
|
265
|
+
|
266
|
+
def _get_default_stopping_metric(self):
    """Return the model's `eval_metric` as the default stopping metric."""
    stopping_metric = self.eval_metric
    return stopping_metric
|
268
|
+
|
269
|
+
def _get_default_resources(self) -> tuple[int, int]:
    """Return the default `(num_cpus, num_gpus)`: the physical core count and at most one GPU."""
    # only_physical_cores=True is faster in training
    physical_cores = ResourceManager.get_cpu_count(only_physical_cores=True)
    detected_gpus = ResourceManager.get_gpu_count_torch()
    return physical_cores, min(detected_gpus, 1)
|
274
|
+
|
275
|
+
def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
    """Delegate to `estimate_memory_usage_static` using this instance's problem type, class count and model params."""
    model_params = self._get_model_params()
    instance_kwargs = {
        "X": X,
        "problem_type": self.problem_type,
        "num_classes": self.num_classes,
        "hyperparameters": model_params,
    }
    # Unpack both mappings in the call so a duplicated keyword still raises TypeError.
    return self.estimate_memory_usage_static(**instance_kwargs, **kwargs)
|
278
|
+
|
279
|
+
@classmethod
def _estimate_memory_usage_static(
    cls,
    *,
    X: pd.DataFrame,
    hyperparameters: dict = None,
    **kwargs,
) -> int:
    """
    Heuristic memory estimate that correlates strongly with RealMLP's more sophisticated method.

    The estimate is the sum of three terms: 5x the approximate DataFrame memory usage,
    a per-column term scaled by `plr_hidden_1`, `plr_hidden_2` and `hidden_width`,
    and a fixed 300 MB overhead.

    More comprehensive memory estimate logic:

    ```python
    import copy
    from typing import Any

    from pytabkit.models.alg_interfaces.nn_interfaces import NNAlgInterface
    from pytabkit.models.data.data import DictDataset, TensorInfo
    from pytabkit.models.sklearn.default_params import DefaultParams

    def estimate_realmlp_cpu_ram_gb(hparams: dict[str, Any], n_numerical: int, cat_sizes: list[int], n_classes: int,
                                    n_samples: int):
        params = copy.copy(DefaultParams.RealMLP_TD_CLASS if n_classes > 0 else DefaultParams.RealMLP_TD_REG)
        params.update(hparams)

        ds = DictDataset(tensors=None, tensor_infos=dict(x_cont=TensorInfo(feat_shape=[n_numerical]),
                                                         x_cat=TensorInfo(cat_sizes=cat_sizes),
                                                         y=TensorInfo(cat_sizes=[n_classes])), device='cpu',
                         n_samples=n_samples)

        alg_interface = NNAlgInterface(**params)
        res = alg_interface.get_required_resources(ds, n_cv=1, n_refit=0, n_splits=1, split_seeds=[0], n_train=n_samples)
        return res.cpu_ram_gb
    ```

    """
    params = {} if hyperparameters is None else hyperparameters
    plr_hidden_1 = params.get("plr_hidden_1", 16)
    plr_hidden_2 = params.get("plr_hidden_2", 4)
    hidden_width = params.get("hidden_width", 256)

    per_column_base = len(X.columns) * 8e5
    width_factor = math.sqrt(hidden_width / 256 + 0.6)

    # Same operation order as the reference formula so the floating-point result is unchanged.
    hidden_1_term = per_column_base * 0.13 * plr_hidden_1 / 16 * width_factor
    hidden_2_term = per_column_base * 0.42 * plr_hidden_2 / 16 * width_factor
    column_term = hidden_1_term + hidden_2_term

    data_term = 5 * get_approximate_df_mem_usage(X).sum()  # roughly 5x DataFrame memory size
    fixed_overhead = 3e8  # 300 MB generic overhead

    return data_term + column_term + fixed_overhead
|
338
|
+
|
339
|
+
@classmethod
def _class_tags(cls) -> dict:
    """Return the class-level tags; this model declares a static memory usage estimate."""
    class_tags = dict(can_estimate_memory_usage_static=True)
    return class_tags
|
342
|
+
|
343
|
+
def _more_tags(self) -> dict:
    """Return the instance-level tags; refit_full is declared unsupported."""
    # TODO: Need to add train params support, track best epoch
    #  How to mirror RealMLP learning rate scheduler while forcing stopping at a specific epoch?
    return {"can_refit_full": False}
|
@@ -309,8 +309,9 @@ class RFModel(AbstractModel):
|
|
309
309
|
if self.model.n_outputs_ == 1:
|
310
310
|
self.model.n_classes_ = [self.model.n_classes_]
|
311
311
|
from sklearn.tree._tree import DOUBLE, DTYPE
|
312
|
+
from sklearn.utils.validation import check_X_y
|
312
313
|
|
313
|
-
X, y =
|
314
|
+
X, y = check_X_y(X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE)
|
314
315
|
if y.ndim == 1:
|
315
316
|
# reshape is necessary to preserve the data contiguity against vs
|
316
317
|
# [:, np.newaxis] that does not.
|
File without changes
|
@@ -0,0 +1,174 @@
|
|
1
|
+
"""
|
2
|
+
Code Adapted from TabArena: https://github.com/autogluon/tabrepo/blob/main/tabrepo/benchmark/models/ag/tabicl/tabicl_model.py
|
3
|
+
Model: TabICL
|
4
|
+
Paper: TabICL: A Tabular Foundation Model for In-Context Learning on Large Data
|
5
|
+
Authors: Jingang Qu, David Holzmüller, Gaël Varoquaux, Marine Le Morvan
|
6
|
+
Codebase: https://github.com/soda-inria/tabicl
|
7
|
+
License: BSD-3-Clause
|
8
|
+
"""
|
9
|
+
|
10
|
+
from __future__ import annotations
|
11
|
+
|
12
|
+
import logging
|
13
|
+
|
14
|
+
import pandas as pd
|
15
|
+
|
16
|
+
from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
|
17
|
+
from autogluon.common.utils.resource_utils import ResourceManager
|
18
|
+
from autogluon.core.models import AbstractModel
|
19
|
+
from autogluon.tabular import __version__
|
20
|
+
|
21
|
+
logger = logging.getLogger(__name__)
|
22
|
+
|
23
|
+
|
24
|
+
# TODO: Verify if crashes when weights are not yet downloaded and fit in parallel
|
25
|
+
class TabICLModel(AbstractModel):
|
26
|
+
ag_key = "TABICL"
|
27
|
+
ag_name = "TabICL"
|
28
|
+
ag_priority = 65
|
29
|
+
|
30
|
+
def get_model_cls(self):
    """
    Return the tabicl estimator class for this model's problem type.

    Raises AssertionError when `self.problem_type` is neither "binary" nor "multiclass".
    """
    from tabicl import TabICLClassifier

    # Guard clause: only classification is handled.
    if self.problem_type not in ["binary", "multiclass"]:
        raise AssertionError(f"Unsupported problem_type: {self.problem_type}")
    return TabICLClassifier
|
38
|
+
|
39
|
+
@staticmethod
def _get_batch_size(n_cells: int):
    """Return the batch size for a dataset of `n_cells` cells: 8 up to 4M cells, 4 up to 6M cells, otherwise 2."""
    # (inclusive upper bound on cell count, batch size), checked in ascending order.
    thresholds = ((4_000_000, 8), (6_000_000, 4))
    for cell_limit, batch_size in thresholds:
        if n_cells <= cell_limit:
            return batch_size
    return 2
|
47
|
+
|
48
|
+
def _fit(
    self,
    X: pd.DataFrame,
    y: pd.Series,
    num_cpus: int = 1,
    num_gpus: int = 0,
    **kwargs,
):
    """
    Fit a TabICL estimator on the preprocessed training data.

    `num_gpus` equal to 0 selects the "cpu" device, anything else "cuda"; `num_cpus` is passed as `n_jobs`.
    When no `batch_size` hyperparameter is set, it is derived from the number of cells in `X`.

    Raises ImportError if tabicl cannot be imported, and AssertionError if a GPU is requested
    but CUDA is not available.
    """
    try:
        import tabicl
    except ImportError as import_error:
        logger.log(
            40,
            f"\tFailed to import tabicl! To use the TabICL model, "
            f"do: `pip install autogluon.tabular[tabicl]=={__version__}`.",
        )
        raise import_error

    from torch.cuda import is_available

    if num_gpus != 0:
        device = "cuda"
    else:
        device = "cpu"
    if (device == "cuda") and (not is_available()):
        # FIXME: warn instead and switch to CPU.
        raise AssertionError(
            "Fit specified to use GPU, but CUDA is not available on this machine. "
            "Please switch to CPU usage instead.",
        )

    estimator_cls = self.get_model_cls()
    hyp = self._get_model_params()
    n_cells = X.shape[0] * X.shape[1]
    # A user-provided batch_size wins; otherwise use the cell-count heuristic.
    hyp.setdefault("batch_size", self._get_batch_size(n_cells))
    self.model = estimator_cls(
        **hyp,
        device=device,
        n_jobs=num_cpus,
    )
    X = self.preprocess(X)
    self.model = self.model.fit(
        X=X,
        y=y,
    )
|
89
|
+
|
90
|
+
def _set_default_params(self):
    """Register the default hyperparameters; only `random_state` (42) is set."""
    self._set_default_param_value("random_state", 42)
|
96
|
+
|
97
|
+
def _get_default_auxiliary_params(self) -> dict:
    """Return the parent's default auxiliary params with `max_rows` set to 100000 and `max_features` to 500."""
    aux_params = super()._get_default_auxiliary_params()
    aux_params["max_rows"] = 100000
    aux_params["max_features"] = 500
    return aux_params
|
106
|
+
|
107
|
+
@classmethod
def supported_problem_types(cls) -> list[str] | None:
    """Return the problem types this model declares support for (classification only)."""
    supported = ["binary", "multiclass"]
    return supported
|
110
|
+
|
111
|
+
def _get_default_resources(self) -> tuple[int, int]:
    """Return the default `(num_cpus, num_gpus)`: the physical core count and at most one GPU."""
    physical_cores = ResourceManager.get_cpu_count(only_physical_cores=True)
    detected_gpus = ResourceManager.get_gpu_count_torch()
    return physical_cores, min(detected_gpus, 1)
|
115
|
+
|
116
|
+
def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
    """Delegate to `estimate_memory_usage_static` using this instance's problem type, class count and model params."""
    model_params = self._get_model_params()
    instance_kwargs = {
        "X": X,
        "problem_type": self.problem_type,
        "num_classes": self.num_classes,
        "hyperparameters": model_params,
    }
    # Unpack both mappings in the call so a duplicated keyword still raises TypeError.
    return self.estimate_memory_usage_static(**instance_kwargs, **kwargs)
|
119
|
+
|
120
|
+
@classmethod
def _estimate_memory_usage_static(
    cls,
    *,
    X: pd.DataFrame,
    hyperparameters: dict = None,
    **kwargs,
) -> int:
    """
    Heuristic memory estimate that is very primitive.
    Can be vastly improved.

    The estimate is the sum of three terms: 3x the approximate DataFrame memory usage,
    a fixed 1 GB overhead, and a model term that grows with batch size, row count and feature count.
    """
    params = {} if hyperparameters is None else hyperparameters

    data_term = 3 * get_approximate_df_mem_usage(X).sum()  # roughly 3x DataFrame memory size
    fixed_overhead = 1e9  # 1 GB generic overhead

    n_rows, n_features = X.shape[0], X.shape[1]
    batch_size = params.get("batch_size", cls._get_batch_size(X.shape[0] * X.shape[1]))
    embedding_dim = 128
    bytes_per_float = 4
    model_term = 2 * batch_size * embedding_dim * bytes_per_float * (4 + n_rows) * n_features

    # add 30% buffer
    model_term *= 1.3

    # TODO: Observed memory spikes above expected values on large datasets, increasing mem estimate to compensate
    model_term *= 1.5

    return model_term + data_term + fixed_overhead
|
153
|
+
|
154
|
+
@classmethod
def _get_default_ag_args_ensemble(cls, **kwargs) -> dict:
    """
    Set fold_fitting_strategy to sequential_local,
    as parallel folding crashes if model weights aren't pre-downloaded.

    Also enables `refit_folds`. Both values overwrite whatever the parent defaults contain.
    """
    ensemble_args = super()._get_default_ag_args_ensemble(**kwargs)
    # FIXME: If parallel, uses way more memory, seems to behave incorrectly, so we force sequential.
    ensemble_args["fold_fitting_strategy"] = "sequential_local"
    # Better to refit the model for faster inference and similar quality as the bag.
    ensemble_args["refit_folds"] = True
    return ensemble_args
|
168
|
+
|
169
|
+
@classmethod
def _class_tags(cls) -> dict:
    """Return the class-level tags; this model declares a static memory usage estimate."""
    class_tags = dict(can_estimate_memory_usage_static=True)
    return class_tags
|
172
|
+
|
173
|
+
def _more_tags(self) -> dict:
    """Return the instance-level tags; refit_full is declared supported."""
    instance_tags = dict(can_refit_full=True)
    return instance_tags
|
File without changes
|