autogluon.tabular 1.4.1b20251014__py3-none-any.whl → 1.5.0b20251222__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autogluon/tabular/configs/hyperparameter_configs.py +4 -0
- autogluon/tabular/configs/presets_configs.py +39 -2
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +2 -44
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_cpu_2025_12_18.py +2 -0
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_gpu_2025_12_18.py +2 -0
- autogluon/tabular/learner/default_learner.py +1 -0
- autogluon/tabular/models/__init__.py +3 -1
- autogluon/tabular/models/abstract/__init__.py +0 -0
- autogluon/tabular/models/abstract/abstract_torch_model.py +148 -0
- autogluon/tabular/models/catboost/catboost_model.py +2 -5
- autogluon/tabular/models/ebm/ebm_model.py +2 -6
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +9 -3
- autogluon/tabular/models/lgb/lgb_model.py +60 -17
- autogluon/tabular/models/lgb/lgb_utils.py +2 -2
- autogluon/tabular/models/lr/lr_model.py +2 -4
- autogluon/tabular/models/lr/lr_preprocessing_utils.py +6 -7
- autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +14 -1
- autogluon/tabular/models/mitra/mitra_model.py +55 -29
- autogluon/tabular/models/realmlp/realmlp_model.py +8 -5
- autogluon/tabular/models/rf/rf_model.py +6 -8
- autogluon/tabular/models/tabdpt/__init__.py +0 -0
- autogluon/tabular/models/tabdpt/tabdpt_model.py +253 -0
- autogluon/tabular/models/tabicl/tabicl_model.py +15 -5
- autogluon/tabular/models/tabm/tabm_model.py +25 -8
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +7 -5
- autogluon/tabular/models/tabpfnv2/tabpfnv2_5_model.py +451 -0
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +87 -12
- autogluon/tabular/models/tabprep/__init__.py +0 -0
- autogluon/tabular/models/tabprep/prep_lgb_model.py +21 -0
- autogluon/tabular/models/tabprep/prep_mixin.py +220 -0
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +3 -6
- autogluon/tabular/models/tabular_nn/utils/data_preprocessor.py +12 -4
- autogluon/tabular/models/xgboost/xgboost_model.py +3 -4
- autogluon/tabular/predictor/predictor.py +50 -20
- autogluon/tabular/registry/_ag_model_registry.py +8 -2
- autogluon/tabular/testing/fit_helper.py +61 -0
- autogluon/tabular/trainer/abstract_trainer.py +45 -9
- autogluon/tabular/trainer/auto_trainer.py +5 -0
- autogluon/tabular/version.py +1 -1
- autogluon.tabular-1.5.0b20251222-py3.11-nspkg.pth +1 -0
- {autogluon.tabular-1.4.1b20251014.dist-info → autogluon_tabular-1.5.0b20251222.dist-info}/METADATA +97 -87
- {autogluon.tabular-1.4.1b20251014.dist-info → autogluon_tabular-1.5.0b20251222.dist-info}/RECORD +48 -38
- {autogluon.tabular-1.4.1b20251014.dist-info → autogluon_tabular-1.5.0b20251222.dist-info}/WHEEL +1 -1
- autogluon.tabular-1.4.1b20251014-py3.9-nspkg.pth +0 -1
- {autogluon.tabular-1.4.1b20251014.dist-info → autogluon_tabular-1.5.0b20251222.dist-info/licenses}/LICENSE +0 -0
- {autogluon.tabular-1.4.1b20251014.dist-info → autogluon_tabular-1.5.0b20251222.dist-info/licenses}/NOTICE +0 -0
- {autogluon.tabular-1.4.1b20251014.dist-info → autogluon_tabular-1.5.0b20251222.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.4.1b20251014.dist-info → autogluon_tabular-1.5.0b20251222.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.4.1b20251014.dist-info → autogluon_tabular-1.5.0b20251222.dist-info}/zip-safe +0 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Type
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from autogluon.features import ArithmeticFeatureGenerator
|
|
10
|
+
from autogluon.features import CategoricalInteractionFeatureGenerator
|
|
11
|
+
from autogluon.features import OOFTargetEncodingFeatureGenerator
|
|
12
|
+
from autogluon.features import BulkFeatureGenerator
|
|
13
|
+
from autogluon.features.generators.abstract import AbstractFeatureGenerator
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
# TODO: In future we can have a feature generator registry like what is done for models
|
|
18
|
+
_feature_generator_class_lst = [
|
|
19
|
+
ArithmeticFeatureGenerator,
|
|
20
|
+
CategoricalInteractionFeatureGenerator,
|
|
21
|
+
OOFTargetEncodingFeatureGenerator,
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
_feature_generator_class_map = {
|
|
25
|
+
feature_generator_cls.__name__: feature_generator_cls for feature_generator_cls in _feature_generator_class_lst
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _recursive_expand_prep_param(prep_param: tuple | list[list | tuple]) -> list[tuple]:
|
|
30
|
+
if isinstance(prep_param, list):
|
|
31
|
+
if len(prep_param) == 0:
|
|
32
|
+
param_type = "list"
|
|
33
|
+
elif len(prep_param) == 2:
|
|
34
|
+
if isinstance(prep_param[0], (str, AbstractFeatureGenerator)):
|
|
35
|
+
param_type = "generator"
|
|
36
|
+
else:
|
|
37
|
+
param_type = "list"
|
|
38
|
+
else:
|
|
39
|
+
param_type = "list"
|
|
40
|
+
elif isinstance(prep_param, tuple):
|
|
41
|
+
param_type = "generator"
|
|
42
|
+
else:
|
|
43
|
+
raise ValueError(f"Invalid value for prep_param: {prep_param}")
|
|
44
|
+
if param_type == "list":
|
|
45
|
+
out = []
|
|
46
|
+
for p in prep_param:
|
|
47
|
+
out += _recursive_expand_prep_param(p)
|
|
48
|
+
return out
|
|
49
|
+
elif param_type == "generator":
|
|
50
|
+
return [prep_param]
|
|
51
|
+
else:
|
|
52
|
+
raise ValueError(f"Invalid value for prep_param: {prep_param}")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# FIXME: Why is preprocessing twice as slow per fold when bagging LightGBM??? Need to investigate. Try sequential fold fit
|
|
56
|
+
# TODO: Why is `prep_params` a dict instead of a list?
|
|
57
|
+
class ModelAgnosticPrepMixin:
    """Mixin that adds model-agnostic feature preprocessing driven by ``prep_params``.

    The mixin reads the ``prep_params`` entry of ``self._get_ag_params()``, builds the
    requested feature generators, fits/applies them inside ``_preprocess`` and then
    delegates to ``super()._preprocess``. It also adjusts memory estimation to account
    for the number of features expected after preprocessing.

    The host class is expected to provide ``_get_ag_params``, ``_get_model_params``,
    ``problem_type``, ``num_classes``, ``random_seed``, ``_feature_metadata`` and either
    ``_estimate_memory_usage_static_lite`` or ``estimate_memory_usage_static``.
    """

    @staticmethod
    def _is_generator_identifier(value) -> bool:
        """Return True if ``value`` can be the first element of a generator spec.

        Accepts a string name, an ``AbstractFeatureGenerator`` subclass, or an
        ``AbstractFeatureGenerator`` instance.
        """
        if isinstance(value, str):
            return True
        if isinstance(value, type):
            return issubclass(value, AbstractFeatureGenerator)
        return isinstance(value, AbstractFeatureGenerator)

    def _estimate_dtypes_after_preprocessing(self, X: pd.DataFrame, **kwargs) -> tuple[int, int, int]:
        """Estimate how many numeric / categorical / binary features exist after preprocessing.

        Parameters
        ----------
        X
            The input data before the ``prep_params`` generators are applied.
        **kwargs
            Accepted for signature compatibility; unused.

        Returns
        -------
        tuple[int, int, int]
            ``(n_numeric, n_categorical, n_binary)`` after passing the counts of ``X``
            through ``estimate_new_dtypes`` of every configured generator, in order.

        Raises
        ------
        ValueError
            If a generator is referenced by a name that is not registered in
            ``_feature_generator_class_map``.
        """
        prep_params = self._get_ag_params().get("prep_params", None)
        if prep_params is None:
            prep_params = []

        # FIXME: Temporarily simplify for memory calculation
        prep_params = _recursive_expand_prep_param(prep_params)

        X_nunique = X.nunique().values
        n_categorical = X.select_dtypes(exclude=[np.number]).shape[1]
        n_numeric = X.loc[:, X_nunique > 2].select_dtypes(include=[np.number]).shape[1]
        # NOTE: It can happen that features have less than two unique values if cleaning
        # is applied before the bagging, i.e. Bioresponse
        n_binary = X.loc[:, X_nunique <= 2].select_dtypes(include=[np.number]).shape[1]

        assert n_numeric + n_categorical + n_binary == X.shape[1]  # NOTE: For debugging, to be removed later
        for preprocessor_cls, init_params in prep_params:
            if isinstance(preprocessor_cls, str):
                # Resolve names through the shared registry so this stays in sync with
                # `_init_preprocessor` instead of hard-coding each generator name.
                if preprocessor_cls not in _feature_generator_class_map:
                    raise ValueError(f"Unknown preprocessor class name: {preprocessor_cls}")
                preprocessor_cls = _feature_generator_class_map[preprocessor_cls]
            _init_params = {"target_type": self.problem_type}
            if init_params:  # `None` is a valid "no extra params" value, as in `_init_preprocessor`
                _init_params.update(init_params)
            prep_cls = preprocessor_cls(**_init_params)
            n_numeric, n_categorical, n_binary = prep_cls.estimate_new_dtypes(
                n_numeric, n_categorical, n_binary, num_classes=self.num_classes
            )

        return n_numeric, n_categorical, n_binary

    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
        """Estimate the model's memory usage on the data shape expected after preprocessing.

        If the host class defines ``_estimate_memory_usage_static_lite``, it is called with
        the estimated feature count. Otherwise a synthetic DataFrame with the estimated
        number of numeric (float32), categorical (str) and binary (int8) columns is built
        and passed to ``estimate_memory_usage_static``.

        Parameters
        ----------
        X
            The input data before preprocessing.
        **kwargs
            Forwarded to the dtype estimation and to the underlying static estimator.

        Returns
        -------
        int
            Whatever the underlying static estimator returns.
        """
        hyperparameters = self._get_model_params()
        n_numeric, n_categorical, n_binary = self._estimate_dtypes_after_preprocessing(X=X, **kwargs)

        if hasattr(self, "_estimate_memory_usage_static_lite"):
            return self._estimate_memory_usage_static_lite(
                num_samples=X.shape[0],
                num_features=n_numeric + n_categorical + n_binary,
                num_bytes_per_cell=4,
                hyperparameters=hyperparameters,
                problem_type=self.problem_type,
                num_classes=self.num_classes,
                **kwargs,
            )

        # TODO: Replace with memory estimation logic based on no. of features instead of dataframe generation
        shape = X.shape[0]
        # Local, seeded generator: the values are placeholders, so there is no reason to
        # consume (and thereby alter) the process-wide `np.random` state, and a fixed
        # seed keeps the estimate reproducible.
        rng = np.random.default_rng(0)
        df_lst = []
        if n_numeric > 0:
            X_estimate = rng.random(size=(shape, n_numeric), dtype=np.float32)
            df_lst.append(pd.DataFrame(X_estimate))
        if n_categorical > 0:
            cat_nunique = X.select_dtypes(exclude=[np.number]).nunique()
            mean_cardinality = cat_nunique.mean() if len(cat_nunique) > 0 else float("nan")
            # The mean is NaN when `X` itself has no categorical columns and can be 0
            # when they are entirely missing; both would crash below. Fall back to the
            # smallest valid cardinality in that case.
            cardinality = int(mean_cardinality) if np.isfinite(mean_cardinality) else 1
            cardinality = max(cardinality, 1)
            X_estimate = rng.integers(0, cardinality, size=(shape, n_categorical)).astype("str")
            df_lst.append(pd.DataFrame(X_estimate))
        if n_binary > 0:
            X_estimate = rng.integers(0, 2, size=(shape, n_binary), dtype=np.int8)
            df_lst.append(pd.DataFrame(X_estimate))
        if df_lst:
            X = pd.concat(df_lst, ignore_index=True, axis=1)
        else:
            # `pd.concat` raises on an empty list; represent "no features" explicitly.
            X = pd.DataFrame(index=pd.RangeIndex(shape))

        return self.estimate_memory_usage_static(
            X=X,
            problem_type=self.problem_type,
            num_classes=self.num_classes,
            hyperparameters=hyperparameters,
            **kwargs,
        )

    def _init_preprocessor(
        self,
        preprocessor_cls: Type[AbstractFeatureGenerator] | str,
        init_params: dict | None,
    ) -> AbstractFeatureGenerator:
        """Instantiate a single feature generator.

        Parameters
        ----------
        preprocessor_cls
            The generator class, or its name as registered in
            ``_feature_generator_class_map``.
        init_params
            Keyword arguments for the generator constructor. They override the defaults
            set here (``verbosity``, ``random_state``, ``target_type``). ``None`` means
            no extra arguments.

        Returns
        -------
        AbstractFeatureGenerator
            The constructed (unfitted) generator.
        """
        if isinstance(preprocessor_cls, str):
            preprocessor_cls = _feature_generator_class_map[preprocessor_cls]
        if init_params is None:
            init_params = {}
        _init_params = dict(
            verbosity=0,
            random_state=self.random_seed,  # FIXME: Not a generic param
            target_type=self.problem_type,  # FIXME: Not a generic param
        )
        _init_params.update(**init_params)
        return preprocessor_cls(**_init_params)

    def _recursive_init_preprocessors(self, prep_param: tuple | list[list | tuple]):
        """Instantiate the generators described by ``prep_param``, preserving its nesting.

        A tuple, or a 2-element list whose first element is a generator name, class or
        instance, is a single generator spec ``(generator, init_params)`` and yields one
        generator. Any other list is a container and yields a list with one entry per
        element.

        Raises
        ------
        ValueError
            If ``prep_param`` (or a nested element) is neither a list nor a tuple.
        """
        if isinstance(prep_param, tuple):
            is_generator_spec = True
        elif isinstance(prep_param, list):
            # Classes must be recognised here too, otherwise `[GeneratorClass, {...}]`
            # would be treated as a container and its elements rejected.
            is_generator_spec = len(prep_param) == 2 and self._is_generator_identifier(prep_param[0])
        else:
            raise ValueError(f"Invalid value for prep_param: {prep_param}")

        if not is_generator_spec:
            return [self._recursive_init_preprocessors(p) for p in prep_param]

        assert len(prep_param) == 2
        preprocessor_cls, init_params = prep_param
        return self._init_preprocessor(preprocessor_cls=preprocessor_cls, init_params=init_params)

    def get_preprocessors(self) -> list[AbstractFeatureGenerator]:
        """Build the list of preprocessors configured through ``prep_params``.

        Returns
        -------
        list[AbstractFeatureGenerator]
            ``[]`` when ``prep_params`` is unset or empty; the single generator when
            exactly one top-level generator is configured; otherwise a one-element list
            holding a ``BulkFeatureGenerator`` that wraps all configured generators.
        """
        ag_params = self._get_ag_params()
        prep_params = ag_params.get("prep_params", None)
        passthrough_types = ag_params.get("prep_params.passthrough_types", None)
        if not prep_params:  # covers both `None` and empty containers
            return []

        preprocessors = self._recursive_init_preprocessors(prep_param=prep_params)
        if not isinstance(preprocessors, list):
            # `prep_params` was a single generator spec, so a bare generator came back;
            # normalise to a list before the length checks below.
            preprocessors = [preprocessors]
        if len(preprocessors) == 0:
            return []
        if len(preprocessors) == 1 and isinstance(preprocessors[0], AbstractFeatureGenerator):
            return preprocessors

        return [
            BulkFeatureGenerator(
                generators=preprocessors,
                # TODO: "false_recursive" technically can slow down inference, but need to optimize `True` first
                #  Refer to `Bioresponse` dataset where setting to `True` -> 200s fit time vs `false_recursive` -> 1s fit time
                remove_unused_features="false_recursive",
                post_drop_duplicates=True,
                passthrough=True,
                passthrough_types=passthrough_types,
                verbosity=0,
            )
        ]

    def _preprocess(self, X: pd.DataFrame, y=None, is_train: bool = False, **kwargs):
        """Apply the configured preprocessors to ``X``, then defer to the parent ``_preprocess``.

        When ``is_train`` is True the preprocessors are (re)created and fitted on
        ``(X, y)``, and ``self._feature_metadata`` / ``self._features_internal`` are
        replaced by the metadata of the last fitted preprocessor. Otherwise the
        previously fitted ``self.preprocessors`` are applied with ``transform``.

        Parameters
        ----------
        X
            Data to preprocess.
        y
            Target values; required when ``is_train`` is True and preprocessors exist.
        is_train
            Whether this call should fit the preprocessors.
        **kwargs
            Forwarded to ``super()._preprocess``.
        """
        if is_train:
            self.preprocessors = self.get_preprocessors()
            if self.preprocessors:
                assert y is not None, f"y must be specified to fit preprocessors... Likely the inheriting class isn't passing `y` in its `preprocess` call."
                # FIXME: add `post_drop_useless`, example: anneal has many useless features
                feature_metadata_in = self._feature_metadata
                for prep in self.preprocessors:
                    X = prep.fit_transform(X, y, feature_metadata_in=feature_metadata_in)
                    # FIXME: Nick: This is incorrect because it strips away special dtypes. Need to do this properly by fixing in the preprocessors
                    feature_metadata_in = prep.feature_metadata
                self._feature_metadata = feature_metadata_in
                self._features_internal = self._feature_metadata.get_features()
        else:
            for prep in self.preprocessors:
                X = prep.transform(X)

        return super()._preprocess(X, y=y, is_train=is_train, **kwargs)
|
|
@@ -50,6 +50,7 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
|
|
|
50
50
|
ag_key = "NN_TORCH"
|
|
51
51
|
ag_name = "NeuralNetTorch"
|
|
52
52
|
ag_priority = 25
|
|
53
|
+
seed_name = "seed_value"
|
|
53
54
|
|
|
54
55
|
# Constants used throughout this class:
|
|
55
56
|
unique_category_str = np.nan # string used to represent missing values and unknown categories for categorical features.
|
|
@@ -164,9 +165,6 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
|
|
|
164
165
|
|
|
165
166
|
return processor_kwargs, optimizer_kwargs, fit_kwargs, loss_kwargs, params
|
|
166
167
|
|
|
167
|
-
def _get_random_seed_from_hyperparameters(self, hyperparameters: dict) -> int | None | str:
|
|
168
|
-
return hyperparameters.get("seed_value", "N/A")
|
|
169
|
-
|
|
170
168
|
def _fit(
|
|
171
169
|
self,
|
|
172
170
|
X: pd.DataFrame,
|
|
@@ -194,7 +192,7 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
|
|
|
194
192
|
|
|
195
193
|
processor_kwargs, optimizer_kwargs, fit_kwargs, loss_kwargs, params = self._prepare_params(params=params)
|
|
196
194
|
|
|
197
|
-
seed_value = self.
|
|
195
|
+
seed_value = params.pop(self.seed_name, self.default_random_seed)
|
|
198
196
|
|
|
199
197
|
self._num_cpus_infer = params.pop("_num_cpus_infer", 1)
|
|
200
198
|
if seed_value is not None: # Set seeds
|
|
@@ -373,7 +371,6 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
|
|
|
373
371
|
best_epoch = 0
|
|
374
372
|
best_val_metric = -np.inf # higher = better
|
|
375
373
|
best_val_update = 0
|
|
376
|
-
val_improve_epoch = 0 # most recent epoch where validation-score strictly improved
|
|
377
374
|
start_fit_time = time.time()
|
|
378
375
|
if time_limit is not None:
|
|
379
376
|
time_limit = time_limit - (start_fit_time - start_time)
|
|
@@ -498,7 +495,7 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
|
|
|
498
495
|
|
|
499
496
|
if time_limit is not None:
|
|
500
497
|
time_elapsed = time.time() - start_fit_time
|
|
501
|
-
time_epoch_average = time_elapsed / (epoch
|
|
498
|
+
time_epoch_average = time_elapsed / max(epoch, 1) # avoid divide by 0
|
|
502
499
|
time_left = time_limit - time_elapsed
|
|
503
500
|
if time_left < time_epoch_average:
|
|
504
501
|
logger.log(20, f"\tRan out of time, stopping training early. (Stopping on epoch {epoch})")
|
|
@@ -37,10 +37,18 @@ def create_preprocessor(
|
|
|
37
37
|
steps=[("ordinal", OrdinalMergeRaresHandleUnknownEncoder(max_levels=max_category_levels))]
|
|
38
38
|
) # returns 0-n when max_category_levels = n-1. category n is reserved for unknown test-time categories.
|
|
39
39
|
transformers.append(("ordinal", ordinal_transformer, embed_features))
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
40
|
+
try:
|
|
41
|
+
out = ColumnTransformer(
|
|
42
|
+
transformers=transformers, remainder="passthrough", force_int_remainder_cols=False,
|
|
43
|
+
) # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same.
|
|
44
|
+
except:
|
|
45
|
+
# TODO: Avoid try/except once scikit-learn 1.5 is minimum
|
|
46
|
+
# Needed for scikit-learn 1.4 and 1.9+, force_int_remainder_cols is deprecated in 1.7 and introduced in 1.5
|
|
47
|
+
# ref: https://github.com/autogluon/autogluon/issues/5289
|
|
48
|
+
out = ColumnTransformer(
|
|
49
|
+
transformers=transformers, remainder="passthrough",
|
|
50
|
+
) # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same.
|
|
51
|
+
return out
|
|
44
52
|
|
|
45
53
|
def convert_df_dtype_to_str(df):
|
|
46
54
|
return df.astype(str)
|
|
@@ -32,6 +32,7 @@ class XGBoostModel(AbstractModel):
|
|
|
32
32
|
ag_key = "XGB"
|
|
33
33
|
ag_name = "XGBoost"
|
|
34
34
|
ag_priority = 40
|
|
35
|
+
seed_name = "seed"
|
|
35
36
|
|
|
36
37
|
def __init__(self, **kwargs):
|
|
37
38
|
super().__init__(**kwargs)
|
|
@@ -75,15 +76,11 @@ class XGBoostModel(AbstractModel):
|
|
|
75
76
|
|
|
76
77
|
return X
|
|
77
78
|
|
|
78
|
-
def _get_random_seed_from_hyperparameters(self, hyperparameters: dict) -> int | None | str:
|
|
79
|
-
return hyperparameters.get("seed", "N/A")
|
|
80
|
-
|
|
81
79
|
def _fit(self, X, y, X_val=None, y_val=None, time_limit=None, num_gpus=0, num_cpus=None, sample_weight=None, sample_weight_val=None, verbosity=2, **kwargs):
|
|
82
80
|
# TODO: utilize sample_weight_val in early-stopping if provided
|
|
83
81
|
start_time = time.time()
|
|
84
82
|
ag_params = self._get_ag_params()
|
|
85
83
|
params = self._get_model_params()
|
|
86
|
-
params["seed"] = self.random_seed
|
|
87
84
|
generate_curves = ag_params.get("generate_curves", False)
|
|
88
85
|
|
|
89
86
|
if generate_curves:
|
|
@@ -125,6 +122,8 @@ class XGBoostModel(AbstractModel):
|
|
|
125
122
|
if eval_metric is not None:
|
|
126
123
|
params["eval_metric"] = eval_metric
|
|
127
124
|
eval_metric_name = eval_metric.__name__ if not isinstance(eval_metric, str) else eval_metric
|
|
125
|
+
else:
|
|
126
|
+
eval_metric_name = params["eval_metric"].__name__ if not isinstance(params["eval_metric"], str) else params["eval_metric"]
|
|
128
127
|
|
|
129
128
|
if X_val is None:
|
|
130
129
|
early_stopping_rounds = None
|
|
@@ -19,7 +19,10 @@ from packaging import version
|
|
|
19
19
|
from autogluon.common import FeatureMetadata, TabularDataset
|
|
20
20
|
from autogluon.common.loaders import load_json
|
|
21
21
|
from autogluon.common.savers import save_json
|
|
22
|
+
from autogluon.common.utils.cv_splitter import CVSplitter
|
|
23
|
+
from autogluon.common.utils.decorators import apply_presets
|
|
22
24
|
from autogluon.common.utils.file_utils import get_directory_size, get_directory_size_per_file
|
|
25
|
+
from autogluon.common.utils.resource_utils import ResourceManager, get_resource_manager
|
|
23
26
|
from autogluon.common.utils.hyperparameter_utils import get_hyperparameter_str_deprecation_msg, is_advanced_hyperparameter_format
|
|
24
27
|
from autogluon.common.utils.log_utils import add_log_to_file, set_logger_verbosity, warn_if_mlflow_autologging_is_enabled
|
|
25
28
|
from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
|
|
@@ -45,10 +48,9 @@ from autogluon.core.pseudolabeling.pseudolabeling import filter_ensemble_pseudo,
|
|
|
45
48
|
from autogluon.core.scheduler.scheduler_factory import scheduler_factory
|
|
46
49
|
from autogluon.core.stacked_overfitting.utils import check_stacked_overfitting_from_leaderboard
|
|
47
50
|
from autogluon.core.utils import get_pred_from_proba_df, plot_performance_vs_trials, plot_summary_of_models, plot_tabular_models
|
|
48
|
-
from autogluon.core.utils.decorators import apply_presets
|
|
49
51
|
from autogluon.core.utils.loaders import load_pkl, load_str
|
|
50
52
|
from autogluon.core.utils.savers import save_pkl, save_str
|
|
51
|
-
from autogluon.core.utils.utils import
|
|
53
|
+
from autogluon.core.utils.utils import generate_train_test_split_combined
|
|
52
54
|
|
|
53
55
|
from ..configs.feature_generator_presets import get_default_feature_generator
|
|
54
56
|
from ..configs.hyperparameter_configs import get_hyperparameter_config
|
|
@@ -421,7 +423,7 @@ class TabularPredictor:
|
|
|
421
423
|
num_gpus: int | str = "auto",
|
|
422
424
|
fit_strategy: Literal["sequential", "parallel"] = "sequential",
|
|
423
425
|
memory_limit: float | str = "auto",
|
|
424
|
-
callbacks: list[AbstractCallback] = None,
|
|
426
|
+
callbacks: list[AbstractCallback | list | tuple] = None,
|
|
425
427
|
**kwargs,
|
|
426
428
|
) -> "TabularPredictor":
|
|
427
429
|
"""
|
|
@@ -462,16 +464,23 @@ class TabularPredictor:
|
|
|
462
464
|
It is recommended to only use one `quality` based preset in a given call to `fit()` as they alter many of the same arguments and are not compatible with each-other.
|
|
463
465
|
|
|
464
466
|
In-depth Preset Info:
|
|
465
|
-
extreme_quality={
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
467
|
+
extreme_quality={...}
|
|
468
|
+
New in v1.5: The state-of-the-art for tabular machine learning.
|
|
469
|
+
Requires `pip install autogluon.tabular[tabarena]` to install TabPFN, TabICL, and TabDPT.
|
|
470
|
+
Significantly more accurate than `best_quality` on datasets <= 100000 samples. Requires a GPU.
|
|
471
|
+
Will use recent tabular foundation models TabPFNv2, TabICL, TabDPT, and Mitra to maximize performance.
|
|
469
472
|
Recommended for applications that benefit from the best possible model accuracy.
|
|
470
473
|
|
|
474
|
+
best_quality_v150={...}
|
|
475
|
+
New in v1.5: Better quality than 'best_quality' and 5x+ faster to train. Give it a try!
|
|
476
|
+
|
|
471
477
|
best_quality={'auto_stack': True, 'dynamic_stacking': 'auto', 'hyperparameters': 'zeroshot'}
|
|
472
478
|
Best predictive accuracy with little consideration to inference time or disk usage. Achieve even better results by specifying a large time_limit value.
|
|
473
479
|
Recommended for applications that benefit from the best possible model accuracy.
|
|
474
480
|
|
|
481
|
+
high_quality_v150={...}
|
|
482
|
+
New in v1.5: Better quality than 'high_quality' and 5x+ faster to train. Give it a try!
|
|
483
|
+
|
|
475
484
|
high_quality={'auto_stack': True, 'dynamic_stacking': 'auto', 'hyperparameters': 'zeroshot', 'refit_full': True, 'set_best_to_refit_full': True, 'save_bag_folds': False}
|
|
476
485
|
High predictive accuracy with fast inference. ~8x faster inference and ~8x lower disk usage than `best_quality`.
|
|
477
486
|
Recommended for applications that require reasonable inference speed and/or model size.
|
|
@@ -1091,7 +1100,8 @@ class TabularPredictor:
|
|
|
1091
1100
|
elif verbosity >= 4:
|
|
1092
1101
|
logger.log(20, f"Verbosity: {verbosity} (Maximum Logging)")
|
|
1093
1102
|
|
|
1094
|
-
|
|
1103
|
+
resource_manager: ResourceManager = get_resource_manager()
|
|
1104
|
+
include_gpu_count = resource_manager.get_gpu_count_torch() or verbosity >= 3
|
|
1095
1105
|
sys_msg = get_ag_system_info(path=self.path, include_gpu_count=include_gpu_count)
|
|
1096
1106
|
logger.log(20, sys_msg)
|
|
1097
1107
|
|
|
@@ -1104,11 +1114,13 @@ class TabularPredictor:
|
|
|
1104
1114
|
20,
|
|
1105
1115
|
"No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...\n"
|
|
1106
1116
|
"\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n"
|
|
1107
|
-
"\tpresets='extreme'
|
|
1108
|
-
"\tpresets='best'
|
|
1109
|
-
"\tpresets='
|
|
1110
|
-
"\tpresets='
|
|
1111
|
-
"\tpresets='
|
|
1117
|
+
"\tpresets='extreme' : New in v1.5: The state-of-the-art for tabular data. Massively better than 'best' on datasets <100000 samples by using new Tabular Foundation Models (TFMs) meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, TabDPT, and TabM. Requires a GPU and `pip install autogluon.tabular[tabarena]` to install TabPFN, TabICL, and TabDPT.\n"
|
|
1118
|
+
"\tpresets='best' : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.\n"
|
|
1119
|
+
"\tpresets='best_v150': New in v1.5: Better quality than 'best' and 5x+ faster to train. Give it a try!\n"
|
|
1120
|
+
"\tpresets='high' : Strong accuracy with fast inference speed.\n"
|
|
1121
|
+
"\tpresets='high_v150': New in v1.5: Better quality than 'high' and 5x+ faster to train. Give it a try!\n"
|
|
1122
|
+
"\tpresets='good' : Good accuracy with very fast inference speed.\n"
|
|
1123
|
+
"\tpresets='medium' : Fast training time, ideal for initial prototyping.",
|
|
1112
1124
|
)
|
|
1113
1125
|
|
|
1114
1126
|
kwargs_orig = kwargs.copy()
|
|
@@ -1162,7 +1174,7 @@ class TabularPredictor:
|
|
|
1162
1174
|
# TODO: Temporary for v1.4. Make this more extensible for v1.5 by letting users make their own dynamic hyperparameters.
|
|
1163
1175
|
dynamic_hyperparameters = kwargs["_experimental_dynamic_hyperparameters"]
|
|
1164
1176
|
if dynamic_hyperparameters:
|
|
1165
|
-
logger.log(20, f"`
|
|
1177
|
+
logger.log(20, f"`extreme_v140` preset uses a dynamic portfolio based on dataset size...")
|
|
1166
1178
|
assert hyperparameters is None, f"hyperparameters must be unspecified when `_experimental_dynamic_hyperparameters=True`."
|
|
1167
1179
|
n_samples = len(train_data)
|
|
1168
1180
|
if n_samples > 30000:
|
|
@@ -1591,6 +1603,25 @@ class TabularPredictor:
|
|
|
1591
1603
|
memory_safe_fits = ds_fit_kwargs.get("memory_safe_fits", True)
|
|
1592
1604
|
enable_ray_logging = ds_fit_kwargs.get("enable_ray_logging", True)
|
|
1593
1605
|
normal_fit = False
|
|
1606
|
+
total_resources = ag_fit_kwargs["core_kwargs"]["total_resources"]
|
|
1607
|
+
|
|
1608
|
+
if memory_safe_fits == "auto":
|
|
1609
|
+
num_gpus = total_resources.get("num_gpus", "auto")
|
|
1610
|
+
if num_gpus == "auto":
|
|
1611
|
+
num_gpus = ResourceManager.get_gpu_count_torch()
|
|
1612
|
+
if num_gpus > 0:
|
|
1613
|
+
logger.log(
|
|
1614
|
+
30,
|
|
1615
|
+
f"DyStack: Disabling memory safe fit mode in DyStack "
|
|
1616
|
+
f"because GPUs were detected and num_gpus='auto' (GPUs cannot be used in memory safe fit mode). "
|
|
1617
|
+
f"If you want to use memory safe fit mode, manually set `num_gpus=0`."
|
|
1618
|
+
)
|
|
1619
|
+
if num_gpus > 0:
|
|
1620
|
+
memory_safe_fits = False
|
|
1621
|
+
else:
|
|
1622
|
+
memory_safe_fits = True
|
|
1623
|
+
|
|
1624
|
+
|
|
1594
1625
|
if memory_safe_fits:
|
|
1595
1626
|
try:
|
|
1596
1627
|
_ds_ray = try_import_ray()
|
|
@@ -1630,9 +1661,6 @@ class TabularPredictor:
|
|
|
1630
1661
|
if _ds_ray is not None:
|
|
1631
1662
|
# Handle resources
|
|
1632
1663
|
# FIXME: what about distributed?
|
|
1633
|
-
from autogluon.common.utils.resource_utils import ResourceManager
|
|
1634
|
-
|
|
1635
|
-
total_resources = ag_fit_kwargs["core_kwargs"]["total_resources"]
|
|
1636
1664
|
|
|
1637
1665
|
num_cpus = total_resources.get("num_cpus", "auto")
|
|
1638
1666
|
|
|
@@ -5243,11 +5271,11 @@ class TabularPredictor:
|
|
|
5243
5271
|
holdout_frac=1 / 9,
|
|
5244
5272
|
n_folds=2,
|
|
5245
5273
|
n_repeats=1,
|
|
5246
|
-
memory_safe_fits=
|
|
5274
|
+
memory_safe_fits="auto",
|
|
5247
5275
|
clean_up_fits=True,
|
|
5248
5276
|
holdout_data=None,
|
|
5249
5277
|
enable_ray_logging=True,
|
|
5250
|
-
enable_callbacks=
|
|
5278
|
+
enable_callbacks=True,
|
|
5251
5279
|
)
|
|
5252
5280
|
allowed_kes = set(ds_args.keys())
|
|
5253
5281
|
|
|
@@ -5262,9 +5290,11 @@ class TabularPredictor:
|
|
|
5262
5290
|
(not isinstance(ds_args["validation_procedure"], str)) or (ds_args["validation_procedure"] not in ["holdout", "cv"])
|
|
5263
5291
|
):
|
|
5264
5292
|
raise ValueError("`validation_procedure` in `ds_args` must be str in {'holdout','cv'}. " + f"Got: {ds_args['validation_procedure']}")
|
|
5265
|
-
for arg_name in ["
|
|
5293
|
+
for arg_name in ["clean_up_fits", "enable_ray_logging"]:
|
|
5266
5294
|
if (arg_name in ds_args) and (not isinstance(ds_args[arg_name], bool)):
|
|
5267
5295
|
raise ValueError(f"`{arg_name}` in `ds_args` must be bool. Got: {type(ds_args[arg_name])}")
|
|
5296
|
+
if "memory_safe_fits" in ds_args and not isinstance(ds_args["memory_safe_fits"], (bool, str)):
|
|
5297
|
+
raise ValueError(f"`memory_safe_fits` in `ds_args` must be bool or 'auto'. Got: {type(ds_args['memory_safe_fits'])}")
|
|
5268
5298
|
for arg_name in ["detection_time_frac", "holdout_frac"]:
|
|
5269
5299
|
if (arg_name in ds_args) and ((not isinstance(ds_args[arg_name], float)) or (ds_args[arg_name] >= 1) or (ds_args[arg_name] <= 0)):
|
|
5270
5300
|
raise ValueError(f"`{arg_name}` in `ds_args` must be float in (0,1). Got: {type(ds_args[arg_name])}, {ds_args[arg_name]}")
|
|
@@ -20,14 +20,17 @@ from ..models import (
|
|
|
20
20
|
LinearModel,
|
|
21
21
|
MultiModalPredictorModel,
|
|
22
22
|
NNFastAiTabularModel,
|
|
23
|
+
PrepLGBModel,
|
|
23
24
|
RealMLPModel,
|
|
24
25
|
RFModel,
|
|
25
26
|
RuleFitModel,
|
|
27
|
+
TabDPTModel,
|
|
26
28
|
TabICLModel,
|
|
27
29
|
TabMModel,
|
|
28
30
|
TabPFNMixModel,
|
|
29
31
|
MitraModel,
|
|
30
|
-
|
|
32
|
+
RealTabPFNv2Model,
|
|
33
|
+
RealTabPFNv25Model,
|
|
31
34
|
TabularNeuralNetTorchModel,
|
|
32
35
|
TextPredictorModel,
|
|
33
36
|
XGBoostModel,
|
|
@@ -47,14 +50,17 @@ REGISTERED_MODEL_CLS_LST = [
|
|
|
47
50
|
TabularNeuralNetTorchModel,
|
|
48
51
|
LinearModel,
|
|
49
52
|
NNFastAiTabularModel,
|
|
53
|
+
PrepLGBModel,
|
|
50
54
|
TextPredictorModel,
|
|
51
55
|
ImagePredictorModel,
|
|
52
56
|
MultiModalPredictorModel,
|
|
53
57
|
FTTransformerModel,
|
|
58
|
+
TabDPTModel,
|
|
54
59
|
TabICLModel,
|
|
55
60
|
TabMModel,
|
|
56
61
|
TabPFNMixModel,
|
|
57
|
-
|
|
62
|
+
RealTabPFNv2Model,
|
|
63
|
+
RealTabPFNv25Model,
|
|
58
64
|
MitraModel,
|
|
59
65
|
FastTextModel,
|
|
60
66
|
GreedyWeightedEnsembleModel,
|
|
@@ -4,6 +4,9 @@ import copy
|
|
|
4
4
|
import os
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import shutil
|
|
7
|
+
import sys
|
|
8
|
+
import subprocess
|
|
9
|
+
import textwrap
|
|
7
10
|
import uuid
|
|
8
11
|
from typing import Any, Type
|
|
9
12
|
|
|
@@ -12,6 +15,7 @@ from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
|
|
|
12
15
|
from autogluon.core.metrics import METRICS
|
|
13
16
|
from autogluon.core.models import AbstractModel, BaggedEnsembleModel
|
|
14
17
|
from autogluon.core.stacked_overfitting.utils import check_stacked_overfitting_from_leaderboard
|
|
18
|
+
from autogluon.core.testing.global_context_snapshot import GlobalContextSnapshot
|
|
15
19
|
from autogluon.core.utils import download, generate_train_test_split_combined, infer_problem_type, unzip
|
|
16
20
|
|
|
17
21
|
from autogluon.tabular import TabularDataset, TabularPredictor
|
|
@@ -175,6 +179,8 @@ class FitHelper:
|
|
|
175
179
|
use_test_for_val: bool = False,
|
|
176
180
|
raise_on_model_failure: bool | None = None,
|
|
177
181
|
deepcopy_fit_args: bool = True,
|
|
182
|
+
verify_model_seed: bool = False,
|
|
183
|
+
verify_load_wo_cuda: bool = False,
|
|
178
184
|
) -> TabularPredictor:
|
|
179
185
|
if compiler_configs is None:
|
|
180
186
|
compiler_configs = {}
|
|
@@ -218,6 +224,8 @@ class FitHelper:
|
|
|
218
224
|
expected_model_count -= 1
|
|
219
225
|
fit_args["fit_weighted_ensemble"] = fit_weighted_ensemble
|
|
220
226
|
|
|
227
|
+
ctx_before = GlobalContextSnapshot.capture()
|
|
228
|
+
|
|
221
229
|
predictor: TabularPredictor = FitHelper.fit_dataset(
|
|
222
230
|
train_data=train_data,
|
|
223
231
|
init_args=init_args,
|
|
@@ -226,6 +234,10 @@ class FitHelper:
|
|
|
226
234
|
scikit_api=scikit_api,
|
|
227
235
|
min_cls_count_train=min_cls_count_train,
|
|
228
236
|
)
|
|
237
|
+
|
|
238
|
+
ctx_after = GlobalContextSnapshot.capture()
|
|
239
|
+
ctx_before.assert_unchanged(ctx_after)
|
|
240
|
+
|
|
229
241
|
if compile:
|
|
230
242
|
predictor.compile(models="all", compiler_configs=compiler_configs)
|
|
231
243
|
predictor.persist(models="all")
|
|
@@ -269,6 +281,11 @@ class FitHelper:
|
|
|
269
281
|
assert not model_info["val_in_fit"], f"val data must not be present in refit model if `can_refit_full=True`. Maybe an exception occurred?"
|
|
270
282
|
else:
|
|
271
283
|
assert model_info["val_in_fit"], f"val data must be present in refit model if `can_refit_full=False`"
|
|
284
|
+
if verify_model_seed:
|
|
285
|
+
model_names = predictor.model_names()
|
|
286
|
+
for model_name in model_names:
|
|
287
|
+
model = predictor._trainer.load_model(model_name)
|
|
288
|
+
_verify_model_seed(model=model)
|
|
272
289
|
|
|
273
290
|
if predictor_info:
|
|
274
291
|
predictor.info()
|
|
@@ -281,6 +298,28 @@ class FitHelper:
|
|
|
281
298
|
predictor_load = predictor.load(path=predictor.path)
|
|
282
299
|
predictor_load.predict(test_data)
|
|
283
300
|
|
|
301
|
+
# TODO: This is expensive, only do this sparingly.
|
|
302
|
+
if verify_load_wo_cuda:
|
|
303
|
+
import torch
|
|
304
|
+
if torch.cuda.is_available():
|
|
305
|
+
# Checks if the model is able to predict w/o CUDA.
|
|
306
|
+
# This verifies that a model artifact works on a CPU machine.
|
|
307
|
+
predictor_path = predictor.path
|
|
308
|
+
|
|
309
|
+
code = textwrap.dedent(f"""
|
|
310
|
+
import os
|
|
311
|
+
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
|
312
|
+
from autogluon.tabular import TabularPredictor
|
|
313
|
+
|
|
314
|
+
import torch
|
|
315
|
+
assert torch.cuda.is_available() is False
|
|
316
|
+
predictor = TabularPredictor.load(r"{predictor_path}")
|
|
317
|
+
X, y = predictor.load_data_internal()
|
|
318
|
+
predictor.persist("all")
|
|
319
|
+
predictor.predict_multi(X, transform_features=False)
|
|
320
|
+
""")
|
|
321
|
+
subprocess.run([sys.executable, "-c", code], check=True)
|
|
322
|
+
|
|
284
323
|
assert os.path.realpath(save_path) == os.path.realpath(predictor.path)
|
|
285
324
|
if delete_directory:
|
|
286
325
|
shutil.rmtree(save_path, ignore_errors=True) # Delete AutoGluon output directory to ensure runs' information has been removed.
|
|
@@ -339,6 +378,7 @@ class FitHelper:
|
|
|
339
378
|
require_known_problem_types: bool = True,
|
|
340
379
|
raise_on_model_failure: bool = True,
|
|
341
380
|
problem_types: list[str] | None = None,
|
|
381
|
+
verify_model_seed: bool = True,
|
|
342
382
|
**kwargs,
|
|
343
383
|
):
|
|
344
384
|
"""
|
|
@@ -355,12 +395,18 @@ class FitHelper:
|
|
|
355
395
|
problem_types: list[str], optional
|
|
356
396
|
If specified, checks the given problem_types.
|
|
357
397
|
If None, checks `model_cls.supported_problem_types()`
|
|
398
|
+
verify_model_seed: bool = True
|
|
358
399
|
**kwargs
|
|
359
400
|
|
|
360
401
|
Returns
|
|
361
402
|
-------
|
|
362
403
|
|
|
363
404
|
"""
|
|
405
|
+
if verify_model_seed and model_cls.seed_name is not None:
|
|
406
|
+
# verify that the seed logic works
|
|
407
|
+
model_hyperparameters = model_hyperparameters.copy()
|
|
408
|
+
model_hyperparameters[model_cls.seed_name] = 42
|
|
409
|
+
|
|
364
410
|
fit_args = dict(
|
|
365
411
|
hyperparameters={model_cls: model_hyperparameters},
|
|
366
412
|
)
|
|
@@ -429,6 +475,7 @@ class FitHelper:
|
|
|
429
475
|
refit_full=refit_full,
|
|
430
476
|
extra_metrics=_extra_metrics,
|
|
431
477
|
raise_on_model_failure=raise_on_model_failure,
|
|
478
|
+
verify_model_seed=verify_model_seed,
|
|
432
479
|
**kwargs,
|
|
433
480
|
)
|
|
434
481
|
|
|
@@ -460,6 +507,7 @@ class FitHelper:
|
|
|
460
507
|
refit_full=refit_full,
|
|
461
508
|
extra_metrics=_extra_metrics,
|
|
462
509
|
raise_on_model_failure=raise_on_model_failure,
|
|
510
|
+
verify_model_seed=verify_model_seed,
|
|
463
511
|
**kwargs,
|
|
464
512
|
)
|
|
465
513
|
|
|
@@ -476,3 +524,16 @@ def stacked_overfitting_assert(
|
|
|
476
524
|
if expected_stacked_overfitting_at_test is not None:
|
|
477
525
|
stacked_overfitting = check_stacked_overfitting_from_leaderboard(lb)
|
|
478
526
|
assert stacked_overfitting == expected_stacked_overfitting_at_test, "Expected stacked overfitting at test mismatch!"
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def _verify_model_seed(model: AbstractModel):
    """Assert that a model's random seed agrees with its hyperparameters.

    Checks performed:

    * ``model.random_seed`` is either ``None`` or an ``int``.
    * When ``model.seed_name`` is set, the seed must be stored under that key
      in ``model.params`` and equal ``model.random_seed``. If the same key is
      also present in ``model._user_params`` (presumably the hyperparameters
      supplied by the caller -- confirm against ``AbstractModel``), that value
      must match as well.
    * For a ``BaggedEnsembleModel``, every entry of ``model.models`` is loaded
      through ``model.load_child`` and verified recursively.

    Parameters
    ----------
    model : AbstractModel
        The model instance to verify.

    Raises
    ------
    AssertionError
        If any of the checks above fails.
    """
    seed = model.random_seed
    assert seed is None or isinstance(seed, int)

    seed_key = model.seed_name
    if seed_key is not None:
        user_params = model._user_params
        if seed_key in user_params:
            # An explicitly provided seed must be the one the model ended up using.
            assert seed == user_params[seed_key]
        assert seed_key in model.params
        assert seed == model.params[seed_key]

    # Only bagged ensembles hold child models that need the same verification.
    if not isinstance(model, BaggedEnsembleModel):
        return
    for child_ref in model.models:
        _verify_model_seed(model.load_child(child_ref))
|