autogluon.tabular 1.3.2b20250610__py3-none-any.whl → 1.4.1b20251214__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autogluon/tabular/configs/config_helper.py +1 -1
- autogluon/tabular/configs/hyperparameter_configs.py +2 -265
- autogluon/tabular/configs/pipeline_presets.py +130 -0
- autogluon/tabular/configs/presets_configs.py +51 -26
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +0 -1
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +310 -0
- autogluon/tabular/models/__init__.py +6 -1
- autogluon/tabular/models/_utils/rapids_utils.py +1 -1
- autogluon/tabular/models/automm/automm_model.py +2 -0
- autogluon/tabular/models/automm/ft_transformer.py +4 -1
- autogluon/tabular/models/catboost/callbacks.py +3 -2
- autogluon/tabular/models/catboost/catboost_model.py +15 -9
- autogluon/tabular/models/catboost/catboost_utils.py +17 -3
- autogluon/tabular/models/ebm/__init__.py +0 -0
- autogluon/tabular/models/ebm/ebm_model.py +259 -0
- autogluon/tabular/models/ebm/hyperparameters/__init__.py +0 -0
- autogluon/tabular/models/ebm/hyperparameters/parameters.py +39 -0
- autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +72 -0
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +7 -5
- autogluon/tabular/models/knn/knn_model.py +7 -3
- autogluon/tabular/models/lgb/lgb_model.py +60 -21
- autogluon/tabular/models/lr/lr_model.py +6 -1
- autogluon/tabular/models/lr/lr_preprocessing_utils.py +6 -7
- autogluon/tabular/models/lr/lr_rapids_model.py +45 -5
- autogluon/tabular/models/mitra/__init__.py +0 -0
- autogluon/tabular/models/mitra/_internal/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +190 -0
- autogluon/tabular/models/mitra/_internal/config/config_run.py +32 -0
- autogluon/tabular/models/mitra/_internal/config/enums.py +162 -0
- autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/core/callbacks.py +94 -0
- autogluon/tabular/models/mitra/_internal/core/get_loss.py +54 -0
- autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +108 -0
- autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +67 -0
- autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +132 -0
- autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +373 -0
- autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/data/collator.py +46 -0
- autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +136 -0
- autogluon/tabular/models/mitra/_internal/data/dataset_split.py +57 -0
- autogluon/tabular/models/mitra/_internal/data/preprocessor.py +420 -0
- autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/models/base.py +21 -0
- autogluon/tabular/models/mitra/_internal/models/embedding.py +182 -0
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +667 -0
- autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/utils/set_seed.py +15 -0
- autogluon/tabular/models/mitra/mitra_model.py +380 -0
- autogluon/tabular/models/mitra/sklearn_interface.py +494 -0
- autogluon/tabular/models/realmlp/__init__.py +0 -0
- autogluon/tabular/models/realmlp/realmlp_model.py +360 -0
- autogluon/tabular/models/rf/rf_model.py +11 -6
- autogluon/tabular/models/tabicl/__init__.py +0 -0
- autogluon/tabular/models/tabicl/tabicl_model.py +179 -0
- autogluon/tabular/models/tabm/__init__.py +0 -0
- autogluon/tabular/models/tabm/_tabm_internal.py +545 -0
- autogluon/tabular/models/tabm/rtdl_num_embeddings.py +810 -0
- autogluon/tabular/models/tabm/tabm_model.py +356 -0
- autogluon/tabular/models/tabm/tabm_reference.py +631 -0
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +13 -7
- autogluon/tabular/models/tabpfnv2/__init__.py +0 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +20 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +40 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +201 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +1464 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +747 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +863 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +106 -0
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +388 -0
- autogluon/tabular/models/tabular_nn/hyperparameters/parameters.py +1 -3
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +5 -5
- autogluon/tabular/models/xgboost/xgboost_model.py +10 -3
- autogluon/tabular/predictor/predictor.py +147 -84
- autogluon/tabular/registry/_ag_model_registry.py +12 -2
- autogluon/tabular/testing/fit_helper.py +57 -27
- autogluon/tabular/testing/generate_datasets.py +7 -0
- autogluon/tabular/trainer/abstract_trainer.py +3 -1
- autogluon/tabular/trainer/model_presets/presets.py +10 -1
- autogluon/tabular/version.py +1 -1
- autogluon.tabular-1.4.1b20251214-py3.11-nspkg.pth +1 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/METADATA +112 -57
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/RECORD +89 -40
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/WHEEL +1 -1
- autogluon/tabular/models/tabpfn/__init__.py +0 -1
- autogluon/tabular/models/tabpfn/tabpfn_model.py +0 -153
- autogluon.tabular-1.3.2b20250610-py3.9-nspkg.pth +0 -1
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info/licenses}/LICENSE +0 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info/licenses}/NOTICE +0 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/zip-safe +0 -0
autogluon/tabular/models/ebm/ebm_model.py (new file)

@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+import time
+import warnings
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+from autogluon.core.models import AbstractModel
+
+from .hyperparameters.parameters import get_param_baseline
+from .hyperparameters.searchspaces import get_default_searchspace
+
+if TYPE_CHECKING:
+    from autogluon.core.metrics import Scorer
+
+
+class EbmCallback:
+    """Time limit callback for EBM."""
+
+    def __init__(self, seconds: float):
+        self.seconds = seconds
+        self.end_time: float | None = None
+
+    def __call__(self, *args, **kwargs):
+        if self.end_time is None:
+            self.end_time = time.monotonic() + self.seconds
+            return False
+        return time.monotonic() > self.end_time
+
+
+class EBMModel(AbstractModel):
+    """
+    The Explainable Boosting Machine (EBM) is a glass-box generalized additive model
+    with automatic interaction detection (https://interpret.ml/docs). EBMs are
+    designed to be highly interpretable while achieving accuracy comparable to
+    black-box models on a wide range of tabular datasets.
+
+    Requires the 'interpret' or 'interpret-core' package. Install via:
+
+        pip install interpret
+
+
+    Paper: InterpretML: A Unified Framework for Machine Learning Interpretability
+
+    Authors: H. Nori, S. Jenkins, P. Koch, and R. Caruana 2019
+
+    Codebase: https://github.com/interpretml/interpret
+
+    License: MIT
+
+    .. versionadded:: 1.5.0
+    """
+
+    ag_key = "EBM"
+    ag_name = "EBM"
+    ag_priority = 35
+    seed_name = "random_state"
+
+    def _fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        X_val: pd.DataFrame | None = None,
+        y_val: pd.Series | None = None,
+        time_limit: float | None = None,
+        sample_weight: np.ndarray | None = None,
+        sample_weight_val: np.ndarray | None = None,
+        num_cpus: int | str = "auto",
+        **kwargs,
+    ):
+        # Preprocess data.
+        X = self.preprocess(X)
+        if X_val is not None:
+            X_val = self.preprocess(X_val)
+
+        features = self._features
+        if features is None:
+            features = X.columns
+
+        params = construct_ebm_params(
+            self.problem_type,
+            self._get_model_params(),
+            features,
+            self.stopping_metric,
+            num_cpus,
+            time_limit,
+        )
+
+        # Init Class
+        model_cls = get_class_from_problem_type(self.problem_type)
+        self.model = model_cls(**params)
+
+        # Handle validation data format for EBM
+        fit_X = X
+        fit_y = y
+        fit_sample_weight = sample_weight
+        bags = None
+        if X_val is not None:
+            fit_X = pd.concat([X, X_val], ignore_index=True)
+            fit_y = pd.concat([y, y_val], ignore_index=True)
+            if sample_weight is not None:
+                fit_sample_weight = np.hstack([sample_weight, sample_weight_val])
+            bags = np.full((len(fit_X), 1), 1, np.int8)
+            bags[len(X) :, 0] = -1
+
+        with warnings.catch_warnings():  # try to filter joblib warnings
+            warnings.filterwarnings(
+                "ignore",
+                category=UserWarning,
+                message=".*resource_tracker: process died.*",
+            )
+            self.model.fit(fit_X, fit_y, sample_weight=fit_sample_weight, bags=bags)
+
+    def _set_default_params(self):
+        default_params = get_param_baseline(problem_type=self.problem_type, num_classes=self.num_classes)
+        for param, val in default_params.items():
+            self._set_default_param_value(param, val)
+
+    def _get_default_searchspace(self):
+        return get_default_searchspace(problem_type=self.problem_type, num_classes=self.num_classes)
+
+    def _get_default_auxiliary_params(self) -> dict:
+        default_auxiliary_params = super()._get_default_auxiliary_params()
+        extra_auxiliary_params = {
+            "valid_raw_types": ["int", "float", "category"],
+        }
+        default_auxiliary_params.update(extra_auxiliary_params)
+        return default_auxiliary_params
+
+    @classmethod
+    def supported_problem_types(cls) -> list[str] | None:
+        return ["binary", "multiclass", "regression"]
+
+    @classmethod
+    def _class_tags(cls) -> dict:
+        return {"can_estimate_memory_usage_static": True}
+
+    def _more_tags(self) -> dict:
+        """EBMs support refit full."""
+        return {"can_refit_full": True}
+
+    def _estimate_memory_usage(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs) -> int:
+        return self.estimate_memory_usage_static(
+            X=X,
+            y=y,
+            hyperparameters=self._get_model_params(),
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            features=self._features,
+            **kwargs,
+        )
+
+    @classmethod
+    def _estimate_memory_usage_static(
+        cls,
+        *,
+        X: pd.DataFrame,
+        y: pd.Series | None = None,
+        hyperparameters: dict | None = None,
+        problem_type: str = "infer",
+        num_classes: int = 1,
+        features=None,
+        **kwargs,
+    ) -> int:
+        """Returns the expected peak memory usage in bytes of the EBM model during fit."""
+        # TODO: we can improve the memory estimate slightly by using num_classes if y is None
+
+        if features is None:
+            features = X.columns
+
+        model_cls = get_class_from_problem_type(problem_type)
+        params = construct_ebm_params(problem_type, hyperparameters, features)
+        baseline_memory_bytes = 400_000_000  # 400 MB baseline memory
+
+        # assuming we call pd.concat([X, X_val], ignore_index=True), then X size will be doubled
+        return baseline_memory_bytes + model_cls(**params).estimate_mem(
+            X, y, data_multiplier=2.0
+        )
+
+    def _validate_fit_memory_usage(self, mem_error_threshold: float = 1, **kwargs):
+        # Given the good mem estimates with overhead, we set the threshold to 1.
+        return super()._validate_fit_memory_usage(
+            mem_error_threshold=mem_error_threshold, **kwargs
+        )
+
+
+def construct_ebm_params(
+    problem_type,
+    hyperparameters=None,
+    features=None,
+    stopping_metric=None,
+    num_cpus=-1,
+    time_limit=None,
+):
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    hyperparameters = hyperparameters.copy()  # we pop values below, so copy.
+
+    # The user can specify nominal and continuous columns.
+    continuous_columns = hyperparameters.pop("continuous_columns", [])
+    nominal_columns = hyperparameters.pop("nominal_columns", [])
+
+    feature_types = None
+    if features is not None:
+        feature_types = []
+        for c in features:
+            if c in continuous_columns:
+                f_type = "continuous"
+            elif c in nominal_columns:
+                f_type = "nominal"
+            else:
+                f_type = "auto"
+            feature_types.append(f_type)
+
+    # Default parameters for EBM
+    params = {
+        "outer_bags": 1,  # AutoGluon ensemble creates outer bags, no need for this overhead.
+        "n_jobs": 1,  # EBM only parallelizes across outer bags currently, so ignore num_cpus
+        "feature_names": features,
+        "feature_types": feature_types,
+    }
+    if stopping_metric is not None:
+        params["objective"] = get_metric_from_ag_metric(
+            metric=stopping_metric, problem_type=problem_type
+        )
+    if time_limit is not None:
+        params["callback"] = EbmCallback(time_limit)
+
+    params.update(hyperparameters)
+    return params
+
+
+def get_class_from_problem_type(problem_type: str):
+    if problem_type in [BINARY, MULTICLASS]:
+        from interpret.glassbox import ExplainableBoostingClassifier
+
+        model_cls = ExplainableBoostingClassifier
+    elif problem_type == REGRESSION:
+        from interpret.glassbox import ExplainableBoostingRegressor
+
+        model_cls = ExplainableBoostingRegressor
+    else:
+        raise ValueError(f"Unsupported problem type: {problem_type}")
+    return model_cls
+
+
+def get_metric_from_ag_metric(*, metric: Scorer, problem_type: str):
+    """Map AutoGluon metric to EBM metric for early stopping."""
+    if problem_type in [BINARY, MULTICLASS]:
+        metric_class = "log_loss"
+    elif problem_type == REGRESSION:
+        metric_class = "rmse"
+    else:
+        raise AssertionError(f"EBM does not support {problem_type} problem type.")
+
+    return metric_class
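Usage note: the model above registers with ag_key = "EBM" and needs the `interpret` package. A minimal sketch of how it could be requested, assuming the registered "EBM" key is accepted by TabularPredictor's hyperparameters argument (the file path, label column, and feature names below are hypothetical):

    # Sketch only: assumes "EBM" is accepted as a model key and that
    # `pip install interpret` has been run; "train.csv", "class", "city",
    # and "income" are made-up names.
    from autogluon.tabular import TabularDataset, TabularPredictor

    train_data = TabularDataset("train.csv")
    predictor = TabularPredictor(label="class").fit(
        train_data,
        hyperparameters={
            "EBM": {
                "nominal_columns": ["city"],       # consumed by construct_ebm_params
                "continuous_columns": ["income"],  # consumed by construct_ebm_params
            }
        },
    )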
File without changes
autogluon/tabular/models/ebm/hyperparameters/parameters.py (new file)

@@ -0,0 +1,39 @@
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION, SOFTCLASS
+
+def get_param_baseline(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_param_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == SOFTCLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_param_regression_baseline()
+    else:
+        return get_param_binary_baseline()
+
+
+def get_base_params():
+    base_params = {}
+    return base_params
+
+
+def get_param_binary_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_multiclass_baseline(num_classes):
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_regression_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
autogluon/tabular/models/ebm/hyperparameters/searchspaces.py (new file)

@@ -0,0 +1,72 @@
+"""Default hyperparameter search spaces used in EBM model"""
+
+from autogluon.common import space
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+
+def get_default_searchspace(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_searchspace_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_searchspace_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_searchspace_regression_baseline()
+    else:
+        return get_searchspace_binary_baseline()
+
+
+def get_base_searchspace():
+    base_params = {
+        "max_leaves": space.Int(2, 3, default=2),
+        "smoothing_rounds": space.Int(0, 1000, default=200),
+        "learning_rate": space.Real(0.0025, 0.2, default=0.02, log=True),
+        "interactions": space.Categorical(
+            0,
+            "0.5x",
+            "1x",
+            "1.5x",
+            "2x",
+            "2.5x",
+            "3x",
+            "3.5x",
+            "4x",
+            "4.5x",
+            "5x",
+            "6x",
+            "7x",
+            "8x",
+            "9x",
+            "10x",
+            "15x",
+            "20x",
+            "25x",
+        ),
+        "interaction_smoothing_rounds": space.Int(0, 200, default=90),
+        "min_hessian": space.Real(1e-10, 1e-2, default=1e-4, log=True),
+        "min_samples_leaf": space.Int(2, 20, default=4),
+        "gain_scale": space.Real(0.5, 5.0, default=5.0, log=True),
+        "min_cat_samples": space.Int(5, 20, default=10),
+        "cat_smooth": space.Real(5.0, 100.0, default=10.0, log=True),
+        "missing": space.Categorical("separate", "low", "high", "gain"),
+    }
+    return base_params
+
+
+def get_searchspace_multiclass_baseline(num_classes):
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_binary_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_regression_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
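Because the entries above are autogluon.common.space objects, the same objects can be passed by a caller to override individual entries. A short sketch, assuming the usual TabularPredictor HPO path and that the EBM model is addressed by its "EBM" key (both assumptions; the data file and label column are hypothetical):

    # Sketch only: overrides two of the search-space entries defined above.
    from autogluon.common import space
    from autogluon.tabular import TabularDataset, TabularPredictor

    train_data = TabularDataset("train.csv")  # hypothetical file
    predictor = TabularPredictor(label="class").fit(
        train_data,
        hyperparameters={
            "EBM": {
                "learning_rate": space.Real(0.0025, 0.2, default=0.02, log=True),
                "max_leaves": space.Int(2, 3, default=2),
            }
        },
        hyperparameter_tune_kwargs="auto",  # enable HPO so the spaces are actually searched
    )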
autogluon/tabular/models/fastainn/tabular_nn_fastai.py

@@ -103,6 +103,7 @@ class NNFastAiTabularModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         MULTICLASS: 95,
     })
+    seed_name = "random_seed"
 
     model_internals_file_name = "model-internals.pkl"
 
@@ -322,8 +323,9 @@ class NNFastAiTabularModel(AbstractModel):
         # Make deterministic
         from fastai.torch_core import set_seed
 
-
-
+        random_seed = params.pop(self.seed_name, self.default_random_seed)
+        set_seed(random_seed, True)
+        dls.rng.seed(random_seed)
 
         if self.problem_type == QUANTILE:
             dls.c = len(self.quantile_levels)
@@ -584,8 +586,8 @@ class NNFastAiTabularModel(AbstractModel):
         return default_auxiliary_params
 
     def _get_default_resources(self):
-        #
-        num_cpus = ResourceManager.
+        # only_physical_cores=True is faster in training
+        num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
         num_gpus = 0
         return num_cpus, num_gpus
 
@@ -642,7 +644,7 @@ class NNFastAiTabularModel(AbstractModel):
 
     def _get_maximum_resources(self) -> dict[str, Union[int, float]]:
         # fastai model trains slower when utilizing virtual cores and this issue scale up when the number of cpu cores increases
-        return {"num_cpus": ResourceManager.
+        return {"num_cpus": ResourceManager.get_cpu_count(only_physical_cores=True)}
 
     def get_minimum_resources(self, is_gpu_available=False):
         minimum_resources = {
autogluon/tabular/models/knn/knn_model.py

@@ -214,7 +214,7 @@ class KNNModel(AbstractModel):
         def sample_func(chunk, frac):
             # Guarantee at least 1 sample (otherwise log_loss would crash or model would return different column counts in pred_proba)
             n = max(math.ceil(len(chunk) * frac), 1)
-            return chunk.sample(n=n, replace=False, random_state=
+            return chunk.sample(n=n, replace=False, random_state=self.random_seed)
 
         if self.problem_type != REGRESSION:
             y_df = y.to_frame(name="label").reset_index(drop=True)
@@ -255,9 +255,13 @@ class KNNModel(AbstractModel):
         self._X_unused_index = [i for i in range(num_rows_max) if i not in idx]
         return self.model
 
-    def _get_maximum_resources(self) ->
+    def _get_maximum_resources(self) -> dict[str, int | float]:
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020
-
+        # no GPU support
+        return {
+            "num_cpus": 32,
+            "num_gpus": 0,
+        }
 
     def _get_default_resources(self):
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020
autogluon/tabular/models/lgb/lgb_model.py

@@ -46,6 +46,8 @@ class LGBModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         SOFTCLASS: 100
     })
+    seed_name = "seed"
+    seed_name_alt = ["seed_value", "random_seed", "random_state"]
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -161,7 +163,7 @@ class LGBModel(AbstractModel):
             # Before enabling GPU, we should add code to detect that GPU-enabled version is installed and that a valid GPU exists.
             # GPU training heavily alters accuracy, often in a negative manner. We will have to be careful about when to use GPU.
             params["device"] = "gpu"
-            logger.log(20, f"\
+            logger.log(20, f"\tWarning: Training LightGBM with GPU. This may negatively impact model quality compared to CPU training.")
         logger.log(15, f"\tFitting {num_boost_round} rounds... Hyperparameters: {params}")
 
         if "num_threads" not in params:
@@ -225,7 +227,6 @@ class LGBModel(AbstractModel):
         if log_period is not None:
             callbacks.append(log_evaluation(period=log_period))
 
-        seed_val = params.pop("seed_value", 0)
         train_params = {
             "params": params,
             "train_set": dataset_train,
@@ -281,11 +282,10 @@ class LGBModel(AbstractModel):
             train_params["params"]["metric"] = f'{stopping_metric},{train_params["params"]["metric"]}'
 
         if self.problem_type == SOFTCLASS:
-            train_params["
+            train_params["params"]["objective"] = lgb_utils.softclass_lgbobj
+            train_params["params"]["num_classes"] = self.num_classes
         elif self.problem_type == QUANTILE:
            train_params["params"]["quantile_levels"] = self.quantile_levels
-        if seed_val is not None:
-            train_params["params"]["seed"] = seed_val
 
         # Train LightGBM model:
         # Note that self.model contains a <class 'lightgbm.basic.Booster'> not a LightBGMClassifier or LightGBMRegressor object
@@ -298,16 +298,28 @@ class LGBModel(AbstractModel):
         try:
             self.model = train_lgb_model(early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params)
         except LightGBMError:
-            if train_params["params"].get("device", "cpu")
+            if train_params["params"].get("device", "cpu") not in ["gpu", "cuda"]:
                 raise
             else:
-
-
-
-
-
-
-
+                if train_params["params"]["device"] == "gpu":
+                    logger.warning(
+                        "Warning: GPU mode might not be installed for LightGBM, "
+                        "GPU training raised an exception. Falling back to CPU training..."
+                        "Refer to LightGBM GPU documentation: "
+                        "https://github.com/Microsoft/LightGBM/tree/master/python-package#build-gpu-version"
+                        "One possible method is:"
+                        "\tpip uninstall lightgbm -y"
+                        "\tpip install lightgbm --install-option=--gpu"
+                    )
+                elif train_params["params"]["device"] == "cuda":
+                    # Current blocker for using CUDA over GPU: https://github.com/microsoft/LightGBM/issues/6828
+                    # Note that device="cuda" works if AutoGluon (and therefore LightGBM) is installed via conda.
+                    logger.warning(
+                        "Warning: CUDA mode might not be installed for LightGBM, "
+                        "CUDA training raised an exception. Falling back to CPU training..."
+                        "Refer to LightGBM CUDA documentation: "
+                        "https://github.com/Microsoft/LightGBM/tree/master/python-package#build-cuda-version"
+                    )
                 train_params["params"]["device"] = "cpu"
                 self.model = train_lgb_model(early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params)
         retrain = False
@@ -508,17 +520,44 @@ class LGBModel(AbstractModel):
         default_auxiliary_params.update(extra_auxiliary_params)
         return default_auxiliary_params
 
-
+    @staticmethod
+    def _is_gpu_lgbm_installed():
        # Taken from https://github.com/microsoft/LightGBM/issues/3939
        try_import_lightgbm()
        import lightgbm
 
+        rng = np.random.RandomState(42)
+        data = rng.rand(25, 2)
+        label = rng.randint(2, size=25)
+
+        try:
+            train_data = lightgbm.Dataset(data, label=label)
+            params = {
+                "device": "gpu",
+                "verbose": -1,
+            }
+            gbm = lightgbm.train(params, num_boost_round=10, train_set=train_data)
+            return True
+        except Exception as e:
+            return False
+
+    @staticmethod
+    def _is_cuda_lgbm_installed():
+        # Taken from https://github.com/microsoft/LightGBM/issues/3939
+        try_import_lightgbm()
+        import lightgbm
+
+        rng = np.random.RandomState(42)
+        data = rng.rand(25, 2)
+        label = rng.randint(2, size=25)
+
         try:
-            data = np.random.rand(50, 2)
-            label = np.random.randint(2, size=50)
             train_data = lightgbm.Dataset(data, label=label)
-            params = {
-
+            params = {
+                "device": "cuda",
+                "verbose": -1,
+            }
+            gbm = lightgbm.train(params, num_boost_round=10, train_set=train_data)
             return True
         except Exception as e:
             return False
@@ -527,13 +566,13 @@ class LGBModel(AbstractModel):
         minimum_resources = {
             "num_cpus": 1,
         }
-        if is_gpu_available
+        if is_gpu_available:
             minimum_resources["num_gpus"] = 0.5
         return minimum_resources
 
     def _get_default_resources(self):
-        #
-        num_cpus = ResourceManager.
+        # only_physical_cores=True is faster in training
+        num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
         num_gpus = 0
         return num_cpus, num_gpus
 
autogluon/tabular/models/lr/lr_model.py

@@ -43,6 +43,7 @@ class LinearModel(AbstractModel):
     ag_key = "LR"
     ag_name = "LinearModel"
     ag_priority = 30
+    seed_name = "random_state"
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -155,7 +156,7 @@ class LinearModel(AbstractModel):
         return self._pipeline.fit_transform(X)
 
     def _set_default_params(self):
-        default_params = {"
+        default_params = {"fit_intercept": True}
         if self.problem_type != REGRESSION:
             default_params.update({"solver": _get_solver(self.problem_type)})
         default_params.update(get_param_baseline())
@@ -319,6 +320,10 @@ class LinearModel(AbstractModel):
     ) -> int:
         return 4 * get_approximate_df_mem_usage(X).sum()
 
+    def _get_maximum_resources(self) -> dict[str, int | float]:
+        # no GPU support
+        return {"num_gpus": 0}
+
     @classmethod
     def supported_problem_types(cls) -> list[str] | None:
         return ["binary", "multiclass", "regression"]
autogluon/tabular/models/lr/lr_preprocessing_utils.py

@@ -5,20 +5,19 @@ from autogluon.features.generators import OneHotEncoderFeatureGenerator
 
 class OheFeaturesGenerator(BaseEstimator, TransformerMixin):
     def __init__(self):
-
-        self._encoder = None
+        pass
 
     def fit(self, X, y=None):
-        self.
-        self.
-        self.
+        self.encoder_ = OneHotEncoderFeatureGenerator(max_levels=10000, verbosity=0)
+        self.encoder_.fit(X)
+        self.feature_names_ = self.encoder_.features_out
         return self
 
     def transform(self, X, y=None):
-        return self.
+        return self.encoder_.transform_ohe(X)
 
     def get_feature_names(self):
-        return self.
+        return self.feature_names_
 
 
 class NlpDataPreprocessor(BaseEstimator, TransformerMixin):
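The OheFeaturesGenerator change above appears to follow scikit-learn's estimator convention: __init__ holds no fitted state, and learned attributes (encoder_, feature_names_) get a trailing underscore and are created in fit. A small generic sketch of why that convention is useful (illustration only, not AutoGluon code): with state created only in fit, sklearn.base.clone can produce a fresh, unfitted copy of the transformer.

    # Generic sketch (not from the diff): trailing-underscore attributes set in
    # fit() keep clone() and repeated fitting well behaved.
    from sklearn.base import BaseEstimator, TransformerMixin, clone

    class PassthroughTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            self.n_features_in_ = X.shape[1]  # fitted attribute, created in fit only
            return self

        def transform(self, X, y=None):
            return X

    fresh_copy = clone(PassthroughTransformer())  # unfitted copy, no leaked state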