autogluon.tabular 1.4.0 → 1.4.1b20251128 (py3-none-any wheels)
This diff shows the changes between the publicly released versions autogluon.tabular 1.4.0 and 1.4.1b20251128 as they appear in their public registry. It is provided for informational purposes only.
- autogluon/tabular/configs/pipeline_presets.py +130 -0
- autogluon/tabular/configs/presets_configs.py +0 -3
- autogluon/tabular/models/__init__.py +1 -0
- autogluon/tabular/models/catboost/catboost_model.py +4 -1
- autogluon/tabular/models/ebm/__init__.py +0 -0
- autogluon/tabular/models/ebm/ebm_model.py +259 -0
- autogluon/tabular/models/ebm/hyperparameters/__init__.py +0 -0
- autogluon/tabular/models/ebm/hyperparameters/parameters.py +39 -0
- autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +72 -0
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +4 -2
- autogluon/tabular/models/knn/knn_model.py +7 -3
- autogluon/tabular/models/lgb/lgb_model.py +56 -18
- autogluon/tabular/models/lr/lr_model.py +6 -1
- autogluon/tabular/models/lr/lr_preprocessing_utils.py +6 -7
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +10 -10
- autogluon/tabular/models/mitra/mitra_model.py +43 -3
- autogluon/tabular/models/mitra/sklearn_interface.py +8 -21
- autogluon/tabular/models/realmlp/realmlp_model.py +1 -3
- autogluon/tabular/models/rf/rf_model.py +5 -1
- autogluon/tabular/models/tabicl/tabicl_model.py +1 -7
- autogluon/tabular/models/tabm/tabm_model.py +76 -6
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +6 -4
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +1 -7
- autogluon/tabular/models/tabular_nn/hyperparameters/parameters.py +1 -3
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +2 -1
- autogluon/tabular/models/xgboost/xgboost_model.py +8 -1
- autogluon/tabular/predictor/predictor.py +63 -55
- autogluon/tabular/registry/_ag_model_registry.py +2 -0
- autogluon/tabular/testing/fit_helper.py +28 -0
- autogluon/tabular/version.py +1 -1
- autogluon.tabular-1.4.1b20251128-py3.11-nspkg.pth +1 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/METADATA +87 -71
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/RECORD +39 -33
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/WHEEL +1 -1
- autogluon.tabular-1.4.0-py3.9-nspkg.pth +0 -1
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info/licenses}/LICENSE +0 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info/licenses}/NOTICE +0 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/zip-safe +0 -0
autogluon/tabular/configs/pipeline_presets.py (new file, +130)

```diff
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+import math
+
+from autogluon.core.constants import BINARY, PROBLEM_TYPES
+from autogluon.core.utils.utils import default_holdout_frac
+
+USE_BAG_HOLDOUT_AUTO_THRESHOLD = 1_000_000
+
+
+def _get_validation_preset(num_train_rows: int, hpo_enabled: bool) -> dict[str, int | float]:
+    """Recommended validation preset manually defined by the AutoGluon developers."""
+
+    # -- Default recommendation
+    # max 8 due to 8 cores per CPU being very common.
+    # down to 5 folds for small datasets to have enough samples for a representative validation set.
+    num_bag_folds = min(8, max(5, math.floor(num_train_rows / 10)))
+
+    num_bag_sets = 1  # More repeats do not seem to help due to overfitting on val data.
+    use_bag_holdout = num_train_rows >= USE_BAG_HOLDOUT_AUTO_THRESHOLD
+    holdout_frac = round(default_holdout_frac(num_train_rows=num_train_rows, hyperparameter_tune=hpo_enabled), 4)
+
+    return dict(
+        num_bag_sets=num_bag_sets,
+        num_bag_folds=num_bag_folds,
+        use_bag_holdout=use_bag_holdout,
+        holdout_frac=holdout_frac,
+    )
+
+
+# TODO(refactor): use a data class for the config of the validation method.
+# TODO(improvement): Implement a more sophisticated solution.
+#   Could also use more metadata such as num_features, num_models,
+#   or time_limit for a heuristic.
+#       num_features: The number of features in the dataset.
+#       num_models: The number of models in the portfolio to fit.
+#       time_limit: The time limit for fitting models.
+#   Pointer for non-heuristic approach:
+#       -> meta-learning like Auto-Sklearn 2.0, needs a lot of metadata
+def get_validation_and_stacking_method(
+    # Validation parameters
+    num_bag_folds: int | None,
+    num_bag_sets: int | None,
+    use_bag_holdout: bool | None,
+    holdout_frac: float | None,
+    # Stacking/Pipeline parameters
+    auto_stack: bool,
+    num_stack_levels: int | None,
+    dynamic_stacking: bool | None,
+    refit_full: bool | None,
+    # Metadata
+    num_train_rows: int,
+    problem_type: PROBLEM_TYPES,
+    hpo_enabled: bool,
+) -> tuple[int, int, int, bool, bool, float, bool]:
+    """Get the validation method for AutoGluon via a heuristic.
+
+    Input variables are `None` if they were not specified by the user or have an explicit default.
+
+    Parameters
+    ----------
+    num_bag_folds: int | None
+        The number of folds for cross-validation.
+    num_bag_sets: int | None
+        The number of repeats for cross-validation.
+    use_bag_holdout: bool | None
+        Whether to use (additional) holdout validation.
+    holdout_frac: float | None
+        The fraction of data to holdout for validation.
+    auto_stack: bool
+        Whether to automatically determine the stacking method.
+    num_stack_levels: int | None
+        The number of stacking levels.
+    dynamic_stacking: bool | None
+        Whether to use dynamic stacking.
+    refit_full: bool
+        Whether to refit the full training dataset.
+    num_train_rows: int
+        The number of rows in the training dataset.
+    problem_type: PROBLEM_TYPES
+        The type of problem to solve.
+    hpo_enabled: bool
+        If True, HPO is enabled during the run of AutoGluon.
+
+    Returns:
+    --------
+    Returns all variables needed to define the validation method.
+    """
+
+    cv_preset = _get_validation_preset(num_train_rows=num_train_rows, hpo_enabled=hpo_enabled)
+
+    # Independent of `auto_stack`
+    if use_bag_holdout is None:
+        use_bag_holdout = cv_preset["use_bag_holdout"]
+    if holdout_frac is None:
+        holdout_frac = cv_preset["holdout_frac"]
+    if dynamic_stacking is None:
+        dynamic_stacking = not use_bag_holdout
+    if refit_full is None:
+        refit_full = False
+
+    # Changed by `auto_stack`
+    if num_bag_folds is None:
+        # `num_bag_folds == 0` -> only use holdout validation
+        num_bag_folds = cv_preset["num_bag_folds"] if auto_stack else 0
+    if num_bag_sets is None:
+        # `num_bag_sets == 1` -> no repeats
+        num_bag_sets = cv_preset["num_bag_sets"] if auto_stack else 1
+    if num_stack_levels is None:
+        # Disable multi-layer stacking by default
+        num_stack_levels = 0
+
+        # Activate multi-layer stacking for `auto_stack` if
+        if auto_stack and (
+            dynamic_stacking  # -> We use dynamic stacking
+            or
+            # -> We have holdout validation or a non-binary problem with more than 750 training rows
+            ((use_bag_holdout or (problem_type != BINARY)) and (num_train_rows >= 750))
+        ):
+            num_stack_levels = 1
+
+    return (
+        num_bag_folds,
+        num_bag_sets,
+        num_stack_levels,
+        dynamic_stacking,
+        use_bag_holdout,
+        holdout_frac,
+        refit_full,
+    )
```
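The fold-count formula and the holdout threshold above are easy to sanity-check in isolation. Below is a minimal sketch that reproduces just the arithmetic of `_get_validation_preset`; the `holdout_frac` part is omitted because it delegates to `default_holdout_frac`, whose definition lives in autogluon.core and is not part of this diff:

```python
import math

USE_BAG_HOLDOUT_AUTO_THRESHOLD = 1_000_000  # same constant as in the new file

def sketch_validation_preset(num_train_rows: int) -> dict:
    # Fold count clipped to [5, 8]: roughly one fold per 10 rows for tiny datasets,
    # capped at 8 because 8-core CPUs are common.
    num_bag_folds = min(8, max(5, math.floor(num_train_rows / 10)))
    return {
        "num_bag_folds": num_bag_folds,
        "num_bag_sets": 1,  # repeats disabled: they tend to overfit the validation data
        "use_bag_holdout": num_train_rows >= USE_BAG_HOLDOUT_AUTO_THRESHOLD,
    }

for n in (40, 10_000, 2_000_000):
    print(n, sketch_validation_preset(n))
# 40        -> 5 folds, no bag holdout
# 10000     -> 8 folds, no bag holdout
# 2000000   -> 8 folds, bag holdout enabled
```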
autogluon/tabular/configs/presets_configs.py

```diff
@@ -6,7 +6,6 @@ tabular_presets_dict = dict(
     best_quality={
         "auto_stack": True,
         "dynamic_stacking": "auto",
-        "num_bag_sets": 1,
         "hyperparameters": "zeroshot",
         "time_limit": 3600,
     },
@@ -16,7 +15,6 @@ tabular_presets_dict = dict(
     high_quality={
         "auto_stack": True,
         "dynamic_stacking": "auto",
-        "num_bag_sets": 1,
         "hyperparameters": "zeroshot",
         "time_limit": 3600,
         "refit_full": True,
@@ -29,7 +27,6 @@ tabular_presets_dict = dict(
     good_quality={
         "auto_stack": True,
         "dynamic_stacking": "auto",
-        "num_bag_sets": 1,
         "hyperparameters": "light",
         "time_limit": 3600,
         "refit_full": True,
```
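For context, these preset dictionaries are what `TabularPredictor.fit` expands when a preset name is passed; after this change `num_bag_sets` is left unset so the new heuristic in pipeline_presets.py can fill it in. A typical invocation (standard AutoGluon API; `train_data` is assumed to be a caller-supplied DataFrame with a `class` label column):

```python
from autogluon.tabular import TabularPredictor

# "best_quality" no longer pins num_bag_sets; get_validation_and_stacking_method
# now derives it (along with folds and holdout settings) from the training data size.
predictor = TabularPredictor(label="class").fit(train_data, presets="best_quality")
```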
autogluon/tabular/models/__init__.py

```diff
@@ -3,6 +3,7 @@ from autogluon.core.models.abstract.abstract_model import AbstractModel
 from .automm.automm_model import MultiModalPredictorModel
 from .automm.ft_transformer import FTTransformerModel
 from .catboost.catboost_model import CatBoostModel
+from .ebm.ebm_model import EBMModel
 from .fastainn.tabular_nn_fastai import NNFastAiTabularModel
 from .fasttext.fasttext_model import FastTextModel
 from .image_prediction.image_predictor import ImagePredictorModel
```
autogluon/tabular/models/catboost/catboost_model.py

```diff
@@ -39,6 +39,7 @@ class CatBoostModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         SOFTCLASS: 60
     })
+    seed_name = "random_seed"

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -48,7 +49,6 @@ class CatBoostModel(AbstractModel):
         default_params = get_param_baseline(problem_type=self.problem_type)
         for param, val in default_params.items():
             self._set_default_param_value(param, val)
-        self._set_default_param_value("random_seed", 0)  # Remove randomness for reproducibility
         # Set 'allow_writing_files' to True in order to keep log files created by catboost during training (these will be saved in the directory where AutoGluon stores this model)
         self._set_default_param_value("allow_writing_files", False)  # Disables creation of catboost logging files during training by default
         if self.problem_type != SOFTCLASS:  # TODO: remove this after catboost 0.24
@@ -126,6 +126,7 @@ class CatBoostModel(AbstractModel):

         ag_params = self._get_ag_params()
         params = self._get_model_params()
+
         params["thread_count"] = num_cpus
         if self.problem_type == SOFTCLASS:
             # FIXME: This is extremely slow due to unoptimized metric / objective sent to CatBoost
@@ -310,6 +311,8 @@ class CatBoostModel(AbstractModel):
         max_memory_iters = math.floor(available_mem * max_memory_proportion / mem_usage_per_iter)

         final_iters = min(default_iters, min(max_memory_iters, estimated_iters_in_time))
+        if final_iters < 1:
+            raise TimeLimitExceeded
         return final_iters

     def _predict_proba(self, X, **kwargs):
```

autogluon/tabular/models/ebm/__init__.py: added as an empty file (no content to diff).
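The second CatBoost change guards against a degenerate iteration budget: when memory and time constraints squeeze the allowed iterations below 1, the fit now aborts instead of training a useless 0-iteration booster. A minimal sketch of that logic, with names following the diff (the stand-in `TimeLimitExceeded` is an assumption for self-containment; the real exception comes from autogluon.core):

```python
import math

class TimeLimitExceeded(Exception):
    """Stand-in for autogluon.core's TimeLimitExceeded."""

def cap_iterations(default_iters: int, available_mem: float, max_memory_proportion: float,
                   mem_usage_per_iter: float, estimated_iters_in_time: int) -> int:
    # Iterations that fit within the memory budget.
    max_memory_iters = math.floor(available_mem * max_memory_proportion / mem_usage_per_iter)
    # Take the most restrictive of the default, memory, and time budgets.
    final_iters = min(default_iters, max_memory_iters, estimated_iters_in_time)
    if final_iters < 1:
        raise TimeLimitExceeded  # new behavior: fail fast rather than fit 0 iterations
    return final_iters
```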
autogluon/tabular/models/ebm/ebm_model.py (new file, +259)

```diff
@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+import time
+import warnings
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+from autogluon.core.models import AbstractModel
+
+from .hyperparameters.parameters import get_param_baseline
+from .hyperparameters.searchspaces import get_default_searchspace
+
+if TYPE_CHECKING:
+    from autogluon.core.metrics import Scorer
+
+
+class EbmCallback:
+    """Time limit callback for EBM."""
+
+    def __init__(self, seconds: float):
+        self.seconds = seconds
+        self.end_time: float | None = None
+
+    def __call__(self, *args, **kwargs):
+        if self.end_time is None:
+            self.end_time = time.monotonic() + self.seconds
+            return False
+        return time.monotonic() > self.end_time
+
+
+class EBMModel(AbstractModel):
+    """
+    The Explainable Boosting Machine (EBM) is a glass-box generalized additive model
+    with automatic interaction detection (https://interpret.ml/docs). EBMs are
+    designed to be highly interpretable while achieving accuracy comparable to
+    black-box models on a wide range of tabular datasets.
+
+    Requires the 'interpret' or 'interpret-core' package. Install via:
+
+        pip install interpret
+
+    Paper: InterpretML: A Unified Framework for Machine Learning Interpretability
+
+    Authors: H. Nori, S. Jenkins, P. Koch, and R. Caruana, 2019
+
+    Codebase: https://github.com/interpretml/interpret
+
+    License: MIT
+
+    .. versionadded:: 1.5.0
+    """
+
+    ag_key = "EBM"
+    ag_name = "EBM"
+    ag_priority = 35
+    seed_name = "random_state"
+
+    def _fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        X_val: pd.DataFrame | None = None,
+        y_val: pd.Series | None = None,
+        time_limit: float | None = None,
+        sample_weight: np.ndarray | None = None,
+        sample_weight_val: np.ndarray | None = None,
+        num_cpus: int | str = "auto",
+        **kwargs,
+    ):
+        # Preprocess data.
+        X = self.preprocess(X)
+        if X_val is not None:
+            X_val = self.preprocess(X_val)
+
+        features = self._features
+        if features is None:
+            features = X.columns
+
+        params = construct_ebm_params(
+            self.problem_type,
+            self._get_model_params(),
+            features,
+            self.stopping_metric,
+            num_cpus,
+            time_limit,
+        )
+
+        # Init Class
+        model_cls = get_class_from_problem_type(self.problem_type)
+        self.model = model_cls(**params)
+
+        # Handle validation data format for EBM
+        fit_X = X
+        fit_y = y
+        fit_sample_weight = sample_weight
+        bags = None
+        if X_val is not None:
+            fit_X = pd.concat([X, X_val], ignore_index=True)
+            fit_y = pd.concat([y, y_val], ignore_index=True)
+            if sample_weight is not None:
+                fit_sample_weight = np.hstack([sample_weight, sample_weight_val])
+            bags = np.full((len(fit_X), 1), 1, np.int8)
+            bags[len(X) :, 0] = -1
+
+        with warnings.catch_warnings():  # try to filter joblib warnings
+            warnings.filterwarnings(
+                "ignore",
+                category=UserWarning,
+                message=".*resource_tracker: process died.*",
+            )
+            self.model.fit(fit_X, fit_y, sample_weight=fit_sample_weight, bags=bags)
+
+    def _set_default_params(self):
+        default_params = get_param_baseline(problem_type=self.problem_type, num_classes=self.num_classes)
+        for param, val in default_params.items():
+            self._set_default_param_value(param, val)
+
+    def _get_default_searchspace(self):
+        return get_default_searchspace(problem_type=self.problem_type, num_classes=self.num_classes)
+
+    def _get_default_auxiliary_params(self) -> dict:
+        default_auxiliary_params = super()._get_default_auxiliary_params()
+        extra_auxiliary_params = {
+            "valid_raw_types": ["int", "float", "category"],
+        }
+        default_auxiliary_params.update(extra_auxiliary_params)
+        return default_auxiliary_params
+
+    @classmethod
+    def supported_problem_types(cls) -> list[str] | None:
+        return ["binary", "multiclass", "regression"]
+
+    @classmethod
+    def _class_tags(cls) -> dict:
+        return {"can_estimate_memory_usage_static": True}
+
+    def _more_tags(self) -> dict:
+        """EBMs support refit full."""
+        return {"can_refit_full": True}
+
+    def _estimate_memory_usage(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs) -> int:
+        return self.estimate_memory_usage_static(
+            X=X,
+            y=y,
+            hyperparameters=self._get_model_params(),
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            features=self._features,
+            **kwargs,
+        )
+
+    @classmethod
+    def _estimate_memory_usage_static(
+        cls,
+        *,
+        X: pd.DataFrame,
+        y: pd.Series | None = None,
+        hyperparameters: dict | None = None,
+        problem_type: str = "infer",
+        num_classes: int = 1,
+        features=None,
+        **kwargs,
+    ) -> int:
+        """Returns the expected peak memory usage in bytes of the EBM model during fit."""
+        # TODO: we can improve the memory estimate slightly by using num_classes if y is None
+
+        if features is None:
+            features = X.columns
+
+        model_cls = get_class_from_problem_type(problem_type)
+        params = construct_ebm_params(problem_type, hyperparameters, features)
+        baseline_memory_bytes = 400_000_000  # 400 MB baseline memory
+
+        # assuming we call pd.concat([X, X_val], ignore_index=True), then X size will be doubled
+        return baseline_memory_bytes + model_cls(**params).estimate_mem(
+            X, y, data_multiplier=2.0
+        )
+
+    def _validate_fit_memory_usage(self, mem_error_threshold: float = 1, **kwargs):
+        # Given the good mem estimates with overhead, we set the threshold to 1.
+        return super()._validate_fit_memory_usage(
+            mem_error_threshold=mem_error_threshold, **kwargs
+        )
+
+
```
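The `bags` array built in `_fit` above is how interpret's EBM receives a fixed train/validation split: a single "outer bag" column in which, as the diff uses it, `1` marks training rows and `-1` marks validation rows. A standalone illustration of just that marking, with hypothetical sizes:

```python
import numpy as np
import pandas as pd

# Hypothetical 6-row train set and 2-row validation set.
X = pd.DataFrame({"a": range(6)})
X_val = pd.DataFrame({"a": range(6, 8)})

fit_X = pd.concat([X, X_val], ignore_index=True)
bags = np.full((len(fit_X), 1), 1, np.int8)  # one outer bag; all rows start as training rows
bags[len(X):, 0] = -1                        # rows appended from X_val become validation rows

print(bags.ravel())  # [ 1  1  1  1  1  1 -1 -1]
```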
(autogluon/tabular/models/ebm/ebm_model.py, continued)

```diff
+def construct_ebm_params(
+    problem_type,
+    hyperparameters=None,
+    features=None,
+    stopping_metric=None,
+    num_cpus=-1,
+    time_limit=None,
+):
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    hyperparameters = hyperparameters.copy()  # we pop values below, so copy.
+
+    # The user can specify nominal and continuous columns.
+    continuous_columns = hyperparameters.pop("continuous_columns", [])
+    nominal_columns = hyperparameters.pop("nominal_columns", [])
+
+    feature_types = None
+    if features is not None:
+        feature_types = []
+        for c in features:
+            if c in continuous_columns:
+                f_type = "continuous"
+            elif c in nominal_columns:
+                f_type = "nominal"
+            else:
+                f_type = "auto"
+            feature_types.append(f_type)
+
+    # Default parameters for EBM
+    params = {
+        "outer_bags": 1,  # AutoGluon ensemble creates outer bags, no need for this overhead.
+        "n_jobs": 1,  # EBM only parallelizes across outer bags currently, so ignore num_cpus
+        "feature_names": features,
+        "feature_types": feature_types,
+    }
+    if stopping_metric is not None:
+        params["objective"] = get_metric_from_ag_metric(
+            metric=stopping_metric, problem_type=problem_type
+        )
+    if time_limit is not None:
+        params["callback"] = EbmCallback(time_limit)
+
+    params.update(hyperparameters)
+    return params
+
+
+def get_class_from_problem_type(problem_type: str):
+    if problem_type in [BINARY, MULTICLASS]:
+        from interpret.glassbox import ExplainableBoostingClassifier
+
+        model_cls = ExplainableBoostingClassifier
+    elif problem_type == REGRESSION:
+        from interpret.glassbox import ExplainableBoostingRegressor
+
+        model_cls = ExplainableBoostingRegressor
+    else:
+        raise ValueError(f"Unsupported problem type: {problem_type}")
+    return model_cls
+
+
+def get_metric_from_ag_metric(*, metric: Scorer, problem_type: str):
+    """Map AutoGluon metric to EBM metric for early stopping."""
+    if problem_type in [BINARY, MULTICLASS]:
+        metric_class = "log_loss"
+    elif problem_type == REGRESSION:
+        metric_class = "rmse"
+    else:
+        raise AssertionError(f"EBM does not support {problem_type} problem type.")
+
+    return metric_class
```

autogluon/tabular/models/ebm/hyperparameters/__init__.py: added as an empty file (no content to diff).
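Once registered (see the models/__init__.py import above and the _ag_model_registry.py change in the file list), the new model should be reachable through its `ag_key`. A sketch of opting into it, assuming a `train_data` DataFrame with a `label` column:

```python
from autogluon.tabular import TabularPredictor

# "EBM" is the ag_key defined above; an empty dict requests the
# defaults supplied by get_param_baseline().
predictor = TabularPredictor(label="label").fit(
    train_data,
    hyperparameters={"EBM": {}},
)
```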
autogluon/tabular/models/ebm/hyperparameters/parameters.py (new file, +39)

```diff
@@ -0,0 +1,39 @@
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION, SOFTCLASS
+
+def get_param_baseline(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_param_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == SOFTCLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_param_regression_baseline()
+    else:
+        return get_param_binary_baseline()
+
+
+def get_base_params():
+    base_params = {}
+    return base_params
+
+
+def get_param_binary_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_multiclass_baseline(num_classes):
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_regression_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
```
autogluon/tabular/models/ebm/hyperparameters/searchspaces.py (new file, +72)

```diff
@@ -0,0 +1,72 @@
+"""Default hyperparameter search spaces used in EBM model"""
+
+from autogluon.common import space
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+
+def get_default_searchspace(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_searchspace_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_searchspace_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_searchspace_regression_baseline()
+    else:
+        return get_searchspace_binary_baseline()
+
+
+def get_base_searchspace():
+    base_params = {
+        "max_leaves": space.Int(2, 3, default=2),
+        "smoothing_rounds": space.Int(0, 1000, default=200),
+        "learning_rate": space.Real(0.0025, 0.2, default=0.02, log=True),
+        "interactions": space.Categorical(
+            0,
+            "0.5x",
+            "1x",
+            "1.5x",
+            "2x",
+            "2.5x",
+            "3x",
+            "3.5x",
+            "4x",
+            "4.5x",
+            "5x",
+            "6x",
+            "7x",
+            "8x",
+            "9x",
+            "10x",
+            "15x",
+            "20x",
+            "25x",
+        ),
+        "interaction_smoothing_rounds": space.Int(0, 200, default=90),
+        "min_hessian": space.Real(1e-10, 1e-2, default=1e-4, log=True),
+        "min_samples_leaf": space.Int(2, 20, default=4),
+        "gain_scale": space.Real(0.5, 5.0, default=5.0, log=True),
+        "min_cat_samples": space.Int(5, 20, default=10),
+        "cat_smooth": space.Real(5.0, 100.0, default=10.0, log=True),
+        "missing": space.Categorical("separate", "low", "high", "gain"),
+    }
+    return base_params
+
+
+def get_searchspace_multiclass_baseline(num_classes):
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_binary_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_regression_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
```
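These spaces are what `EBMModel._get_default_searchspace` returns during hyperparameter tuning. The same `space` primitives can also be passed explicitly; a sketch, again assuming a `train_data` DataFrame with a `label` column:

```python
from autogluon.common import space
from autogluon.tabular import TabularPredictor

predictor = TabularPredictor(label="label").fit(
    train_data,
    hyperparameters={"EBM": {
        # Override two of the default spaces shown above.
        "learning_rate": space.Real(0.0025, 0.2, default=0.02, log=True),
        "max_leaves": space.Int(2, 3, default=2),
    }},
    hyperparameter_tune_kwargs="auto",  # enables HPO over the declared spaces
)
```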
autogluon/tabular/models/fastainn/tabular_nn_fastai.py

```diff
@@ -103,6 +103,7 @@ class NNFastAiTabularModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         MULTICLASS: 95,
     })
+    seed_name = "random_seed"

     model_internals_file_name = "model-internals.pkl"

@@ -322,8 +323,9 @@ class NNFastAiTabularModel(AbstractModel):
         # Make deterministic
         from fastai.torch_core import set_seed

-
-
+        random_seed = params.pop(self.seed_name, self.default_random_seed)
+        set_seed(random_seed, True)
+        dls.rng.seed(random_seed)

         if self.problem_type == QUANTILE:
             dls.c = len(self.quantile_levels)
```

(The two bare `-` lines reflect removed lines whose content is not rendered in the source diff view.)
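The fastai change follows the same `seed_name` convention added to CatBoostModel above: the seed is popped out of the model's params so it is not forwarded to the underlying learner, with a model-level default as fallback. In isolation (the `default_random_seed` fallback value here is an assumption; the diff only shows the attribute being read):

```python
params = {"bs": 256, "random_seed": 42}  # hypothetical fastai model params

seed_name = "random_seed"
default_random_seed = 0  # assumed fallback

random_seed = params.pop(seed_name, default_random_seed)
print(random_seed)  # 42
print(params)       # {'bs': 256} -- the seed is consumed, not passed through to fastai
```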
autogluon/tabular/models/knn/knn_model.py

```diff
@@ -214,7 +214,7 @@ class KNNModel(AbstractModel):
         def sample_func(chunk, frac):
             # Guarantee at least 1 sample (otherwise log_loss would crash or model would return different column counts in pred_proba)
             n = max(math.ceil(len(chunk) * frac), 1)
-            return chunk.sample(n=n, replace=False, random_state=
+            return chunk.sample(n=n, replace=False, random_state=self.random_seed)

         if self.problem_type != REGRESSION:
             y_df = y.to_frame(name="label").reset_index(drop=True)
@@ -255,9 +255,13 @@ class KNNModel(AbstractModel):
         self._X_unused_index = [i for i in range(num_rows_max) if i not in idx]
         return self.model

-    def _get_maximum_resources(self) ->
+    def _get_maximum_resources(self) -> dict[str, int | float]:
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020
-
+        # no GPU support
+        return {
+            "num_cpus": 32,
+            "num_gpus": 0,
+        }

     def _get_default_resources(self):
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020
```

(The `-` lines that end abruptly or are empty reflect old-side content truncated in the source diff view; the section itself ends mid-hunk.)