autogluon.tabular 1.5.1b20260105__py3-none-any.whl → 1.5.1b20260117__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of autogluon.tabular might be problematic.
- autogluon/tabular/__init__.py +1 -0
- autogluon/tabular/configs/config_helper.py +18 -6
- autogluon/tabular/configs/feature_generator_presets.py +3 -1
- autogluon/tabular/configs/hyperparameter_configs.py +42 -9
- autogluon/tabular/configs/presets_configs.py +38 -14
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +84 -14
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +48 -48
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_cpu_2025_12_18.py +774 -1
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_gpu_2025_12_18.py +421 -1
- autogluon/tabular/experimental/_scikit_mixin.py +6 -2
- autogluon/tabular/experimental/_tabular_classifier.py +3 -1
- autogluon/tabular/experimental/_tabular_regressor.py +3 -1
- autogluon/tabular/experimental/plot_leaderboard.py +73 -19
- autogluon/tabular/learner/abstract_learner.py +160 -42
- autogluon/tabular/learner/default_learner.py +78 -22
- autogluon/tabular/models/__init__.py +2 -2
- autogluon/tabular/models/_utils/rapids_utils.py +3 -1
- autogluon/tabular/models/abstract/abstract_torch_model.py +2 -0
- autogluon/tabular/models/automm/automm_model.py +12 -3
- autogluon/tabular/models/automm/ft_transformer.py +5 -1
- autogluon/tabular/models/catboost/callbacks.py +2 -2
- autogluon/tabular/models/catboost/catboost_model.py +93 -29
- autogluon/tabular/models/catboost/catboost_softclass_utils.py +4 -1
- autogluon/tabular/models/catboost/catboost_utils.py +3 -1
- autogluon/tabular/models/ebm/ebm_model.py +8 -13
- autogluon/tabular/models/ebm/hyperparameters/parameters.py +1 -0
- autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +1 -0
- autogluon/tabular/models/fastainn/callbacks.py +20 -3
- autogluon/tabular/models/fastainn/hyperparameters/searchspaces.py +11 -1
- autogluon/tabular/models/fastainn/quantile_helpers.py +10 -2
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +65 -18
- autogluon/tabular/models/fasttext/fasttext_model.py +3 -1
- autogluon/tabular/models/image_prediction/image_predictor.py +7 -2
- autogluon/tabular/models/knn/knn_model.py +41 -8
- autogluon/tabular/models/lgb/callbacks.py +32 -9
- autogluon/tabular/models/lgb/hyperparameters/searchspaces.py +3 -1
- autogluon/tabular/models/lgb/lgb_model.py +150 -34
- autogluon/tabular/models/lgb/lgb_utils.py +12 -4
- autogluon/tabular/models/lr/hyperparameters/searchspaces.py +5 -1
- autogluon/tabular/models/lr/lr_model.py +40 -10
- autogluon/tabular/models/lr/lr_rapids_model.py +22 -13
- autogluon/tabular/models/mitra/_internal/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +36 -40
- autogluon/tabular/models/mitra/_internal/config/config_run.py +2 -14
- autogluon/tabular/models/mitra/_internal/config/enums.py +27 -26
- autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/core/callbacks.py +14 -21
- autogluon/tabular/models/mitra/_internal/core/get_loss.py +10 -12
- autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +17 -32
- autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +12 -27
- autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +16 -21
- autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +130 -111
- autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/data/collator.py +30 -26
- autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +18 -26
- autogluon/tabular/models/mitra/_internal/data/dataset_split.py +10 -7
- autogluon/tabular/models/mitra/_internal/data/preprocessor.py +70 -100
- autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/models/base.py +7 -10
- autogluon/tabular/models/mitra/_internal/models/embedding.py +46 -56
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +140 -120
- autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/utils/set_seed.py +3 -1
- autogluon/tabular/models/mitra/mitra_model.py +16 -11
- autogluon/tabular/models/mitra/sklearn_interface.py +178 -162
- autogluon/tabular/models/realmlp/realmlp_model.py +28 -15
- autogluon/tabular/models/rf/compilers/onnx.py +1 -1
- autogluon/tabular/models/rf/rf_model.py +45 -12
- autogluon/tabular/models/rf/rf_quantile.py +4 -2
- autogluon/tabular/models/tabdpt/tabdpt_model.py +8 -17
- autogluon/tabular/models/tabicl/tabicl_model.py +8 -1
- autogluon/tabular/models/tabm/_tabm_internal.py +6 -4
- autogluon/tabular/models/tabm/rtdl_num_embeddings.py +80 -127
- autogluon/tabular/models/tabm/tabm_model.py +8 -4
- autogluon/tabular/models/tabm/tabm_reference.py +53 -85
- autogluon/tabular/models/tabpfnmix/_internal/core/callbacks.py +7 -16
- autogluon/tabular/models/tabpfnmix/_internal/core/collator.py +16 -24
- autogluon/tabular/models/tabpfnmix/_internal/core/dataset_split.py +5 -7
- autogluon/tabular/models/tabpfnmix/_internal/core/enums.py +0 -2
- autogluon/tabular/models/tabpfnmix/_internal/core/get_loss.py +0 -1
- autogluon/tabular/models/tabpfnmix/_internal/core/get_optimizer.py +7 -18
- autogluon/tabular/models/tabpfnmix/_internal/core/get_scheduler.py +3 -14
- autogluon/tabular/models/tabpfnmix/_internal/core/trainer_finetune.py +79 -64
- autogluon/tabular/models/tabpfnmix/_internal/core/y_transformer.py +3 -5
- autogluon/tabular/models/tabpfnmix/_internal/data/dataset_finetune.py +17 -30
- autogluon/tabular/models/tabpfnmix/_internal/data/preprocessor.py +15 -35
- autogluon/tabular/models/tabpfnmix/_internal/models/foundation/embedding.py +21 -38
- autogluon/tabular/models/tabpfnmix/_internal/models/foundation/foundation_transformer.py +33 -51
- autogluon/tabular/models/tabpfnmix/_internal/results/prediction_metrics.py +4 -4
- autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_classifier.py +32 -12
- autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_regressor.py +32 -13
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +55 -19
- autogluon/tabular/models/tabpfnv2/tabpfnv2_5_model.py +21 -48
- autogluon/tabular/models/tabprep/prep_mixin.py +34 -26
- autogluon/tabular/models/tabular_nn/compilers/onnx.py +36 -8
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +130 -36
- autogluon/tabular/models/tabular_nn/torch/tabular_torch_dataset.py +8 -4
- autogluon/tabular/models/tabular_nn/torch/torch_network_modules.py +26 -5
- autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py +41 -24
- autogluon/tabular/models/tabular_nn/utils/data_preprocessor.py +33 -8
- autogluon/tabular/models/tabular_nn/utils/nn_architecture_utils.py +21 -6
- autogluon/tabular/models/xgboost/callbacks.py +9 -3
- autogluon/tabular/models/xgboost/xgboost_model.py +59 -11
- autogluon/tabular/models/xt/xt_model.py +1 -0
- autogluon/tabular/predictor/interpretable_predictor.py +3 -1
- autogluon/tabular/predictor/predictor.py +409 -128
- autogluon/tabular/registry/__init__.py +1 -1
- autogluon/tabular/registry/_ag_model_registry.py +4 -5
- autogluon/tabular/registry/_model_registry.py +1 -0
- autogluon/tabular/testing/fit_helper.py +55 -15
- autogluon/tabular/testing/generate_datasets.py +1 -1
- autogluon/tabular/testing/model_fit_helper.py +10 -4
- autogluon/tabular/trainer/abstract_trainer.py +644 -230
- autogluon/tabular/trainer/auto_trainer.py +19 -8
- autogluon/tabular/trainer/model_presets/presets.py +33 -9
- autogluon/tabular/trainer/model_presets/presets_distill.py +16 -2
- autogluon/tabular/version.py +1 -1
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/METADATA +27 -27
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/RECORD +127 -135
- autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +0 -20
- autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +0 -40
- autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +0 -201
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +0 -1464
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +0 -747
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +0 -863
- autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +0 -106
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +0 -466
- /autogluon.tabular-1.5.1b20260105-py3.11-nspkg.pth → /autogluon.tabular-1.5.1b20260117-py3.11-nspkg.pth +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/WHEEL +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/licenses/LICENSE +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/licenses/NOTICE +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/namespace_packages.txt +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/top_level.txt +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/zip-safe +0 -0
autogluon/tabular/models/lgb/lgb_model.py

@@ -25,7 +25,9 @@ from .hyperparameters.parameters import DEFAULT_NUM_BOOST_ROUND, get_lgb_objecti
 from .hyperparameters.searchspaces import get_default_searchspace
 from .lgb_utils import construct_dataset, train_lgb_model
 
-warnings.filterwarnings(
+warnings.filterwarnings(
+    "ignore", category=UserWarning, message="Starting from version"
+) # lightGBM brew libomp warning
 warnings.filterwarnings("ignore", category=FutureWarning, message="Dask dataframe query") # lightGBM dask-expr warning
 logger = logging.getLogger(__name__)
 
@@ -40,12 +42,11 @@ class LGBModel(AbstractModel):
     Extra hyperparameter options:
         ag.early_stop : int, specifies the early stopping rounds. Defaults to an adaptive strategy. Recommended to keep default.
     """
+
     ag_key = "GBM"
     ag_name = "LightGBM"
     ag_priority = 90
-    ag_priority_by_problem_type = MappingProxyType({
-        SOFTCLASS: 100
-    })
+    ag_priority_by_problem_type = MappingProxyType({SOFTCLASS: 100})
     seed_name = "seed"
     seed_name_alt = ["seed_value", "random_seed", "random_state"]
 

@@ -53,8 +54,8 @@ class LGBModel(AbstractModel):
         super().__init__(**kwargs)
 
         self._features_internal_map = None
-        self._features_internal_list = None
         self._requires_remap = None
+        self._features_internal_lgbm = None
 
     def _set_default_params(self):
         default_params = get_param_baseline(problem_type=self.problem_type)
@@ -66,10 +67,15 @@ class LGBModel(AbstractModel):
 
     # Use specialized LightGBM metric if available (fast), otherwise use custom func generator
     def _get_stopping_metric_internal(self):
-        stopping_metric = lgb_utils.convert_ag_metric_to_lgbm(
+        stopping_metric = lgb_utils.convert_ag_metric_to_lgbm(
+            ag_metric_name=self.stopping_metric.name, problem_type=self.problem_type
+        )
         if stopping_metric is None:
             stopping_metric = lgb_utils.func_generator(
-                metric=self.stopping_metric,
+                metric=self.stopping_metric,
+                is_higher_better=True,
+                needs_pred_proba=not self.stopping_metric.needs_pred,
+                problem_type=self.problem_type,
             )
             stopping_metric_name = self.stopping_metric.name
         else:
@@ -78,7 +84,13 @@ class LGBModel(AbstractModel):
 
     def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
         hyperparameters = self._get_model_params()
-        return self.estimate_memory_usage_static(
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )
 
     # FIXME: Don't use `hyperparameters.get("max_bins", 255)`, instead get the defaults all at once!
     @classmethod

@@ -142,8 +154,12 @@ class LGBModel(AbstractModel):
         """
         if hyperparameters is None:
             hyperparameters = {}
-        num_classes =
-
+        num_classes = (
+            num_classes if num_classes else 1
+        ) # num_classes could be None after initialization if it's a regression problem
+        data_mem_usage_bytes = (
+            data_mem_usage * 5 + data_mem_usage / 4 * num_classes
+        ) # TODO: Extremely crude approximation, can be vastly improved
 
         n_trees_per_estimator = num_classes if num_classes > 2 else 1
 
@@ -161,12 +177,27 @@ class LGBModel(AbstractModel):
         mem_size_per_estimator = n_trees_per_estimator * num_leaves * 100 # very rough estimate
         n_estimators = hyperparameters.get("num_boost_round", DEFAULT_NUM_BOOST_ROUND)
         n_estimators_min = min(n_estimators, 5000)
-        mem_size_estimators =
+        mem_size_estimators = (
+            n_estimators_min * mem_size_per_estimator
+        ) # memory estimate after fitting up to 5000 estimators
 
         approx_mem_size_req = data_mem_usage_bytes + histogram_mem_usage_bytes + mem_size_estimators
         return int(approx_mem_size_req)
 
-    def _fit(
+    def _fit(
+        self,
+        X,
+        y,
+        X_val=None,
+        y_val=None,
+        time_limit=None,
+        num_gpus=0,
+        num_cpus=0,
+        sample_weight=None,
+        sample_weight_val=None,
+        verbosity=2,
+        **kwargs,
+    ):
         try_import_lightgbm() # raise helpful error message if LightGBM isn't installed
         start_time = time.time()
         ag_params = self._get_ag_params()
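Taken together, the two wrapped estimates above keep the same crude heuristic: roughly five copies of the training data plus a quarter-copy per class, plus a per-tree term capped at 5000 boosting rounds. A minimal standalone sketch with made-up numbers (not taken from the package) illustrates the arithmetic:

    data_mem_usage = 100 * 1024**2  # hypothetical: 100 MiB of training data in memory
    num_classes = 3                 # hypothetical multiclass problem; regression uses 1
    num_leaves = 31
    n_estimators_min = 5000         # num_boost_round capped at 5000

    data_mem_usage_bytes = data_mem_usage * 5 + data_mem_usage / 4 * num_classes
    # -> about 575 MiB attributed to the data

    n_trees_per_estimator = num_classes if num_classes > 2 else 1
    mem_size_per_estimator = n_trees_per_estimator * num_leaves * 100  # ~9.3 KB per boosting round
    mem_size_estimators = n_estimators_min * mem_size_per_estimator    # ~46.5 MB for the booster

The final approx_mem_size_req then adds histogram_mem_usage_bytes, which is computed earlier in the same method and is not shown in this hunk.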
@@ -192,14 +223,19 @@ class LGBModel(AbstractModel):
         stopping_metric, stopping_metric_name = self._get_stopping_metric_internal()
 
         num_boost_round = params.pop("num_boost_round", DEFAULT_NUM_BOOST_ROUND)
-        dart_retrain = params.pop(
+        dart_retrain = params.pop(
+            "dart_retrain", False
+        ) # Whether to retrain the model to get optimal iteration if model is trained in 'dart' mode.
         if num_gpus != 0:
             if "device" not in params:
                 # TODO: lightgbm must have a special install to support GPU: https://github.com/Microsoft/LightGBM/tree/master/python-package#build-gpu-version
                 # Before enabling GPU, we should add code to detect that GPU-enabled version is installed and that a valid GPU exists.
                 # GPU training heavily alters accuracy, often in a negative manner. We will have to be careful about when to use GPU.
                 params["device"] = "gpu"
-                logger.log(
+                logger.log(
+                    20,
+                    f"\tWarning: Training LightGBM with GPU. This may negatively impact model quality compared to CPU training.",
+                )
         logger.log(15, f"\tFitting {num_boost_round} rounds... Hyperparameters: {params}")
 
         if "num_threads" not in params:

@@ -213,7 +249,15 @@ class LGBModel(AbstractModel):
 
         num_rows_train = len(X)
         dataset_train, dataset_val, dataset_test = self.generate_datasets(
-            X=X,
+            X=X,
+            y=y,
+            params=params,
+            X_val=X_val,
+            y_val=y_val,
+            X_test=X_test,
+            y_test=y_test,
+            sample_weight=sample_weight,
+            sample_weight_val=sample_weight_val,
         )
         gc.collect()
 

@@ -226,7 +270,9 @@ class LGBModel(AbstractModel):
         # TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
         early_stopping_rounds = ag_params.get("early_stop", "adaptive")
         if isinstance(early_stopping_rounds, (str, tuple, list)):
-            early_stopping_rounds = self._get_early_stopping_rounds(
+            early_stopping_rounds = self._get_early_stopping_rounds(
+                num_rows_train=num_rows_train, strategy=early_stopping_rounds
+            )
         if early_stopping_rounds is None:
             early_stopping_rounds = 999999
         reporter = kwargs.get("reporter", None)

@@ -235,7 +281,7 @@ class LGBModel(AbstractModel):
         if "metric" not in params or params["metric"] == "":
             params["metric"] = train_loss_name
         elif train_loss_name not in params["metric"]:
-            params["metric"] = f
+            params["metric"] = f"{params['metric']},{train_loss_name}"
         # early stopping callback will be added later by QuantileBooster if problem_type==QUANTILE
         early_stopping_callback_kwargs = dict(
             stopping_rounds=early_stopping_rounds,

@@ -315,7 +361,7 @@ class LGBModel(AbstractModel):
         if "metric" not in train_params["params"] or train_params["params"]["metric"] == "":
             train_params["params"]["metric"] = stopping_metric
         elif stopping_metric not in train_params["params"]["metric"]:
-            train_params["params"]["metric"] = f
+            train_params["params"]["metric"] = f"{stopping_metric},{train_params['params']['metric']}"
 
         if self.problem_type == SOFTCLASS:
             train_params["params"]["objective"] = lgb_utils.softclass_lgbobj

@@ -332,7 +378,9 @@ class LGBModel(AbstractModel):
             warnings.filterwarnings("ignore", message="Overriding the parameters from Reference Dataset.")
             warnings.filterwarnings("ignore", message="categorical_column in param dict is overridden.")
             try:
-                self.model = train_lgb_model(
+                self.model = train_lgb_model(
+                    early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params
+                )
             except LightGBMError:
                 if train_params["params"].get("device", "cpu") not in ["gpu", "cuda"]:
                     raise

@@ -357,7 +405,9 @@ class LGBModel(AbstractModel):
                         "https://github.com/Microsoft/LightGBM/tree/master/python-package#build-cuda-version"
                     )
                     train_params["params"]["device"] = "cpu"
-                    self.model = train_lgb_model(
+                    self.model = train_lgb_model(
+                        early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params
+                    )
                 retrain = False
                 if train_params["params"].get("boosting_type", "") == "dart":
                     if dataset_val is not None and dart_retrain and (self.model.best_iteration != num_boost_round):
@@ -434,7 +484,73 @@ class LGBModel(AbstractModel):
         else: # Should this ever happen?
             return y_pred_proba[:, 1]
 
-
+    @staticmethod
+    def _clean_column_name_for_lgb(column_name):
+        """Clean column names while keeping most semantic meaning."""
+        if not isinstance(column_name, str):
+            return column_name
+        for symbol in ['"', ",", ":", "{", "}", "[", "]"]:
+            column_name = column_name.replace(symbol, "_")
+        return column_name
+
+    @classmethod
+    def _rename_columns(cls, features: list) -> dict:
+        """
+        Generate a deterministic, one-to-one mapping from original feature names to
+        LightGBM-safe, unique column names.
+
+        This method:
+        - Cleans feature names using `_clean_column_name_for_lgb`
+        - Resolves naming collisions by appending numeric suffixes (`_2`, `_3`, ...)
+        - Guarantees that all output column names are unique
+        - Guarantees a strict 1-to-1 mapping between input features and output names
+
+        The mapping is deterministic with respect to input order. If two or more
+        features clean to the same base name, the first occurrence keeps the base
+        name and subsequent occurrences receive incrementing suffixes.
+
+        Parameters
+        ----------
+        features : list
+            List of feature names. All entries must be unique under Python equality
+            semantics (e.g., `"a"` and `"a"` or `1` and `True` are considered duplicates).
+
+        Returns
+        -------
+        dict
+            Mapping from original feature name to a unique, cleaned column name
+            suitable for use in LightGBM.
+
+        Raises
+        ------
+        ValueError
+            If `features` contains duplicate entries, since a dictionary cannot
+            represent a one-to-one mapping in that case.
+
+        """
+        if len(features) != len(set(features)):
+            raise ValueError("features contains duplicates; cannot create 1-to-1 mapping with a dict.")
+
+        unique_features = set()
+        features_map = {}
+        for feature in features:
+            cleaned_feature = cls._clean_column_name_for_lgb(feature)
+
+            unique_feature = cleaned_feature
+            if unique_feature in unique_features:
+                is_unique = False
+                count = 2
+                while not is_unique:
+                    unique_feature = f"{cleaned_feature}_{count}"
+                    if unique_feature not in unique_features:
+                        is_unique = True
+                    else:
+                        count += 1
+            unique_features.add(unique_feature)
+            features_map[feature] = unique_feature
+        return features_map
+
+    def _preprocess_nonadaptive(self, X: pd.DataFrame, is_train: bool = False, **kwargs):
         X = super()._preprocess_nonadaptive(X=X, **kwargs)
 
         if is_train:
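As an illustration of the collision handling described in the docstring above (expected behavior inferred from the added code; the column names are made up):

    LGBModel._rename_columns(["a,b", "a:b", "a_b"])
    # "a,b" and "a:b" both clean to "a_b"; the first occurrence keeps the base name,
    # the second receives the "_2" suffix, and the pre-existing "a_b" moves on to "_3":
    # {"a,b": "a_b", "a:b": "a_b_2", "a_b": "a_b_3"}

    LGBModel._rename_columns(["a", "a"])  # duplicate inputs -> ValueError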
@@ -443,21 +559,25 @@ class LGBModel(AbstractModel):
                 if isinstance(column, str):
                     new_column = re.sub(r'[",:{}[\]]', "", column)
                     if new_column != column:
-                        self._features_internal_map = {feature: i for i, feature in enumerate(list(X.columns))}
                         self._requires_remap = True
                         break
             if self._requires_remap:
-                self.
-
-                self._features_internal_list = self._features_internal
+                self._features_internal_map = self._rename_columns(features=list(X.columns))
+                self._features_internal_lgbm = [self._features_internal_map[feature] for feature in list(X.columns)]
 
-        if self._requires_remap:
-            X_new = X.copy(deep=False)
-            X_new.columns = self._features_internal_list
-            return X_new
-        else:
+        if not self._requires_remap:
             return X
 
+        X_new = X.copy(deep=False)
+        X_new.columns = self._features_internal_lgbm
+
+        # Update feature metadata
+        if is_train:
+            new_feature_metadata = self._feature_metadata.rename_features(self._features_internal_map)
+            self._preprocess_set_features_internal(X=X_new, feature_metadata=new_feature_metadata)
+
+        return X_new
+
     def generate_datasets(
         self,
         X: DataFrame,

@@ -630,10 +750,6 @@ class LGBModel(AbstractModel):
     def supported_problem_types(cls) -> list[str] | None:
         return ["binary", "multiclass", "regression", "quantile", "softclass"]
 
-    @property
-    def _features(self):
-        return self._features_internal_list
-
     def _ag_params(self) -> set:
         return {"early_stop", "generate_curves", "curve_metrics", "use_error_for_curve_metrics"}
 
autogluon/tabular/models/lgb/lgb_utils.py

@@ -104,11 +104,15 @@ def softclass_lgbobj(preds, train_data):
     return grad.flatten("F"), hess.flatten("F")
 
 
-def construct_dataset(
+def construct_dataset(
+    x: DataFrame, y: Series, location=None, reference=None, params=None, save=False, weight=None, init_score=None
+):
     try_import_lightgbm()
     import lightgbm as lgb
 
-    dataset = lgb.Dataset(
+    dataset = lgb.Dataset(
+        data=x, label=y, reference=reference, free_raw_data=True, params=params, weight=weight, init_score=init_score
+    )
 
     if save:
         assert location is not None
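For reference, the wrapped lgb.Dataset call above follows standard LightGBM usage; a minimal sketch with synthetic data (not part of the package) looks like this:

    import lightgbm as lgb
    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"f1": np.random.rand(100), "f2": np.random.rand(100)})
    y = pd.Series(np.random.randint(0, 2, size=100))

    dataset_train = lgb.Dataset(data=X, label=y, free_raw_data=True)
    # A validation Dataset passes the training Dataset as `reference` so that
    # feature binning is shared between the two.
    dataset_val = lgb.Dataset(data=X.iloc[:20], label=y.iloc[:20], reference=dataset_train)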
@@ -128,7 +132,9 @@ def train_lgb_model(early_stopping_callback_kwargs=None, **train_params):
 
     if train_params["params"]["objective"] == "quantile":
         quantile_levels = train_params["params"].pop("quantile_levels")
-        booster = QuantileBooster(
+        booster = QuantileBooster(
+            quantile_levels=quantile_levels, early_stopping_callback_kwargs=early_stopping_callback_kwargs
+        )
         return booster.fit(**train_params)
     else:
         return lgb.train(**train_params)

@@ -141,7 +147,9 @@ class QuantileBooster:
         if quantile_levels is None:
             raise AssertionError
         if not all(0 < q < 1 for q in quantile_levels):
-            raise AssertionError(
+            raise AssertionError(
+                f"quantile_levels must fulfill 0 < q < 1, provided quantile_levels: {quantile_levels}"
+            )
 
         self.quantile_levels = quantile_levels
 
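The expanded assertion simply reports the provided levels when the 0 < q < 1 requirement is violated, e.g.:

    all(0 < q < 1 for q in [0.1, 0.5, 0.9])  # True  -> accepted
    all(0 < q < 1 for q in [0.0, 0.5, 1.5])  # False -> AssertionError showing the provided quantile_levels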
autogluon/tabular/models/lr/hyperparameters/searchspaces.py

@@ -2,5 +2,9 @@ from autogluon.common.space import Categorical, Real
 
 
 def get_default_searchspace(problem_type, num_classes=None):
-    spaces = {
+    spaces = {
+        "C": Real(lower=0.1, upper=1e3, default=1),
+        "proc.skew_threshold": Categorical(0.99, None),
+        "penalty": Categorical("L2", "L1"),
+    }
     return spaces
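These are the default spaces that hyperparameter tuning samples from for the linear model. As a hedged sketch, an equivalent user-supplied search space would be passed to TabularPredictor like this (train_data and the "target" label column are placeholders, not from the package):

    from autogluon.common.space import Categorical, Real
    from autogluon.tabular import TabularPredictor

    lr_search_space = {
        "C": Real(lower=0.1, upper=1e3, default=1),
        "proc.skew_threshold": Categorical(0.99, None),
        "penalty": Categorical("L2", "L1"),
    }
    predictor = TabularPredictor(label="target").fit(
        train_data,  # placeholder pandas DataFrame containing a "target" column
        hyperparameters={"LR": lr_search_space},
        hyperparameter_tune_kwargs="auto",
    )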
autogluon/tabular/models/lr/lr_model.py

@@ -40,6 +40,7 @@ class LinearModel(AbstractModel):
 
         'regression': https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge
     """
+
     ag_key = "LR"
     ag_name = "LinearModel"
     ag_priority = 30

@@ -87,7 +88,9 @@ class LinearModel(AbstractModel):
         """Returns dict with keys: : 'continuous', 'skewed', 'onehot', 'embed', 'language', values = ordered list of feature-names falling into each category.
        Each value is a list of feature-names corresponding to columns in original dataframe.
         """
-        continuous_featnames = self._feature_metadata.get_features(
+        continuous_featnames = self._feature_metadata.get_features(
+            valid_raw_types=[R_INT, R_FLOAT], invalid_special_types=[S_BOOL]
+        )
         categorical_featnames = self._feature_metadata.get_features(valid_raw_types=[R_CATEGORY, R_OBJECT])
         bool_featnames = self._feature_metadata.get_features(required_special_types=[S_BOOL])
         language_featnames = [] # TODO: Disabled currently, have to pass raw text data features here to function properly

@@ -125,7 +128,10 @@ class LinearModel(AbstractModel):
                 (
                     "vectorizer",
                     TfidfVectorizer(
-                        ngram_range=self.params["proc.ngram_range"],
+                        ngram_range=self.params["proc.ngram_range"],
+                        sublinear_tf=True,
+                        max_features=vect_max_features,
+                        tokenizer=self._tokenize,
                     ),
                 ),
             ]

@@ -139,7 +145,12 @@ class LinearModel(AbstractModel):
             )
             transformer_list.append(("cats", pipeline, feature_types["onehot"]))
         if feature_types.get("continuous", None):
-            pipeline = Pipeline(
+            pipeline = Pipeline(
+                steps=[
+                    ("imputer", SimpleImputer(strategy=self.params["proc.impute_strategy"])),
+                    ("scaler", StandardScaler()),
+                ]
+            )
             transformer_list.append(("cont", pipeline, feature_types["continuous"]))
         if feature_types.get("bool", None):
             pipeline = Pipeline(steps=[("scaler", StandardScaler())])

@@ -148,7 +159,10 @@ class LinearModel(AbstractModel):
             pipeline = Pipeline(
                 steps=[
                     ("imputer", SimpleImputer(strategy=self.params["proc.impute_strategy"])),
-                    (
+                    (
+                        "quantile",
+                        QuantileTransformer(output_distribution="normal"),
+                    ), # Or output_distribution = 'uniform'
                 ]
             )
             transformer_list.append(("skew", pipeline, feature_types["skewed"]))

@@ -227,7 +241,9 @@ class LinearModel(AbstractModel):
                 if time_to_train_cur_max_iter > time_left_train:
                     cur_max_iter = min(int(time_left_train / time_per_iter) - 1, cur_max_iter)
                     if cur_max_iter <= 0:
-                        logger.warning(
+                        logger.warning(
+                            f"\tEarly stopping due to lack of time remaining. Fit {total_iter}/{total_max_iter} iters..."
+                        )
                         break
                     early_stop = True
 

@@ -251,13 +267,17 @@ class LinearModel(AbstractModel):
             total_iter_used += model.max_iter
             if early_stop:
                 if total_iter_used == total_iter: # Not yet converged
-                    logger.warning(
+                    logger.warning(
+                        f"\tEarly stopping due to lack of time remaining. Fit {total_iter}/{total_max_iter} iters..."
+                    )
                     break
 
         self.model = model
         self.params_trained["max_iter"] = total_iter
 
-    def _select_features_handle_text_include(
+    def _select_features_handle_text_include(
+        self, df, categorical_featnames, language_featnames, continuous_featnames, bool_featnames
+    ):
         types_of_features = dict()
         types_of_features.update(self._select_continuous(df, continuous_featnames))
         types_of_features.update(self._select_bool(df, bool_featnames))

@@ -265,12 +285,16 @@ class LinearModel(AbstractModel):
         types_of_features.update(self._select_text(df, language_featnames))
         return types_of_features
 
-    def _select_features_handle_text_only(
+    def _select_features_handle_text_only(
+        self, df, categorical_featnames, language_featnames, continuous_featnames, bool_featnames
+    ):
         types_of_features = dict()
         types_of_features.update(self._select_text(df, language_featnames))
         return types_of_features
 
-    def _select_features_handle_text_ignore(
+    def _select_features_handle_text_ignore(
+        self, df, categorical_featnames, language_featnames, continuous_featnames, bool_featnames
+    ):
         types_of_features = dict()
         types_of_features.update(self._select_continuous(df, continuous_featnames))
         types_of_features.update(self._select_bool(df, bool_featnames))

@@ -309,7 +333,13 @@ class LinearModel(AbstractModel):
 
     def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
         hyperparameters = self._get_model_params()
-        return self.estimate_memory_usage_static(
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )
 
     @classmethod
     def _estimate_memory_usage_static(
autogluon/tabular/models/lr/lr_rapids_model.py

@@ -49,7 +49,7 @@ class LinearRapidsModel(RapidsModelMixin, LinearModel):
 
     def _preprocess(self, X, **kwargs):
         X = super()._preprocess(X=X, **kwargs)
-        if hasattr(X,
+        if hasattr(X, "toarray"): # Check if it's a sparse matrix
             X = X.toarray()
         return X
 
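The completed hasattr check above is a duck-typing test for scipy sparse matrices, presumably produced by the sparse preprocessing steps, since cuML expects dense input; a quick illustration:

    import numpy as np
    from scipy import sparse

    X_sparse = sparse.csr_matrix(np.eye(3))
    hasattr(X_sparse, "toarray")   # True  -> densified with X.toarray() before reaching cuML
    hasattr(np.eye(3), "toarray")  # False -> already dense, passed through unchanged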
@@ -60,7 +60,7 @@ class LinearRapidsModel(RapidsModelMixin, LinearModel):
         """
         # Preprocess data
         X = self.preprocess(X, is_train=True)
-        if self.problem_type ==
+        if self.problem_type == "binary":
             y = y.astype(int).values
 
         # Create cuML model with filtered parameters
@@ -69,28 +69,37 @@ class LinearRapidsModel(RapidsModelMixin, LinearModel):
         # Comprehensive parameter filtering for cuML compatibility
         cuml_incompatible_params = {
             # AutoGluon-specific preprocessing parameters
-
-
+            "vectorizer_dict_size",
+            "proc.ngram_range",
+            "proc.skew_threshold",
+            "proc.impute_strategy",
+            "handle_text",
             # sklearn-specific parameters not supported by cuML
-
-
+            "n_jobs",
+            "warm_start",
+            "multi_class",
+            "dual",
+            "intercept_scaling",
+            "class_weight",
+            "random_state",
+            "verbose",
             # Parameters that need conversion or special handling
-
+            "penalty",
+            "C",
         }
 
         # Filter out incompatible parameters
-        filtered_params = {k: v for k, v in self.params.items()
-                           if k not in cuml_incompatible_params}
+        filtered_params = {k: v for k, v in self.params.items() if k not in cuml_incompatible_params}
 
         # Handle parameter conversions for cuML
         if self.problem_type == REGRESSION:
             # Convert sklearn's C parameter to cuML's alpha
-            if
-                filtered_params[
+            if "C" in self.params:
+                filtered_params["alpha"] = 1.0 / self.params["C"]
         else:
             # For classification, keep C parameter
-            if
-                filtered_params[
+            if "C" in self.params:
+                filtered_params["C"] = self.params["C"]
 
         # Create and fit cuML model - let cuML handle its own error messages
         self.model = model_cls(**filtered_params)
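A small sketch of the filtering and regression-path conversion above, with made-up parameter values (only a subset of the incompatible-parameter set is shown):

    params = {"C": 0.5, "penalty": "L2", "n_jobs": -1, "fit_intercept": True}
    cuml_incompatible_params = {"penalty", "C", "n_jobs"}

    filtered_params = {k: v for k, v in params.items() if k not in cuml_incompatible_params}
    # {"fit_intercept": True}

    filtered_params["alpha"] = 1.0 / params["C"]  # regression path: C=0.5 -> alpha=2.0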
autogluon/tabular/models/mitra/_internal/__init__.py

@@ -1 +1 @@
-# Internal modules for MitraModel
+# Internal modules for MitraModel

autogluon/tabular/models/mitra/_internal/config/__init__.py

@@ -1 +1 @@
-# Configuration modules for MitraModel
+# Configuration modules for MitraModel