autogluon.tabular 1.5.0b20251228__py3-none-any.whl → 1.5.1b20260116__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (135)
  1. autogluon/tabular/__init__.py +1 -0
  2. autogluon/tabular/configs/config_helper.py +18 -6
  3. autogluon/tabular/configs/feature_generator_presets.py +3 -1
  4. autogluon/tabular/configs/hyperparameter_configs.py +42 -9
  5. autogluon/tabular/configs/presets_configs.py +38 -14
  6. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +84 -14
  7. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +48 -48
  8. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_cpu_2025_12_18.py +774 -1
  9. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_gpu_2025_12_18.py +421 -1
  10. autogluon/tabular/experimental/_scikit_mixin.py +6 -2
  11. autogluon/tabular/experimental/_tabular_classifier.py +3 -1
  12. autogluon/tabular/experimental/_tabular_regressor.py +3 -1
  13. autogluon/tabular/experimental/plot_leaderboard.py +73 -19
  14. autogluon/tabular/learner/abstract_learner.py +160 -42
  15. autogluon/tabular/learner/default_learner.py +78 -22
  16. autogluon/tabular/models/__init__.py +2 -2
  17. autogluon/tabular/models/_utils/rapids_utils.py +3 -1
  18. autogluon/tabular/models/abstract/abstract_torch_model.py +2 -0
  19. autogluon/tabular/models/automm/automm_model.py +12 -3
  20. autogluon/tabular/models/automm/ft_transformer.py +5 -1
  21. autogluon/tabular/models/catboost/callbacks.py +2 -2
  22. autogluon/tabular/models/catboost/catboost_model.py +93 -29
  23. autogluon/tabular/models/catboost/catboost_softclass_utils.py +4 -1
  24. autogluon/tabular/models/catboost/catboost_utils.py +3 -1
  25. autogluon/tabular/models/ebm/ebm_model.py +8 -13
  26. autogluon/tabular/models/ebm/hyperparameters/parameters.py +1 -0
  27. autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +1 -0
  28. autogluon/tabular/models/fastainn/callbacks.py +20 -3
  29. autogluon/tabular/models/fastainn/hyperparameters/searchspaces.py +11 -1
  30. autogluon/tabular/models/fastainn/quantile_helpers.py +10 -2
  31. autogluon/tabular/models/fastainn/tabular_nn_fastai.py +65 -18
  32. autogluon/tabular/models/fasttext/fasttext_model.py +3 -1
  33. autogluon/tabular/models/image_prediction/image_predictor.py +7 -2
  34. autogluon/tabular/models/knn/knn_model.py +41 -8
  35. autogluon/tabular/models/lgb/callbacks.py +32 -9
  36. autogluon/tabular/models/lgb/hyperparameters/searchspaces.py +3 -1
  37. autogluon/tabular/models/lgb/lgb_model.py +150 -34
  38. autogluon/tabular/models/lgb/lgb_utils.py +12 -4
  39. autogluon/tabular/models/lr/hyperparameters/searchspaces.py +5 -1
  40. autogluon/tabular/models/lr/lr_model.py +40 -10
  41. autogluon/tabular/models/lr/lr_rapids_model.py +22 -13
  42. autogluon/tabular/models/mitra/_internal/__init__.py +1 -1
  43. autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -1
  44. autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +36 -40
  45. autogluon/tabular/models/mitra/_internal/config/config_run.py +2 -14
  46. autogluon/tabular/models/mitra/_internal/config/enums.py +27 -26
  47. autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -1
  48. autogluon/tabular/models/mitra/_internal/core/callbacks.py +14 -21
  49. autogluon/tabular/models/mitra/_internal/core/get_loss.py +10 -12
  50. autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +17 -32
  51. autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +12 -27
  52. autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +16 -21
  53. autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +130 -111
  54. autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -1
  55. autogluon/tabular/models/mitra/_internal/data/collator.py +30 -26
  56. autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +18 -26
  57. autogluon/tabular/models/mitra/_internal/data/dataset_split.py +10 -7
  58. autogluon/tabular/models/mitra/_internal/data/preprocessor.py +70 -100
  59. autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -1
  60. autogluon/tabular/models/mitra/_internal/models/base.py +7 -10
  61. autogluon/tabular/models/mitra/_internal/models/embedding.py +46 -56
  62. autogluon/tabular/models/mitra/_internal/models/tab2d.py +140 -120
  63. autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -1
  64. autogluon/tabular/models/mitra/_internal/utils/set_seed.py +3 -1
  65. autogluon/tabular/models/mitra/mitra_model.py +16 -11
  66. autogluon/tabular/models/mitra/sklearn_interface.py +178 -162
  67. autogluon/tabular/models/realmlp/realmlp_model.py +28 -15
  68. autogluon/tabular/models/rf/compilers/onnx.py +1 -1
  69. autogluon/tabular/models/rf/rf_model.py +45 -12
  70. autogluon/tabular/models/rf/rf_quantile.py +4 -2
  71. autogluon/tabular/models/tabdpt/tabdpt_model.py +8 -17
  72. autogluon/tabular/models/tabicl/tabicl_model.py +8 -1
  73. autogluon/tabular/models/tabm/_tabm_internal.py +6 -4
  74. autogluon/tabular/models/tabm/rtdl_num_embeddings.py +80 -127
  75. autogluon/tabular/models/tabm/tabm_model.py +8 -4
  76. autogluon/tabular/models/tabm/tabm_reference.py +53 -85
  77. autogluon/tabular/models/tabpfnmix/_internal/core/callbacks.py +7 -16
  78. autogluon/tabular/models/tabpfnmix/_internal/core/collator.py +16 -24
  79. autogluon/tabular/models/tabpfnmix/_internal/core/dataset_split.py +5 -7
  80. autogluon/tabular/models/tabpfnmix/_internal/core/enums.py +0 -2
  81. autogluon/tabular/models/tabpfnmix/_internal/core/get_loss.py +0 -1
  82. autogluon/tabular/models/tabpfnmix/_internal/core/get_optimizer.py +7 -18
  83. autogluon/tabular/models/tabpfnmix/_internal/core/get_scheduler.py +3 -14
  84. autogluon/tabular/models/tabpfnmix/_internal/core/trainer_finetune.py +79 -64
  85. autogluon/tabular/models/tabpfnmix/_internal/core/y_transformer.py +3 -5
  86. autogluon/tabular/models/tabpfnmix/_internal/data/dataset_finetune.py +17 -30
  87. autogluon/tabular/models/tabpfnmix/_internal/data/preprocessor.py +15 -35
  88. autogluon/tabular/models/tabpfnmix/_internal/models/foundation/embedding.py +21 -38
  89. autogluon/tabular/models/tabpfnmix/_internal/models/foundation/foundation_transformer.py +33 -51
  90. autogluon/tabular/models/tabpfnmix/_internal/results/prediction_metrics.py +4 -4
  91. autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_classifier.py +32 -12
  92. autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_regressor.py +32 -13
  93. autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +55 -19
  94. autogluon/tabular/models/tabpfnv2/tabpfnv2_5_model.py +21 -48
  95. autogluon/tabular/models/tabprep/prep_mixin.py +34 -26
  96. autogluon/tabular/models/tabular_nn/compilers/onnx.py +36 -8
  97. autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +130 -36
  98. autogluon/tabular/models/tabular_nn/torch/tabular_torch_dataset.py +8 -4
  99. autogluon/tabular/models/tabular_nn/torch/torch_network_modules.py +26 -5
  100. autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py +41 -24
  101. autogluon/tabular/models/tabular_nn/utils/data_preprocessor.py +33 -8
  102. autogluon/tabular/models/tabular_nn/utils/nn_architecture_utils.py +21 -6
  103. autogluon/tabular/models/xgboost/callbacks.py +9 -3
  104. autogluon/tabular/models/xgboost/xgboost_model.py +59 -11
  105. autogluon/tabular/models/xt/xt_model.py +1 -0
  106. autogluon/tabular/predictor/interpretable_predictor.py +3 -1
  107. autogluon/tabular/predictor/predictor.py +409 -128
  108. autogluon/tabular/registry/__init__.py +1 -1
  109. autogluon/tabular/registry/_ag_model_registry.py +4 -5
  110. autogluon/tabular/registry/_model_registry.py +1 -0
  111. autogluon/tabular/testing/fit_helper.py +55 -15
  112. autogluon/tabular/testing/generate_datasets.py +1 -1
  113. autogluon/tabular/testing/model_fit_helper.py +10 -4
  114. autogluon/tabular/trainer/abstract_trainer.py +644 -230
  115. autogluon/tabular/trainer/auto_trainer.py +19 -8
  116. autogluon/tabular/trainer/model_presets/presets.py +33 -9
  117. autogluon/tabular/trainer/model_presets/presets_distill.py +16 -2
  118. autogluon/tabular/version.py +1 -1
  119. {autogluon_tabular-1.5.0b20251228.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/METADATA +26 -26
  120. {autogluon_tabular-1.5.0b20251228.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/RECORD +127 -135
  121. autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +0 -20
  122. autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +0 -40
  123. autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +0 -201
  124. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +0 -1464
  125. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +0 -747
  126. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +0 -863
  127. autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +0 -106
  128. autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +0 -466
  129. /autogluon.tabular-1.5.0b20251228-py3.11-nspkg.pth → /autogluon.tabular-1.5.1b20260116-py3.11-nspkg.pth +0 -0
  130. {autogluon_tabular-1.5.0b20251228.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/WHEEL +0 -0
  131. {autogluon_tabular-1.5.0b20251228.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/licenses/LICENSE +0 -0
  132. {autogluon_tabular-1.5.0b20251228.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/licenses/NOTICE +0 -0
  133. {autogluon_tabular-1.5.0b20251228.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/namespace_packages.txt +0 -0
  134. {autogluon_tabular-1.5.0b20251228.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/top_level.txt +0 -0
  135. {autogluon_tabular-1.5.0b20251228.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/zip-safe +0 -0
autogluon/tabular/models/lgb/lgb_model.py
@@ -25,7 +25,9 @@ from .hyperparameters.parameters import DEFAULT_NUM_BOOST_ROUND, get_lgb_objecti
 from .hyperparameters.searchspaces import get_default_searchspace
 from .lgb_utils import construct_dataset, train_lgb_model
 
-warnings.filterwarnings("ignore", category=UserWarning, message="Starting from version")  # lightGBM brew libomp warning
+warnings.filterwarnings(
+    "ignore", category=UserWarning, message="Starting from version"
+)  # lightGBM brew libomp warning
 warnings.filterwarnings("ignore", category=FutureWarning, message="Dask dataframe query")  # lightGBM dask-expr warning
 
 logger = logging.getLogger(__name__)
 
@@ -40,12 +42,11 @@ class LGBModel(AbstractModel):
     Extra hyperparameter options:
         ag.early_stop : int, specifies the early stopping rounds. Defaults to an adaptive strategy. Recommended to keep default.
     """
+
     ag_key = "GBM"
     ag_name = "LightGBM"
     ag_priority = 90
-    ag_priority_by_problem_type = MappingProxyType({
-        SOFTCLASS: 100
-    })
+    ag_priority_by_problem_type = MappingProxyType({SOFTCLASS: 100})
     seed_name = "seed"
     seed_name_alt = ["seed_value", "random_seed", "random_state"]
 
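For reference, the ag.early_stop option documented in the hunk above is supplied through the per-model hyperparameter dictionary. A minimal sketch, assuming a placeholder train_data table with a "target" column (neither comes from this diff):

    from autogluon.tabular import TabularPredictor

    # Override the adaptive early-stopping default for LightGBM ("GBM").
    predictor = TabularPredictor(label="target").fit(
        train_data,
        hyperparameters={"GBM": {"ag.early_stop": 50}},
    )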
@@ -53,8 +54,8 @@ class LGBModel(AbstractModel):
         super().__init__(**kwargs)
 
         self._features_internal_map = None
-        self._features_internal_list = None
         self._requires_remap = None
+        self._features_internal_lgbm = None
 
     def _set_default_params(self):
         default_params = get_param_baseline(problem_type=self.problem_type)
@@ -66,10 +67,15 @@
 
     # Use specialized LightGBM metric if available (fast), otherwise use custom func generator
     def _get_stopping_metric_internal(self):
-        stopping_metric = lgb_utils.convert_ag_metric_to_lgbm(ag_metric_name=self.stopping_metric.name, problem_type=self.problem_type)
+        stopping_metric = lgb_utils.convert_ag_metric_to_lgbm(
+            ag_metric_name=self.stopping_metric.name, problem_type=self.problem_type
+        )
         if stopping_metric is None:
             stopping_metric = lgb_utils.func_generator(
-                metric=self.stopping_metric, is_higher_better=True, needs_pred_proba=not self.stopping_metric.needs_pred, problem_type=self.problem_type
+                metric=self.stopping_metric,
+                is_higher_better=True,
+                needs_pred_proba=not self.stopping_metric.needs_pred,
+                problem_type=self.problem_type,
             )
             stopping_metric_name = self.stopping_metric.name
         else:
@@ -78,7 +84,13 @@
 
     def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
         hyperparameters = self._get_model_params()
-        return self.estimate_memory_usage_static(X=X, problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=hyperparameters, **kwargs)
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )
 
     # FIXME: Don't use `hyperparameters.get("max_bins", 255)`, instead get the defaults all at once!
     @classmethod
@@ -142,8 +154,12 @@
         """
         if hyperparameters is None:
             hyperparameters = {}
-        num_classes = num_classes if num_classes else 1  # num_classes could be None after initialization if it's a regression problem
-        data_mem_usage_bytes = data_mem_usage * 5 + data_mem_usage / 4 * num_classes  # TODO: Extremely crude approximation, can be vastly improved
+        num_classes = (
+            num_classes if num_classes else 1
+        )  # num_classes could be None after initialization if it's a regression problem
+        data_mem_usage_bytes = (
+            data_mem_usage * 5 + data_mem_usage / 4 * num_classes
+        )  # TODO: Extremely crude approximation, can be vastly improved
 
         n_trees_per_estimator = num_classes if num_classes > 2 else 1
 
@@ -161,12 +177,27 @@
         mem_size_per_estimator = n_trees_per_estimator * num_leaves * 100  # very rough estimate
         n_estimators = hyperparameters.get("num_boost_round", DEFAULT_NUM_BOOST_ROUND)
         n_estimators_min = min(n_estimators, 5000)
-        mem_size_estimators = n_estimators_min * mem_size_per_estimator  # memory estimate after fitting up to 5000 estimators
+        mem_size_estimators = (
+            n_estimators_min * mem_size_per_estimator
+        )  # memory estimate after fitting up to 5000 estimators
 
         approx_mem_size_req = data_mem_usage_bytes + histogram_mem_usage_bytes + mem_size_estimators
         return int(approx_mem_size_req)
 
-    def _fit(self, X, y, X_val=None, y_val=None, time_limit=None, num_gpus=0, num_cpus=0, sample_weight=None, sample_weight_val=None, verbosity=2, **kwargs):
+    def _fit(
+        self,
+        X,
+        y,
+        X_val=None,
+        y_val=None,
+        time_limit=None,
+        num_gpus=0,
+        num_cpus=0,
+        sample_weight=None,
+        sample_weight_val=None,
+        verbosity=2,
+        **kwargs,
+    ):
         try_import_lightgbm()  # raise helpful error message if LightGBM isn't installed
         start_time = time.time()
         ag_params = self._get_ag_params()
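To make the memory heuristic above concrete, here is a rough walk-through with made-up input sizes; only the multipliers come from the code, the numbers are illustrative, and the histogram term is estimated elsewhere in the method:

    data_mem_usage = 100 * 1024**2  # assume a ~100 MiB training frame
    num_classes = 3                 # assume a 3-class problem
    data_mem_usage_bytes = data_mem_usage * 5 + data_mem_usage / 4 * num_classes  # ~575 MiB
    n_trees_per_estimator = num_classes  # >2 classes -> one tree per class per boosting round
    num_leaves = 31                      # assumed hyperparameter value
    mem_size_per_estimator = n_trees_per_estimator * num_leaves * 100  # ~9.3 KB per round
    mem_size_estimators = min(10_000, 5000) * mem_size_per_estimator   # assume 10k rounds, capped at 5000 -> ~44 MiB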
@@ -192,14 +223,19 @@
         stopping_metric, stopping_metric_name = self._get_stopping_metric_internal()
 
         num_boost_round = params.pop("num_boost_round", DEFAULT_NUM_BOOST_ROUND)
-        dart_retrain = params.pop("dart_retrain", False)  # Whether to retrain the model to get optimal iteration if model is trained in 'dart' mode.
+        dart_retrain = params.pop(
+            "dart_retrain", False
+        )  # Whether to retrain the model to get optimal iteration if model is trained in 'dart' mode.
         if num_gpus != 0:
             if "device" not in params:
                 # TODO: lightgbm must have a special install to support GPU: https://github.com/Microsoft/LightGBM/tree/master/python-package#build-gpu-version
                 # Before enabling GPU, we should add code to detect that GPU-enabled version is installed and that a valid GPU exists.
                 # GPU training heavily alters accuracy, often in a negative manner. We will have to be careful about when to use GPU.
                 params["device"] = "gpu"
-                logger.log(20, f"\tWarning: Training LightGBM with GPU. This may negatively impact model quality compared to CPU training.")
+                logger.log(
+                    20,
+                    f"\tWarning: Training LightGBM with GPU. This may negatively impact model quality compared to CPU training.",
+                )
         logger.log(15, f"\tFitting {num_boost_round} rounds... Hyperparameters: {params}")
 
         if "num_threads" not in params:
@@ -213,7 +249,15 @@
 
         num_rows_train = len(X)
         dataset_train, dataset_val, dataset_test = self.generate_datasets(
-            X=X, y=y, params=params, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, sample_weight=sample_weight, sample_weight_val=sample_weight_val
+            X=X,
+            y=y,
+            params=params,
+            X_val=X_val,
+            y_val=y_val,
+            X_test=X_test,
+            y_test=y_test,
+            sample_weight=sample_weight,
+            sample_weight_val=sample_weight_val,
         )
         gc.collect()
 
@@ -226,7 +270,9 @@
         # TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
         early_stopping_rounds = ag_params.get("early_stop", "adaptive")
         if isinstance(early_stopping_rounds, (str, tuple, list)):
-            early_stopping_rounds = self._get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=early_stopping_rounds)
+            early_stopping_rounds = self._get_early_stopping_rounds(
+                num_rows_train=num_rows_train, strategy=early_stopping_rounds
+            )
         if early_stopping_rounds is None:
             early_stopping_rounds = 999999
         reporter = kwargs.get("reporter", None)
@@ -235,7 +281,7 @@
         if "metric" not in params or params["metric"] == "":
             params["metric"] = train_loss_name
         elif train_loss_name not in params["metric"]:
-            params["metric"] = f'{params["metric"]},{train_loss_name}'
+            params["metric"] = f"{params['metric']},{train_loss_name}"
         # early stopping callback will be added later by QuantileBooster if problem_type==QUANTILE
         early_stopping_callback_kwargs = dict(
             stopping_rounds=early_stopping_rounds,
@@ -315,7 +361,7 @@
         if "metric" not in train_params["params"] or train_params["params"]["metric"] == "":
             train_params["params"]["metric"] = stopping_metric
         elif stopping_metric not in train_params["params"]["metric"]:
-            train_params["params"]["metric"] = f'{stopping_metric},{train_params["params"]["metric"]}'
+            train_params["params"]["metric"] = f"{stopping_metric},{train_params['params']['metric']}"
 
         if self.problem_type == SOFTCLASS:
             train_params["params"]["objective"] = lgb_utils.softclass_lgbobj
@@ -332,7 +378,9 @@
             warnings.filterwarnings("ignore", message="Overriding the parameters from Reference Dataset.")
             warnings.filterwarnings("ignore", message="categorical_column in param dict is overridden.")
             try:
-                self.model = train_lgb_model(early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params)
+                self.model = train_lgb_model(
+                    early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params
+                )
             except LightGBMError:
                 if train_params["params"].get("device", "cpu") not in ["gpu", "cuda"]:
                     raise
@@ -357,7 +405,9 @@
                         "https://github.com/Microsoft/LightGBM/tree/master/python-package#build-cuda-version"
                     )
                     train_params["params"]["device"] = "cpu"
-                    self.model = train_lgb_model(early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params)
+                    self.model = train_lgb_model(
+                        early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params
+                    )
         retrain = False
         if train_params["params"].get("boosting_type", "") == "dart":
             if dataset_val is not None and dart_retrain and (self.model.best_iteration != num_boost_round):
@@ -434,7 +484,73 @@
         else:  # Should this ever happen?
             return y_pred_proba[:, 1]
 
-    def _preprocess_nonadaptive(self, X, is_train=False, **kwargs):
+    @staticmethod
+    def _clean_column_name_for_lgb(column_name):
+        """Clean column names while keeping most semantic meaning."""
+        if not isinstance(column_name, str):
+            return column_name
+        for symbol in ['"', ",", ":", "{", "}", "[", "]"]:
+            column_name = column_name.replace(symbol, "_")
+        return column_name
+
+    @classmethod
+    def _rename_columns(cls, features: list) -> dict:
+        """
+        Generate a deterministic, one-to-one mapping from original feature names to
+        LightGBM-safe, unique column names.
+
+        This method:
+        - Cleans feature names using `_clean_column_name_for_lgb`
+        - Resolves naming collisions by appending numeric suffixes (`_2`, `_3`, ...)
+        - Guarantees that all output column names are unique
+        - Guarantees a strict 1-to-1 mapping between input features and output names
+
+        The mapping is deterministic with respect to input order. If two or more
+        features clean to the same base name, the first occurrence keeps the base
+        name and subsequent occurrences receive incrementing suffixes.
+
+        Parameters
+        ----------
+        features : list
+            List of feature names. All entries must be unique under Python equality
+            semantics (e.g., `"a"` and `"a"` or `1` and `True` are considered duplicates).
+
+        Returns
+        -------
+        dict
+            Mapping from original feature name to a unique, cleaned column name
+            suitable for use in LightGBM.
+
+        Raises
+        ------
+        ValueError
+            If `features` contains duplicate entries, since a dictionary cannot
+            represent a one-to-one mapping in that case.
+
+        """
+        if len(features) != len(set(features)):
+            raise ValueError("features contains duplicates; cannot create 1-to-1 mapping with a dict.")
+
+        unique_features = set()
+        features_map = {}
+        for feature in features:
+            cleaned_feature = cls._clean_column_name_for_lgb(feature)
+
+            unique_feature = cleaned_feature
+            if unique_feature in unique_features:
+                is_unique = False
+                count = 2
+                while not is_unique:
+                    unique_feature = f"{cleaned_feature}_{count}"
+                    if unique_feature not in unique_features:
+                        is_unique = True
+                    else:
+                        count += 1
+            unique_features.add(unique_feature)
+            features_map[feature] = unique_feature
+        return features_map
+
+    def _preprocess_nonadaptive(self, X: pd.DataFrame, is_train: bool = False, **kwargs):
         X = super()._preprocess_nonadaptive(X=X, **kwargs)
 
         if is_train:
@@ -443,21 +559,25 @@
                 if isinstance(column, str):
                     new_column = re.sub(r'[",:{}[\]]', "", column)
                     if new_column != column:
-                        self._features_internal_map = {feature: i for i, feature in enumerate(list(X.columns))}
                         self._requires_remap = True
                         break
             if self._requires_remap:
-                self._features_internal_list = np.array([self._features_internal_map[feature] for feature in list(X.columns)])
-            else:
-                self._features_internal_list = self._features_internal
+                self._features_internal_map = self._rename_columns(features=list(X.columns))
+                self._features_internal_lgbm = [self._features_internal_map[feature] for feature in list(X.columns)]
 
-        if self._requires_remap:
-            X_new = X.copy(deep=False)
-            X_new.columns = self._features_internal_list
-            return X_new
-        else:
+        if not self._requires_remap:
             return X
 
+        X_new = X.copy(deep=False)
+        X_new.columns = self._features_internal_lgbm
+
+        # Update feature metadata
+        if is_train:
+            new_feature_metadata = self._feature_metadata.rename_features(self._features_internal_map)
+            self._preprocess_set_features_internal(X=X_new, feature_metadata=new_feature_metadata)
+
+        return X_new
+
     def generate_datasets(
         self,
         X: DataFrame,
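As an illustration of the renaming behaviour introduced above (expected output inferred from the implementation in this diff, not from package documentation):

    features = ["price,usd", "price usd", "price_usd", "a:b"]
    mapping = LGBModel._rename_columns(features=features)
    # The first entry cleans to "price_usd" and keeps that name; the pre-existing
    # "price_usd" column then collides and receives a numeric suffix; spaces are untouched.
    # {"price,usd": "price_usd", "price usd": "price usd",
    #  "price_usd": "price_usd_2", "a:b": "a_b"}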
@@ -630,10 +750,6 @@
     def supported_problem_types(cls) -> list[str] | None:
         return ["binary", "multiclass", "regression", "quantile", "softclass"]
 
-    @property
-    def _features(self):
-        return self._features_internal_list
-
     def _ag_params(self) -> set:
         return {"early_stop", "generate_curves", "curve_metrics", "use_error_for_curve_metrics"}
 
autogluon/tabular/models/lgb/lgb_utils.py
@@ -104,11 +104,15 @@ def softclass_lgbobj(preds, train_data):
     return grad.flatten("F"), hess.flatten("F")
 
 
-def construct_dataset(x: DataFrame, y: Series, location=None, reference=None, params=None, save=False, weight=None, init_score=None):
+def construct_dataset(
+    x: DataFrame, y: Series, location=None, reference=None, params=None, save=False, weight=None, init_score=None
+):
     try_import_lightgbm()
     import lightgbm as lgb
 
-    dataset = lgb.Dataset(data=x, label=y, reference=reference, free_raw_data=True, params=params, weight=weight, init_score=init_score)
+    dataset = lgb.Dataset(
+        data=x, label=y, reference=reference, free_raw_data=True, params=params, weight=weight, init_score=init_score
+    )
 
     if save:
         assert location is not None
@@ -128,7 +132,9 @@ def train_lgb_model(early_stopping_callback_kwargs=None, **train_params):
 
     if train_params["params"]["objective"] == "quantile":
         quantile_levels = train_params["params"].pop("quantile_levels")
-        booster = QuantileBooster(quantile_levels=quantile_levels, early_stopping_callback_kwargs=early_stopping_callback_kwargs)
+        booster = QuantileBooster(
+            quantile_levels=quantile_levels, early_stopping_callback_kwargs=early_stopping_callback_kwargs
+        )
         return booster.fit(**train_params)
     else:
         return lgb.train(**train_params)
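The QuantileBooster branch above is taken when AutoGluon is fit for quantile regression. A minimal sketch, assuming a placeholder train_data table with a "target" column:

    from autogluon.tabular import TabularPredictor

    predictor = TabularPredictor(
        label="target",
        problem_type="quantile",
        quantile_levels=[0.1, 0.5, 0.9],
    ).fit(train_data, hyperparameters={"GBM": {}})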
@@ -141,7 +147,9 @@ class QuantileBooster:
         if quantile_levels is None:
             raise AssertionError
         if not all(0 < q < 1 for q in quantile_levels):
-            raise AssertionError(f"quantile_levels must fulfill 0 < q < 1, provided quantile_levels: {quantile_levels}")
+            raise AssertionError(
+                f"quantile_levels must fulfill 0 < q < 1, provided quantile_levels: {quantile_levels}"
+            )
 
         self.quantile_levels = quantile_levels
 
autogluon/tabular/models/lr/hyperparameters/searchspaces.py
@@ -2,5 +2,9 @@ from autogluon.common.space import Categorical, Real
 
 
 def get_default_searchspace(problem_type, num_classes=None):
-    spaces = {"C": Real(lower=0.1, upper=1e3, default=1), "proc.skew_threshold": Categorical(0.99, None), "penalty": Categorical("L2", "L1")}
+    spaces = {
+        "C": Real(lower=0.1, upper=1e3, default=1),
+        "proc.skew_threshold": Categorical(0.99, None),
+        "penalty": Categorical("L2", "L1"),
+    }
     return spaces
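This search space is only exercised when hyperparameter tuning is enabled for the linear model. A minimal sketch of triggering that, assuming a placeholder train_data table with a "target" column:

    from autogluon.tabular import TabularPredictor

    predictor = TabularPredictor(label="target").fit(
        train_data,
        hyperparameters={"LR": {}},          # tune only the LinearModel
        hyperparameter_tune_kwargs="auto",   # enables search over the space above
    )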
autogluon/tabular/models/lr/lr_model.py
@@ -40,6 +40,7 @@ class LinearModel(AbstractModel):
 
         'regression': https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge
     """
+
     ag_key = "LR"
     ag_name = "LinearModel"
     ag_priority = 30
@@ -87,7 +88,9 @@
         """Returns dict with keys: : 'continuous', 'skewed', 'onehot', 'embed', 'language', values = ordered list of feature-names falling into each category.
        Each value is a list of feature-names corresponding to columns in original dataframe.
        """
-        continuous_featnames = self._feature_metadata.get_features(valid_raw_types=[R_INT, R_FLOAT], invalid_special_types=[S_BOOL])
+        continuous_featnames = self._feature_metadata.get_features(
+            valid_raw_types=[R_INT, R_FLOAT], invalid_special_types=[S_BOOL]
+        )
         categorical_featnames = self._feature_metadata.get_features(valid_raw_types=[R_CATEGORY, R_OBJECT])
         bool_featnames = self._feature_metadata.get_features(required_special_types=[S_BOOL])
         language_featnames = []  # TODO: Disabled currently, have to pass raw text data features here to function properly
@@ -125,7 +128,10 @@
                     (
                         "vectorizer",
                         TfidfVectorizer(
-                            ngram_range=self.params["proc.ngram_range"], sublinear_tf=True, max_features=vect_max_features, tokenizer=self._tokenize
+                            ngram_range=self.params["proc.ngram_range"],
+                            sublinear_tf=True,
+                            max_features=vect_max_features,
+                            tokenizer=self._tokenize,
                         ),
                     ),
                 ]
@@ -139,7 +145,12 @@
             )
             transformer_list.append(("cats", pipeline, feature_types["onehot"]))
         if feature_types.get("continuous", None):
-            pipeline = Pipeline(steps=[("imputer", SimpleImputer(strategy=self.params["proc.impute_strategy"])), ("scaler", StandardScaler())])
+            pipeline = Pipeline(
+                steps=[
+                    ("imputer", SimpleImputer(strategy=self.params["proc.impute_strategy"])),
+                    ("scaler", StandardScaler()),
+                ]
+            )
             transformer_list.append(("cont", pipeline, feature_types["continuous"]))
         if feature_types.get("bool", None):
             pipeline = Pipeline(steps=[("scaler", StandardScaler())])
@@ -148,7 +159,10 @@
             pipeline = Pipeline(
                 steps=[
                     ("imputer", SimpleImputer(strategy=self.params["proc.impute_strategy"])),
-                    ("quantile", QuantileTransformer(output_distribution="normal")),  # Or output_distribution = 'uniform'
+                    (
+                        "quantile",
+                        QuantileTransformer(output_distribution="normal"),
+                    ),  # Or output_distribution = 'uniform'
                 ]
             )
             transformer_list.append(("skew", pipeline, feature_types["skewed"]))
@@ -227,7 +241,9 @@
             if time_to_train_cur_max_iter > time_left_train:
                 cur_max_iter = min(int(time_left_train / time_per_iter) - 1, cur_max_iter)
                 if cur_max_iter <= 0:
-                    logger.warning(f"\tEarly stopping due to lack of time remaining. Fit {total_iter}/{total_max_iter} iters...")
+                    logger.warning(
+                        f"\tEarly stopping due to lack of time remaining. Fit {total_iter}/{total_max_iter} iters..."
+                    )
                     break
                 early_stop = True
 
@@ -251,13 +267,17 @@
             total_iter_used += model.max_iter
             if early_stop:
                 if total_iter_used == total_iter:  # Not yet converged
-                    logger.warning(f"\tEarly stopping due to lack of time remaining. Fit {total_iter}/{total_max_iter} iters...")
+                    logger.warning(
+                        f"\tEarly stopping due to lack of time remaining. Fit {total_iter}/{total_max_iter} iters..."
+                    )
                 break
 
         self.model = model
         self.params_trained["max_iter"] = total_iter
 
-    def _select_features_handle_text_include(self, df, categorical_featnames, language_featnames, continuous_featnames, bool_featnames):
+    def _select_features_handle_text_include(
+        self, df, categorical_featnames, language_featnames, continuous_featnames, bool_featnames
+    ):
         types_of_features = dict()
         types_of_features.update(self._select_continuous(df, continuous_featnames))
         types_of_features.update(self._select_bool(df, bool_featnames))
@@ -265,12 +285,16 @@
         types_of_features.update(self._select_text(df, language_featnames))
         return types_of_features
 
-    def _select_features_handle_text_only(self, df, categorical_featnames, language_featnames, continuous_featnames, bool_featnames):
+    def _select_features_handle_text_only(
+        self, df, categorical_featnames, language_featnames, continuous_featnames, bool_featnames
+    ):
         types_of_features = dict()
         types_of_features.update(self._select_text(df, language_featnames))
         return types_of_features
 
-    def _select_features_handle_text_ignore(self, df, categorical_featnames, language_featnames, continuous_featnames, bool_featnames):
+    def _select_features_handle_text_ignore(
+        self, df, categorical_featnames, language_featnames, continuous_featnames, bool_featnames
+    ):
         types_of_features = dict()
         types_of_features.update(self._select_continuous(df, continuous_featnames))
         types_of_features.update(self._select_bool(df, bool_featnames))
@@ -309,7 +333,13 @@
 
     def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
         hyperparameters = self._get_model_params()
-        return self.estimate_memory_usage_static(X=X, problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=hyperparameters, **kwargs)
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )
 
     @classmethod
     def _estimate_memory_usage_static(
autogluon/tabular/models/lr/lr_rapids_model.py
@@ -49,7 +49,7 @@ class LinearRapidsModel(RapidsModelMixin, LinearModel):
 
     def _preprocess(self, X, **kwargs):
         X = super()._preprocess(X=X, **kwargs)
-        if hasattr(X, 'toarray'):  # Check if it's a sparse matrix
+        if hasattr(X, "toarray"):  # Check if it's a sparse matrix
             X = X.toarray()
         return X
 
@@ -60,7 +60,7 @@ class LinearRapidsModel(RapidsModelMixin, LinearModel):
         """
         # Preprocess data
         X = self.preprocess(X, is_train=True)
-        if self.problem_type == 'binary':
+        if self.problem_type == "binary":
             y = y.astype(int).values
 
         # Create cuML model with filtered parameters
@@ -69,28 +69,37 @@
         # Comprehensive parameter filtering for cuML compatibility
         cuml_incompatible_params = {
             # AutoGluon-specific preprocessing parameters
-            'vectorizer_dict_size', 'proc.ngram_range', 'proc.skew_threshold',
-            'proc.impute_strategy', 'handle_text',
+            "vectorizer_dict_size",
+            "proc.ngram_range",
+            "proc.skew_threshold",
+            "proc.impute_strategy",
+            "handle_text",
             # sklearn-specific parameters not supported by cuML
-            'n_jobs', 'warm_start', 'multi_class', 'dual', 'intercept_scaling',
-            'class_weight', 'random_state', 'verbose',
+            "n_jobs",
+            "warm_start",
+            "multi_class",
+            "dual",
+            "intercept_scaling",
+            "class_weight",
+            "random_state",
+            "verbose",
             # Parameters that need conversion or special handling
-            'penalty', 'C'
+            "penalty",
+            "C",
         }
 
         # Filter out incompatible parameters
-        filtered_params = {k: v for k, v in self.params.items()
-                           if k not in cuml_incompatible_params}
+        filtered_params = {k: v for k, v in self.params.items() if k not in cuml_incompatible_params}
 
         # Handle parameter conversions for cuML
         if self.problem_type == REGRESSION:
             # Convert sklearn's C parameter to cuML's alpha
-            if 'C' in self.params:
-                filtered_params['alpha'] = 1.0 / self.params['C']
+            if "C" in self.params:
+                filtered_params["alpha"] = 1.0 / self.params["C"]
         else:
             # For classification, keep C parameter
-            if 'C' in self.params:
-                filtered_params['C'] = self.params['C']
+            if "C" in self.params:
+                filtered_params["C"] = self.params["C"]
 
         # Create and fit cuML model - let cuML handle its own error messages
         self.model = model_cls(**filtered_params)
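For context on the conversion above: sklearn-style estimators expose an inverse regularization strength C, while the Ridge-style cuML regressor expects alpha, so the code assumes the usual reciprocal relationship:

    C = 0.5
    alpha = 1.0 / C  # -> 2.0; smaller C means stronger regularization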
autogluon/tabular/models/mitra/_internal/__init__.py
@@ -1 +1 @@
-# Internal modules for MitraModel
+# Internal modules for MitraModel

autogluon/tabular/models/mitra/_internal/config/__init__.py
@@ -1 +1 @@
-# Configuration modules for MitraModel
+# Configuration modules for MitraModel