autogluon.tabular 1.5.1b20260105__py3-none-any.whl → 1.5.1b20260117__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of autogluon.tabular might be problematic.
- autogluon/tabular/__init__.py +1 -0
- autogluon/tabular/configs/config_helper.py +18 -6
- autogluon/tabular/configs/feature_generator_presets.py +3 -1
- autogluon/tabular/configs/hyperparameter_configs.py +42 -9
- autogluon/tabular/configs/presets_configs.py +38 -14
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +84 -14
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +48 -48
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_cpu_2025_12_18.py +774 -1
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_gpu_2025_12_18.py +421 -1
- autogluon/tabular/experimental/_scikit_mixin.py +6 -2
- autogluon/tabular/experimental/_tabular_classifier.py +3 -1
- autogluon/tabular/experimental/_tabular_regressor.py +3 -1
- autogluon/tabular/experimental/plot_leaderboard.py +73 -19
- autogluon/tabular/learner/abstract_learner.py +160 -42
- autogluon/tabular/learner/default_learner.py +78 -22
- autogluon/tabular/models/__init__.py +2 -2
- autogluon/tabular/models/_utils/rapids_utils.py +3 -1
- autogluon/tabular/models/abstract/abstract_torch_model.py +2 -0
- autogluon/tabular/models/automm/automm_model.py +12 -3
- autogluon/tabular/models/automm/ft_transformer.py +5 -1
- autogluon/tabular/models/catboost/callbacks.py +2 -2
- autogluon/tabular/models/catboost/catboost_model.py +93 -29
- autogluon/tabular/models/catboost/catboost_softclass_utils.py +4 -1
- autogluon/tabular/models/catboost/catboost_utils.py +3 -1
- autogluon/tabular/models/ebm/ebm_model.py +8 -13
- autogluon/tabular/models/ebm/hyperparameters/parameters.py +1 -0
- autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +1 -0
- autogluon/tabular/models/fastainn/callbacks.py +20 -3
- autogluon/tabular/models/fastainn/hyperparameters/searchspaces.py +11 -1
- autogluon/tabular/models/fastainn/quantile_helpers.py +10 -2
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +65 -18
- autogluon/tabular/models/fasttext/fasttext_model.py +3 -1
- autogluon/tabular/models/image_prediction/image_predictor.py +7 -2
- autogluon/tabular/models/knn/knn_model.py +41 -8
- autogluon/tabular/models/lgb/callbacks.py +32 -9
- autogluon/tabular/models/lgb/hyperparameters/searchspaces.py +3 -1
- autogluon/tabular/models/lgb/lgb_model.py +150 -34
- autogluon/tabular/models/lgb/lgb_utils.py +12 -4
- autogluon/tabular/models/lr/hyperparameters/searchspaces.py +5 -1
- autogluon/tabular/models/lr/lr_model.py +40 -10
- autogluon/tabular/models/lr/lr_rapids_model.py +22 -13
- autogluon/tabular/models/mitra/_internal/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +36 -40
- autogluon/tabular/models/mitra/_internal/config/config_run.py +2 -14
- autogluon/tabular/models/mitra/_internal/config/enums.py +27 -26
- autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/core/callbacks.py +14 -21
- autogluon/tabular/models/mitra/_internal/core/get_loss.py +10 -12
- autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +17 -32
- autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +12 -27
- autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +16 -21
- autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +130 -111
- autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/data/collator.py +30 -26
- autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +18 -26
- autogluon/tabular/models/mitra/_internal/data/dataset_split.py +10 -7
- autogluon/tabular/models/mitra/_internal/data/preprocessor.py +70 -100
- autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/models/base.py +7 -10
- autogluon/tabular/models/mitra/_internal/models/embedding.py +46 -56
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +140 -120
- autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/utils/set_seed.py +3 -1
- autogluon/tabular/models/mitra/mitra_model.py +16 -11
- autogluon/tabular/models/mitra/sklearn_interface.py +178 -162
- autogluon/tabular/models/realmlp/realmlp_model.py +28 -15
- autogluon/tabular/models/rf/compilers/onnx.py +1 -1
- autogluon/tabular/models/rf/rf_model.py +45 -12
- autogluon/tabular/models/rf/rf_quantile.py +4 -2
- autogluon/tabular/models/tabdpt/tabdpt_model.py +8 -17
- autogluon/tabular/models/tabicl/tabicl_model.py +8 -1
- autogluon/tabular/models/tabm/_tabm_internal.py +6 -4
- autogluon/tabular/models/tabm/rtdl_num_embeddings.py +80 -127
- autogluon/tabular/models/tabm/tabm_model.py +8 -4
- autogluon/tabular/models/tabm/tabm_reference.py +53 -85
- autogluon/tabular/models/tabpfnmix/_internal/core/callbacks.py +7 -16
- autogluon/tabular/models/tabpfnmix/_internal/core/collator.py +16 -24
- autogluon/tabular/models/tabpfnmix/_internal/core/dataset_split.py +5 -7
- autogluon/tabular/models/tabpfnmix/_internal/core/enums.py +0 -2
- autogluon/tabular/models/tabpfnmix/_internal/core/get_loss.py +0 -1
- autogluon/tabular/models/tabpfnmix/_internal/core/get_optimizer.py +7 -18
- autogluon/tabular/models/tabpfnmix/_internal/core/get_scheduler.py +3 -14
- autogluon/tabular/models/tabpfnmix/_internal/core/trainer_finetune.py +79 -64
- autogluon/tabular/models/tabpfnmix/_internal/core/y_transformer.py +3 -5
- autogluon/tabular/models/tabpfnmix/_internal/data/dataset_finetune.py +17 -30
- autogluon/tabular/models/tabpfnmix/_internal/data/preprocessor.py +15 -35
- autogluon/tabular/models/tabpfnmix/_internal/models/foundation/embedding.py +21 -38
- autogluon/tabular/models/tabpfnmix/_internal/models/foundation/foundation_transformer.py +33 -51
- autogluon/tabular/models/tabpfnmix/_internal/results/prediction_metrics.py +4 -4
- autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_classifier.py +32 -12
- autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_regressor.py +32 -13
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +55 -19
- autogluon/tabular/models/tabpfnv2/tabpfnv2_5_model.py +21 -48
- autogluon/tabular/models/tabprep/prep_mixin.py +34 -26
- autogluon/tabular/models/tabular_nn/compilers/onnx.py +36 -8
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +130 -36
- autogluon/tabular/models/tabular_nn/torch/tabular_torch_dataset.py +8 -4
- autogluon/tabular/models/tabular_nn/torch/torch_network_modules.py +26 -5
- autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py +41 -24
- autogluon/tabular/models/tabular_nn/utils/data_preprocessor.py +33 -8
- autogluon/tabular/models/tabular_nn/utils/nn_architecture_utils.py +21 -6
- autogluon/tabular/models/xgboost/callbacks.py +9 -3
- autogluon/tabular/models/xgboost/xgboost_model.py +59 -11
- autogluon/tabular/models/xt/xt_model.py +1 -0
- autogluon/tabular/predictor/interpretable_predictor.py +3 -1
- autogluon/tabular/predictor/predictor.py +409 -128
- autogluon/tabular/registry/__init__.py +1 -1
- autogluon/tabular/registry/_ag_model_registry.py +4 -5
- autogluon/tabular/registry/_model_registry.py +1 -0
- autogluon/tabular/testing/fit_helper.py +55 -15
- autogluon/tabular/testing/generate_datasets.py +1 -1
- autogluon/tabular/testing/model_fit_helper.py +10 -4
- autogluon/tabular/trainer/abstract_trainer.py +644 -230
- autogluon/tabular/trainer/auto_trainer.py +19 -8
- autogluon/tabular/trainer/model_presets/presets.py +33 -9
- autogluon/tabular/trainer/model_presets/presets_distill.py +16 -2
- autogluon/tabular/version.py +1 -1
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/METADATA +27 -27
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/RECORD +127 -135
- autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +0 -20
- autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +0 -40
- autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +0 -201
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +0 -1464
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +0 -747
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +0 -863
- autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +0 -106
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +0 -466
- /autogluon.tabular-1.5.1b20260105-py3.11-nspkg.pth → /autogluon.tabular-1.5.1b20260117-py3.11-nspkg.pth +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/WHEEL +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/licenses/LICENSE +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/licenses/NOTICE +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/namespace_packages.txt +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/top_level.txt +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/zip-safe +0 -0
autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py

@@ -47,13 +47,16 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
     ag.early_stop : int | str, default = "default"
         Specifies the early stopping rounds. Defaults to an adaptive strategy. Recommended to keep default.
     """
+
     ag_key = "NN_TORCH"
     ag_name = "NeuralNetTorch"
     ag_priority = 25
     seed_name = "seed_value"

     # Constants used throughout this class:
-    unique_category_str = np.nan  # string used to represent missing values and unknown categories for categorical features.
+    unique_category_str = (
+        np.nan
+    )  # string used to represent missing values and unknown categories for categorical features.

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -106,12 +109,16 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
             device = torch.device("cuda")
             logger.log(15, "Training on GPU (CUDA)")
             if num_gpus > 1:
-                logger.warning(f"{self.__class__.__name__} not yet able to use more than 1 GPU. 'num_gpus' is set to >1, but we will be using only 1 GPU.")
-        elif torch.backends.mps.is_available():
+                logger.warning(
+                    f"{self.__class__.__name__} not yet able to use more than 1 GPU. 'num_gpus' is set to >1, but we will be using only 1 GPU."
+                )
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
             device = torch.device("mps")
             logger.log(15, "Training on GPU (MPS - Apple Silicon)")
             if num_gpus > 1:
-                logger.warning(f"{self.__class__.__name__} on Apple Silicon can only use 1 GPU (MPS). 'num_gpus' is set to >1, but we will be using only 1 GPU.")
+                logger.warning(
+                    f"{self.__class__.__name__} on Apple Silicon can only use 1 GPU (MPS). 'num_gpus' is set to >1, but we will be using only 1 GPU."
+                )
         else:
             device = torch.device("cpu")
             logger.log(15, "Training on CPU")
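Beyond the rewrapped warnings, this hunk guards the MPS branch with `hasattr`, so torch builds that predate the `torch.backends.mps` attribute fall through to CPU instead of raising. A minimal sketch of the same CUDA → MPS → CPU cascade; the `pick_device` helper and its `num_gpus` argument are illustrative, not AutoGluon's API:

    import torch

    def pick_device(num_gpus: int) -> torch.device:
        """Select CUDA first, then Apple-Silicon MPS, then CPU (illustrative)."""
        if num_gpus > 0 and torch.cuda.is_available():
            return torch.device("cuda")
        # hasattr() protects torch builds without an mps backend attribute
        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")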
@@ -126,7 +133,9 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         """ Sets dataset-adaptive default values to use for our neural network """
         if self.problem_type in [REGRESSION, QUANTILE]:
             if params["y_range"] is None:
-                params["y_range"] = infer_y_range(y_vals=train_dataset.data_list[train_dataset.label_index], y_range_extend=y_range_extend)
+                params["y_range"] = infer_y_range(
+                    y_vals=train_dataset.data_list[train_dataset.label_index], y_range_extend=y_range_extend
+                )
         return params

     def _get_default_loss_function(self):
@@ -143,7 +152,13 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
     def _prepare_params(params):
         params = params.copy()

-        processor_param_keys = {"proc.embed_min_categories", "proc.impute_strategy", "proc.max_category_levels", "proc.skew_threshold", "use_ngram_features"}
+        processor_param_keys = {
+            "proc.embed_min_categories",
+            "proc.impute_strategy",
+            "proc.max_category_levels",
+            "proc.skew_threshold",
+            "use_ngram_features",
+        }
         processor_kwargs = {k: v for k, v in params.items() if k in processor_param_keys}
         for key in processor_param_keys:
             params.pop(key, None)
@@ -201,14 +216,20 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         torch.manual_seed(seed_value)

         if sample_weight is not None:  # TODO: support
-            logger.log(15, f"sample_weight not yet supported for {self.__class__.__name__}, this model will ignore them in training.")
+            logger.log(
+                15,
+                f"sample_weight not yet supported for {self.__class__.__name__},"
+                " this model will ignore them in training.",
+            )

         if num_cpus is not None:
             self.num_dataloading_workers = max(1, int(num_cpus / 2.0))
         else:
             self.num_dataloading_workers = 1
         if self.num_dataloading_workers == 1:
-            self.num_dataloading_workers = 0  # TODO: verify 0 is typically faster and uses less memory than 1 in pytorch
+            self.num_dataloading_workers = (
+                0  # TODO: verify 0 is typically faster and uses less memory than 1 in pytorch
+            )
         self.num_dataloading_workers = 0  # TODO: >0 crashes on MacOS
         self.max_batch_size = params.pop("max_batch_size", 512)

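For context on the worker-count logic: in PyTorch, `DataLoader(num_workers=0)` loads batches synchronously in the main process, so falling back from 1 worker to 0 trades parallelism nobody benefits from for lower overhead, and the second assignment pins it to 0 because of the macOS crash noted in the TODO. A hedged sketch of the intent (the helper name is illustrative):

    def pick_num_dataloading_workers(num_cpus: int | None) -> int:
        workers = max(1, int(num_cpus / 2.0)) if num_cpus is not None else 1
        if workers == 1:
            workers = 0  # num_workers=0: load in the main process, no worker startup cost
        return workers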
@@ -298,7 +319,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         logging.debug("initialized")
         train_dataloader = train_dataset.build_loader(batch_size, self.num_dataloading_workers, is_test=False)

-        if isinstance(loss_kwargs.get("loss_function", "auto"), str) and loss_kwargs.get("loss_function", "auto") == "auto":
+        if (
+            isinstance(loss_kwargs.get("loss_function", "auto"), str)
+            and loss_kwargs.get("loss_function", "auto") == "auto"
+        ):
             loss_kwargs["loss_function"] = self._get_default_loss_function()
         if epochs_wo_improve is not None:
             early_stopping_method = SimpleES(patience=epochs_wo_improve)
@@ -404,13 +428,18 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

                     # v1 estimate is sensitive to fixed cost overhead at the start of training, such as torch initialization.
                     # v2 fixes this, but we keep both and take the min to avoid potential cases where v2 is inaccurate due to an overly slow batch.
-                    estimated_time_v1 = time_elapsed_epoch / update_cur * num_updates_per_epoch  # Less accurate than v2, but never underestimates time
-                    estimated_time_v2 = time_elapsed_epoch + time_elapsed_batch * (num_updates_per_epoch - update_cur)  # Less likely to overestimate time
+                    estimated_time_v1 = (
+                        time_elapsed_epoch / update_cur * num_updates_per_epoch
+                    )  # Less accurate than v2, but never underestimates time
+                    estimated_time_v2 = time_elapsed_epoch + time_elapsed_batch * (
+                        num_updates_per_epoch - update_cur
+                    )  # Less likely to overestimate time
                     estimated_time = min(estimated_time_v1, estimated_time_v2)
                     if estimated_time > time_limit:
                         logger.log(
                             30,
-                            f"\tNot enough time to train first epoch. (Time Required: {round(estimated_time, 2)}s, Time Left: {round(time_limit, 2)}s)",
+                            f"\tNot enough time to train first epoch. "
+                            f"(Time Required: {round(estimated_time, 2)}s, Time Left: {round(time_limit, 2)}s)",
                         )
                         raise TimeLimitExceeded
                 time_elapsed = time_cur - start_fit_time
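To see why the two estimates differ, here is a worked example with hypothetical numbers: 20 of 100 updates done in 10s total, with the latest batch taking 0.4s:

    time_elapsed_epoch, update_cur = 10.0, 20
    num_updates_per_epoch, time_elapsed_batch = 100, 0.4

    # v1 extrapolates the average update time; start-up overhead inflates it.
    estimated_time_v1 = time_elapsed_epoch / update_cur * num_updates_per_epoch  # 50.0s
    # v2 assumes the remaining updates run at the latest batch speed.
    estimated_time_v2 = time_elapsed_epoch + time_elapsed_batch * (num_updates_per_epoch - update_cur)  # 42.0s
    estimated_time = min(estimated_time_v1, estimated_time_v2)  # 42.0s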
@@ -421,7 +450,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
                         f"\tNot enough time to train first epoch. Stopped on Update {total_updates} (Epoch {epoch}))",
                     )
                     raise TimeLimitExceeded
-                logger.log(15, f"\tRan out of time, stopping training early. (Stopped on Update {total_updates} (Epoch {epoch}))")
+                logger.log(
+                    15,
+                    f"\tRan out of time, stopping training early. (Stopped on Update {total_updates} (Epoch {epoch}))",
+                )
                 do_update = False
                 break

@@ -464,10 +496,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
                     is_best = True
                     best_val_metric = val_metric
                     io_buffer = io.BytesIO()
-                    torch.save(self.model, io_buffer)
+                    torch.save(self.model.state_dict(), io_buffer)
                     best_epoch = epoch
                     best_val_update = total_updates
-                early_stop = early_stopping_method.update(cur_round=epoch-1, is_best=is_best)
+                early_stop = early_stopping_method.update(cur_round=epoch - 1, is_best=is_best)
                 if verbose_eval:
                     logger.log(
                         15,
@@ -514,10 +546,13 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

         # revert back to best model
         if val_dataset is not None:
-            logger.log(15, f"Best model found on Epoch {best_epoch} (Update {best_val_update}). Val {self.stopping_metric.name}: {best_val_metric}")
+            logger.log(
+                15,
+                f"Best model found on Epoch {best_epoch} (Update {best_val_update}). Val {self.stopping_metric.name}: {best_val_metric}",
+            )
             if io_buffer is not None:
                 io_buffer.seek(0)
-                self.model = torch.load(io_buffer)
+                self.model.load_state_dict(torch.load(io_buffer, weights_only=True))
         else:
             logger.log(15, f"Best model found on Epoch {best_epoch} (Update {best_val_update}).")
         self.params_trained["batch_size"] = batch_size
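The substantive change in this hunk and the matching save-side hunk above is the checkpoint format: instead of pickling the whole module with `torch.save(self.model, ...)` and reloading it with a plain `torch.load`, only the weights are serialized, and `weights_only=True` restricts unpickling to tensor data rather than arbitrary objects. A self-contained sketch of the pattern, with a toy model standing in for the trained network:

    import io
    import torch
    from torch import nn

    model = nn.Linear(4, 2)  # stand-in for the trained network

    # checkpoint only the weights of the best epoch into an in-memory buffer
    io_buffer = io.BytesIO()
    torch.save(model.state_dict(), io_buffer)

    # ... training continues and the weights change ...

    # revert to the best checkpoint; weights_only=True refuses arbitrary pickles
    io_buffer.seek(0)
    model.load_state_dict(torch.load(io_buffer, weights_only=True))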
@@ -530,7 +565,9 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         elif isinstance(ag_early_stop, str) and ag_early_stop == "default":
             early_stopping_method = self._get_early_stop_default()
         elif isinstance(ag_early_stop, (str, tuple, list)):
-            early_stopping_rounds = self._get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=ag_early_stop)
+            early_stopping_rounds = self._get_early_stopping_rounds(
+                num_rows_train=num_rows_train, strategy=ag_early_stop
+            )
             early_stopping_method = early_stopping_rounds[0](**early_stopping_rounds[1])
         elif isinstance(ag_early_stop, int):
             early_stopping_method = SimpleES(patience=ag_early_stop)
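The `ag.early_stop` dispatch accepts "default" (an adaptive strategy), a string/tuple/list strategy resolved via `_get_early_stopping_rounds`, or a plain int patience passed to `SimpleES`. Below is a minimal patience-based stopper equivalent in spirit to the int branch; this is an illustrative sketch, not AutoGluon's `SimpleES` implementation:

    class PatienceStopper:
        """Stop after `patience` consecutive rounds without improvement."""

        def __init__(self, patience: int):
            self.patience = patience
            self.rounds_without_improvement = 0

        def update(self, cur_round: int, is_best: bool) -> bool:
            # cur_round is accepted only for interface parity with the call site
            if is_best:
                self.rounds_without_improvement = 0
            else:
                self.rounds_without_improvement += 1
            return self.rounds_without_improvement >= self.patience  # True -> stop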
@@ -573,8 +610,16 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

             for metric in scorers:
                 train_metrics.append(self.score(X=train_dataset, y=y_train, metric=metric, _reset_threads=False))
-                val_metrics += [self.score(X=val_dataset, y=y_val, metric=metric, _reset_threads=False)] if val_dataset is not None else []
-                test_metrics += [self.score(X=test_dataset, y=y_test, metric=metric, _reset_threads=False)] if test_dataset is not None else []
+                val_metrics += (
+                    [self.score(X=val_dataset, y=y_val, metric=metric, _reset_threads=False)]
+                    if val_dataset is not None
+                    else []
+                )
+                test_metrics += (
+                    [self.score(X=test_dataset, y=y_test, metric=metric, _reset_threads=False)]
+                    if test_dataset is not None
+                    else []
+                )

                 if use_curve_metric_error:
                     train_metrics[-1] = metric.convert_score_to_error(train_metrics[-1])
@@ -585,8 +630,14 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

                 if (
                     not self._assert_valid_metric(metric=train_metrics[-1], best_epoch=best_epoch)
-                    or (val_dataset is not None and not self._assert_valid_metric(metric=val_metrics[-1], best_epoch=best_epoch))
-                    or (test_dataset is not None and not self._assert_valid_metric(metric=test_metrics[-1], best_epoch=best_epoch))
+                    or (
+                        val_dataset is not None
+                        and not self._assert_valid_metric(metric=val_metrics[-1], best_epoch=best_epoch)
+                    )
+                    or (
+                        test_dataset is not None
+                        and not self._assert_valid_metric(metric=test_metrics[-1], best_epoch=best_epoch)
+                    )
                 ):
                     return True

@@ -621,7 +672,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
                     "or NN weights may have diverged."
                 )
             else:
-                logger.warning(f"Warning: NaNs encountered in {self.__class__.__name__} training. Reverting model to last checkpoint without NaNs.")
+                logger.warning(
+                    f"Warning: NaNs encountered in {self.__class__.__name__} training. "
+                    "Reverting model to last checkpoint without NaNs."
+                )
             return False
         return True

@@ -657,7 +711,9 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         preds_dataset = np.concatenate(preds_dataset, 0)
         return preds_dataset

-    def _generate_dataset(self, X: pd.DataFrame | TabularTorchDataset, y: pd.Series, train_params: dict = {}, is_train: bool = False) -> TabularTorchDataset:
+    def _generate_dataset(
+        self, X: pd.DataFrame | TabularTorchDataset, y: pd.Series, train_params: dict = {}, is_train: bool = False
+    ) -> TabularTorchDataset:
         """
         Generate TabularTorchDataset from X and y.

@@ -721,7 +777,12 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         warnings.filterwarnings("ignore", module="sklearn.preprocessing")
         if labels is not None and len(labels) != len(df):
             raise ValueError("Number of examples in Dataframe does not match number of labels")
-        if self.processor is None or self._types_of_features is None or self.feature_arraycol_map is None or self.feature_type_map is None:
+        if (
+            self.processor is None
+            or self._types_of_features is None
+            or self.feature_arraycol_map is None
+            or self.feature_type_map is None
+        ):
             raise ValueError("Need to process training data before test data")
         if self.features_to_drop:
             drop_cols = [col for col in df.columns if col in self.features_to_drop]
@@ -732,7 +793,16 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         df = self.processor.transform(df)
         return TabularTorchDataset(df, self.feature_arraycol_map, self.feature_type_map, self.problem_type, labels)

-    def _process_train_data(self, df, impute_strategy, max_category_levels, skew_threshold, embed_min_categories, use_ngram_features, labels):
+    def _process_train_data(
+        self,
+        df,
+        impute_strategy,
+        max_category_levels,
+        skew_threshold,
+        embed_min_categories,
+        use_ngram_features,
+        labels,
+    ):
         from .tabular_torch_dataset import TabularTorchDataset

         # sklearn processing n_quantiles warning
@@ -744,13 +814,18 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

         # dict with keys: : 'continuous', 'skewed', 'onehot', 'embed', values = column-names of df
         self._types_of_features, df = self._get_types_of_features(
-            df, skew_threshold=skew_threshold, embed_min_categories=embed_min_categories, use_ngram_features=use_ngram_features
+            df,
+            skew_threshold=skew_threshold,
+            embed_min_categories=embed_min_categories,
+            use_ngram_features=use_ngram_features,
         )
         logger.log(15, "Tabular Neural Network treats features as the following types:")
         logger.log(15, json.dumps(self._types_of_features, indent=4))
         logger.log(15, "\n")
         if self.processor is not None:
-            Warning(f"Attempting to process training data for {self.__class__.__name__}, but previously already did this.")
+            Warning(
+                f"Attempting to process training data for {self.__class__.__name__}, but previously already did this."
+            )
         self.processor = create_preprocessor(
             impute_strategy=impute_strategy,
             max_category_levels=max_category_levels,
@@ -763,15 +838,22 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         )
         df = self.processor.fit_transform(df)
         # OrderedDict of feature-name -> list of column-indices in df corresponding to this feature
-        self.feature_arraycol_map = get_feature_arraycol_map(processor=self.processor, max_category_levels=max_category_levels)
-        num_array_cols = np.sum([len(self.feature_arraycol_map[key]) for key in self.feature_arraycol_map])  # should match number of columns in processed array
+        self.feature_arraycol_map = get_feature_arraycol_map(
+            processor=self.processor, max_category_levels=max_category_levels
+        )
+        num_array_cols = np.sum(
+            [len(self.feature_arraycol_map[key]) for key in self.feature_arraycol_map]
+        )  # should match number of columns in processed array
         if num_array_cols != df.shape[1]:
             raise ValueError(
-                "Error during one-hot encoding data processing for neural network. Number of columns in df array does not match feature_arraycol_map."
+                "Error during one-hot encoding data processing for neural network. "
+                "Number of columns in df array does not match feature_arraycol_map."
             )

         # OrderedDict of feature-name -> feature_type string (options: 'vector', 'embed')
-        self.feature_type_map = get_feature_type_map(feature_arraycol_map=self.feature_arraycol_map, types_of_features=self._types_of_features)
+        self.feature_type_map = get_feature_type_map(
+            feature_arraycol_map=self.feature_arraycol_map, types_of_features=self._types_of_features
+        )
         return TabularTorchDataset(df, self.feature_arraycol_map, self.feature_type_map, self.problem_type, labels)

     def _init_optimizer(self, optimizer, learning_rate, weight_decay):
@@ -801,7 +883,13 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

     def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
         hyperparameters = self._get_model_params()
-        return self.estimate_memory_usage_static(X=X, problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=hyperparameters, **kwargs)
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )

     @classmethod
     def _estimate_memory_usage_static(
@@ -877,7 +965,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
             device = torch.device(original_device_type)

         if verbose and (original_device_type != device.type):
-            logger.log(15, f"Model is trained on {original_device_type}, but the device is not available - loading on {device.type}")
+            logger.log(
+                15,
+                f"Model is trained on {original_device_type}, but the device is not available - loading on {device.type}",
+            )

         model.device = device
         model.model = model.model.to(model.device)
@@ -949,9 +1040,12 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

         input_types = kwargs.get("input_types", self._get_input_types(batch_size=self.max_batch_size))
         assert isinstance(self.processor, ColumnTransformer), (
-            f"unexpected processor type {type(self.processor)}, " "expecting processor type to be sklearn.compose._column_transformer.ColumnTransformer"
+            f"unexpected processor type {type(self.processor)}, "
+            "expecting processor type to be sklearn.compose._column_transformer.ColumnTransformer"
+        )
+        self.processor = self._compiler.compile(
+            model=(self.processor, self.model), path=self.path, input_types=input_types
         )
-        self.processor = self._compiler.compile(model=(self.processor, self.model), path=self.path, input_types=input_types)

     @classmethod
     def supported_problem_types(cls) -> list[str] | None:
autogluon/tabular/models/tabular_nn/torch/tabular_torch_dataset.py

@@ -205,7 +205,9 @@ class TabularTorchDataset(torch.utils.data.IterableDataset):
             feat_i = self.feature_groups["embed"][i]
             feat_i_data = self.get_feature_data(feat_i).flatten().tolist()
             num_categories_i = len(set(feat_i_data))  # number of categories for ith feature
-            num_categories_per_embedfeature[i] = num_categories_i + 1  # to account for unknown test-time categories
+            num_categories_per_embedfeature[i] = (
+                num_categories_i + 1
+            )  # to account for unknown test-time categories
         return num_categories_per_embedfeature

     def get_feature_data(self, feature):
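The `num_categories_i + 1` reserves one extra embedding row for categories never observed during training, so a test-time unknown can map to a dedicated index instead of breaking the lookup. A small sketch of why the `+ 1` matters when sizing `nn.Embedding` (variable names are illustrative):

    import torch
    from torch import nn

    train_values = ["a", "b", "c"]            # categories seen at training time
    num_categories = len(set(train_values))   # 3
    embed = nn.Embedding(num_embeddings=num_categories + 1, embedding_dim=8)

    unknown_index = num_categories            # index 3 reserved for unseen categories
    vec = embed(torch.tensor([unknown_index]))  # safe lookup for a test-time unknown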
@@ -231,14 +233,14 @@ class TabularTorchDataset(torch.utils.data.IterableDataset):
         dataobj_file = file_prefix + self.DATAOBJ_SUFFIX
         if not os.path.exists(os.path.dirname(dataobj_file)):
             os.makedirs(os.path.dirname(dataobj_file))
-        torch.save(self, dataobj_file)
+        torch.save(self, dataobj_file)  # nosec B614
         logger.debug("TabularPyTorchDataset Dataset saved to a file: \n %s" % dataobj_file)

     @classmethod
     def load(cls, file_prefix=""):
         """Additional naming changes will be appended to end of file_prefix (must contain full absolute path)"""
         dataobj_file = file_prefix + cls.DATAOBJ_SUFFIX
-        dataset: TabularTorchDataset = torch.load(dataobj_file)
+        dataset: TabularTorchDataset = torch.load(dataobj_file)  # nosec B614
         logger.debug("TabularNN Dataset loaded from a file: \n %s" % dataobj_file)
         return dataset

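The two `# nosec B614` annotations are the substantive change here: they suppress Bandit's B614 finding on `torch.save`/`torch.load` of pickled objects, acknowledging that full-object serialization is intentional for this locally written dataset cache (a `weights_only` load would reject a pickled `TabularTorchDataset`).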
@@ -256,5 +258,7 @@ class TabularTorchDataset(torch.utils.data.IterableDataset):
         self.shuffle = False if is_test else True
         self.drop_last = False if is_test else True
         generator = torch.Generator().manual_seed(torch.initial_seed()) if is_test else None
-        loader = torch.utils.data.DataLoader(self, num_workers=num_workers, batch_size=None, worker_init_fn=worker_init_fn, generator=generator)  # no collation
+        loader = torch.utils.data.DataLoader(
+            self, num_workers=num_workers, batch_size=None, worker_init_fn=worker_init_fn, generator=generator
+        )  # no collation
         return loader
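`batch_size=None` disables the DataLoader's automatic batching, which is what the "no collation" comment refers to: this `IterableDataset` yields already-batched arrays and the loader passes them through untouched. A minimal sketch of that contract with a toy dataset (illustrative, not the AutoGluon class):

    import torch

    class PreBatched(torch.utils.data.IterableDataset):
        def __iter__(self):
            # each yielded item is already a full batch; it must not be re-collated
            yield torch.zeros(512, 10), torch.zeros(512)

    loader = torch.utils.data.DataLoader(PreBatched(), batch_size=None, num_workers=0)
    X, y = next(iter(loader))  # shapes (512, 10) and (512,), passed through as-is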
autogluon/tabular/models/tabular_nn/torch/torch_network_modules.py

@@ -16,7 +16,16 @@ class EmbedNet(nn.Module):
         y_range: Used specifically for regression. = None for classification.
     """

-    def __init__(self, problem_type, num_net_outputs=None, quantile_levels=None, train_dataset=None, architecture_desc=None, device=None, **kwargs):
+    def __init__(
+        self,
+        problem_type,
+        num_net_outputs=None,
+        quantile_levels=None,
+        train_dataset=None,
+        architecture_desc=None,
+        device=None,
+        **kwargs,
+    ):
         if (architecture_desc is None) and (train_dataset is None):
             raise ValueError("train_dataset cannot = None if architecture_desc=None")
         super().__init__()
@@ -54,7 +63,9 @@ class EmbedNet(nn.Module):
         if self.has_embed_features:
             self.embed_blocks = nn.ModuleList()
             for i in range(len(num_categs_per_feature)):
-                self.embed_blocks.append(nn.Embedding(num_embeddings=num_categs_per_feature[i], embedding_dim=embed_dims[i]))
+                self.embed_blocks.append(
+                    nn.Embedding(num_embeddings=num_categs_per_feature[i], embedding_dim=embed_dims[i])
+                )
                 input_size += embed_dims[i]

         # update input size
@@ -189,9 +200,17 @@ class EmbedNet(nn.Module):
             loss_data = torch.max(self.quantile_levels * error_data, (self.quantile_levels - 1) * error_data)
             return loss_data.mean()

-        loss_data = torch.where(torch.abs(error_data) < self.alpha, 0.5 * error_data * error_data, self.alpha * (torch.abs(error_data) - 0.5 * self.alpha))
+        loss_data = torch.where(
+            torch.abs(error_data) < self.alpha,
+            0.5 * error_data * error_data,
+            self.alpha * (torch.abs(error_data) - 0.5 * self.alpha),
+        )
         loss_data /= self.alpha
-        scale = torch.where(error_data >= 0, torch.ones_like(error_data) * self.quantile_levels, torch.ones_like(error_data) * (1 - self.quantile_levels))
+        scale = torch.where(
+            error_data >= 0,
+            torch.ones_like(error_data) * self.quantile_levels,
+            torch.ones_like(error_data) * (1 - self.quantile_levels),
+        )
         loss_data *= scale
         return loss_data.mean()

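The first branch above is the standard pinball (quantile) loss, elementwise `max(q * e, (q - 1) * e)` for error `e` at quantile level `q`; the rewrapped `torch.where` branch is a smoothed Huber-style variant scaled per quantile. A quick numeric check of the pinball form with illustrative values:

    import torch

    q = torch.tensor([0.1, 0.5, 0.9])  # quantile levels
    e = torch.tensor([2.0, 2.0, 2.0])  # same positive error at each level

    loss = torch.max(q * e, (q - 1) * e)
    print(loss)  # tensor([0.2000, 1.0000, 1.8000])
    # positive error costs q * e, so higher quantiles penalize it more;
    # negative error would cost (1 - q) * |e| instead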
@@ -226,7 +245,9 @@ class EmbedNet(nn.Module):
         predict_data = self(data_batch)
         target_data = data_batch[-1].to(self.device)
         if self.problem_type in [BINARY, MULTICLASS]:
-            target_data = target_data.type(torch.long)  # Windows default int type is int32. Need to explicit convert to Long.
+            target_data = target_data.type(
+                torch.long
+            )  # Windows default int type is int32. Need to explicit convert to Long.
         if self.problem_type == QUANTILE:
             return self.quantile_loss(predict_data, target_data, margin=gamma)
         if self.problem_type == SOFTCLASS:
autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py

@@ -6,9 +6,9 @@ Unknown categories are returned as None in inverse transforms. Always converts i

 import copy
 from numbers import Integral
-from packaging.version import parse as parse_version

 import numpy as np
+from packaging.version import parse as parse_version
 from scipy import sparse
 from sklearn import __version__ as _sklearn_version
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -136,7 +136,7 @@ def _encode_check_unknown(values, uniques, return_mask=False):
     diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
     if return_mask:
         if diff:
-            valid_mask = np.in1d(values, uniques)
+            valid_mask = np.isin(values, uniques)
         else:
             valid_mask = np.ones(len(values), dtype=bool)
     return diff, valid_mask
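Per the reconstruction above, the removed line used NumPy's deprecated `np.in1d` spelling; `np.isin` is the modern replacement with the same behavior here, an elementwise membership mask that flags unknown categories. For example:

    import numpy as np

    values = np.array(["a", "b", "z", "a"])
    uniques = np.array(["a", "b", "c"])

    valid_mask = np.isin(values, uniques)
    print(valid_mask)  # [ True  True False  True] -> "z" is an unknown category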
@@ -203,11 +203,11 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):

         if self.categories != "auto":
             if len(self.categories) != n_features:
-                raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).")
+                raise ValueError("Shape mismatch: if categories is an array, it has to be of shape (n_features,).")

         if self.max_levels is not None:
             if not isinstance(self.max_levels, Integral) or self.max_levels <= 0:
-                raise ValueError("max_levels must be None or a strictly" " positive int, got {}.".format(self.max_levels))
+                raise ValueError("max_levels must be None or a strictly positive int, got {}.".format(self.max_levels))

         self.categories_ = []
         self.infrequent_indices_ = []
@@ -220,11 +220,11 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):
                 cats = np.array(self.categories[i], dtype=Xi.dtype)
                 if Xi.dtype != object:
                     if not np.all(np.sort(cats) == cats):
-                        raise ValueError("Unsorted categories are not" " supported for numerical categories")
+                        raise ValueError("Unsorted categories are not supported for numerical categories")
                 if handle_unknown == "error":
                     diff = _encode_check_unknown(Xi, cats)
                     if diff:
-                        msg = "Found unknown categories {0} in column {1}" " during fit".format(diff, i)
+                        msg = "Found unknown categories {0} in column {1} during fit".format(diff, i)
                         raise ValueError(msg)
             self.categories_.append(cats)

@@ -264,7 +264,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):

             if not np.all(valid_mask):
                 if handle_unknown == "error":
-                    msg = "Found unknown categories {0} in column {1}" " during transform".format(diff, i)
+                    msg = "Found unknown categories {0} in column {1} during transform".format(diff, i)
                     raise ValueError(msg)
                 else:
                     # Set the problematic rows to an acceptable value and
@@ -325,7 +325,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):
             A Tags object containing all tag information.
         """
         # lazily import to avoid crashing if sklearn<1.6
-        from sklearn.utils import Tags, InputTags, TargetTags
+        from sklearn.utils import InputTags, Tags, TargetTags

         # Create the Tags object with appropriate settings
         tags = Tags(
@@ -433,13 +433,17 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):

     def _validate_keywords(self):
         if self.handle_unknown not in ("error", "ignore"):
-            msg = "handle_unknown should be either 'error' or 'ignore'," " got {0}.".format(self.handle_unknown)
+            msg = "handle_unknown should be either 'error' or 'ignore', got {0}.".format(self.handle_unknown)
             raise ValueError(msg)
         # If we have both dropped columns and ignored unknown
         # values, there will be ambiguous cells. This creates difficulties
         # in interpreting the model.
         if self.drop is not None and self.handle_unknown != "error":
-            raise ValueError("`handle_unknown` must be 'error' when the drop parameter is " "specified, as both would create categories that are all " "zero.")
+            raise ValueError(
+                "`handle_unknown` must be 'error' when the drop parameter is "
+                "specified, as both would create categories that are all "
+                "zero."
+            )

     def _compute_drop_idx(self):
         if self.drop is None:
@@ -451,20 +455,25 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):
                 self.drop = np.asarray(self.drop, dtype=object)
                 droplen = len(self.drop)
             except (ValueError, TypeError):
-                msg = "Wrong input for parameter `drop`. Expected " "'first', None or array of objects, got {}"
+                msg = "Wrong input for parameter `drop`. Expected 'first', None or array of objects, got {}"
                 raise ValueError(msg.format(type(self.drop)))
             if droplen != len(self.categories_):
-                msg = "`drop` should have length equal to the number " "of features ({}), got {}"
+                msg = "`drop` should have length equal to the number of features ({}), got {}"
                 raise ValueError(msg.format(len(self.categories_), len(self.drop)))
             missing_drops = [(i, val) for i, val in enumerate(self.drop) if val not in self.categories_[i]]
             if any(missing_drops):
-                msg = ("The following categories were supposed to be "
-                       "dropped, but were not found in the training " "data.\n{}".format("\n".join(["Category: {}, Feature: {}".format(c, v) for c, v in missing_drops]))
+                msg = (
+                    "The following categories were supposed to be "
+                    "dropped, but were not found in the training "
+                    "data.\n{}".format("\n".join(["Category: {}, Feature: {}".format(c, v) for c, v in missing_drops]))
                 )
                 raise ValueError(msg)
-            return np.array([np.where(cat_list == val)[0][0] for (val, cat_list) in zip(self.drop, self.categories_)], dtype=np.int_)
+            return np.array(
+                [np.where(cat_list == val)[0][0] for (val, cat_list) in zip(self.drop, self.categories_)],
+                dtype=np.int_,
+            )
         else:
-            msg = "Wrong input for parameter `drop`. Expected " "'first', None or array of objects, got {}"
+            msg = "Wrong input for parameter `drop`. Expected 'first', None or array of objects, got {}"
             raise ValueError(msg.format(type(self.drop)))

     def _convert_cat_to_int(self, X):
@@ -497,12 +506,14 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):
         # check if user wants to manually drop a feature that is
         # infrequent: this is not allowed
         if self.drop is not None and not isinstance(self.drop, str):
-            for feature_idx, (infrequent_indices, drop_idx) in enumerate(zip(self.infrequent_indices_, self.drop_idx_)):
+            for feature_idx, (infrequent_indices, drop_idx) in enumerate(
+                zip(self.infrequent_indices_, self.drop_idx_)
+            ):
                 if drop_idx in infrequent_indices:
                     raise ValueError(
-                        "Category {} of feature {} is infrequent and thus "
-                        "cannot be dropped. Use drop='infrequent' instead.".format(
-                            self.categories_[feature_idx][drop_idx], feature_idx)
+                        "Category {} of feature {} is infrequent and thus "
+                        "cannot be dropped. Use drop='infrequent' "
+                        "instead.".format(self.categories_[feature_idx][drop_idx], feature_idx)
                     )
         return self

@@ -614,7 +625,7 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):
         n_transformed_features = sum(len(cats) - 1 for cats in self.categories_)

         # validate shape of passed X
-        msg = "Shape of the passed X data is not correct. Expected {0} " "columns, got {1}."
+        msg = "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
         if X.shape[1] != n_transformed_features:
             raise ValueError(msg.format(n_transformed_features, X.shape[1]))

@@ -686,7 +697,11 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):
         if input_features is None:
             input_features = ["x%d" % i for i in range(len(cats))]
         elif len(input_features) != len(self.categories_):
-            raise ValueError("input_features should have length equal to number of features ({}), got {}".format(len(self.categories_), len(input_features)))
+            raise ValueError(
+                "input_features should have length equal to number of features ({}), got {}".format(
+                    len(self.categories_), len(input_features)
+                )
+            )

         feature_names = []
         for i in range(len(cats)):
@@ -788,7 +803,9 @@ class OrdinalMergeRaresHandleUnknownEncoder(_BaseEncoder):
         """
         X = self._label_encoder.transform(X)
         X_og_array = np.array(X)  # original X array before transform
-        X_int, _ = self._transform(X, handle_unknown="ignore")  # will contain zeros for 0th category as well as unknown values.
+        X_int, _ = self._transform(
+            X, handle_unknown="ignore"
+        )  # will contain zeros for 0th category as well as unknown values.

         for i in range(X_int.shape[1]):
             X_col_data = X_og_array[:, i]
@@ -822,7 +839,7 @@ class OrdinalMergeRaresHandleUnknownEncoder(_BaseEncoder):
         n_features = len(self.categories_)

         # validate shape of passed X
-        msg = "Shape of the passed X data is not correct. Expected {0} " "columns, got {1}."
+        msg = "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
         if X.shape[1] != n_features:
             raise ValueError(msg.format(n_features, X.shape[1]))