autogluon.tabular 1.2.1b20250407__py3-none-any.whl → 1.2.1b20250409__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. autogluon/tabular/register/_ag_model_register.py +0 -2
  2. autogluon/tabular/version.py +1 -1
  3. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/METADATA +13 -13
  4. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/RECORD +11 -22
  5. autogluon/tabular/models/tab_transformer/__init__.py +0 -1
  6. autogluon/tabular/models/tab_transformer/hyperparameters/__init__.py +0 -1
  7. autogluon/tabular/models/tab_transformer/hyperparameters/parameters.py +0 -66
  8. autogluon/tabular/models/tab_transformer/hyperparameters/searchspaces.py +0 -17
  9. autogluon/tabular/models/tab_transformer/modified_transformer.py +0 -494
  10. autogluon/tabular/models/tab_transformer/pretexts.py +0 -150
  11. autogluon/tabular/models/tab_transformer/tab_model_base.py +0 -86
  12. autogluon/tabular/models/tab_transformer/tab_transformer.py +0 -183
  13. autogluon/tabular/models/tab_transformer/tab_transformer_encoder.py +0 -668
  14. autogluon/tabular/models/tab_transformer/tab_transformer_model.py +0 -540
  15. autogluon/tabular/models/tab_transformer/utils.py +0 -124
  16. /autogluon.tabular-1.2.1b20250407-py3.9-nspkg.pth → /autogluon.tabular-1.2.1b20250409-py3.9-nspkg.pth +0 -0
  17. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/LICENSE +0 -0
  18. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/NOTICE +0 -0
  19. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/WHEEL +0 -0
  20. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/namespace_packages.txt +0 -0
  21. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/top_level.txt +0 -0
  22. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/zip-safe +0 -0
autogluon/tabular/models/tab_transformer/tab_transformer_model.py
@@ -1,540 +0,0 @@
-"""TabTransformer model"""
-from __future__ import annotations
-
-import logging
-import os
-import time
-
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-
-from autogluon.common.features.types import R_OBJECT, S_TEXT_AS_CATEGORY, S_TEXT_NGRAM
-from autogluon.common.utils.try_import import try_import_torch
-from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
-from autogluon.core.models.abstract.abstract_nn_model import AbstractNeuralNetworkModel
-from autogluon.core.utils.loaders import load_pkl
-
-from .hyperparameters.parameters import get_default_param
-from .hyperparameters.searchspaces import get_default_searchspace
-
-logger = logging.getLogger(__name__)
-
-"""
-TODO: Fix Mac OS X warning spam.
-The error message is:
-Cannot set number of intraop threads after parallel work has started or after set_num_threads call when using native parallel backend (function set_num_threads)
-This has been investigated to be a harmless warning for training and running inference on TabTransformer.
-This warning can occur with a very specific environment: torch 1.7, Mac OS X, Python 3.6/3.7, when using torch DataLoader.
-https://github.com/pytorch/pytorch/issues/46409
-"""
-
-
-class TabTransformerModel(AbstractNeuralNetworkModel):
-    """
-    Main TabTransformer model that inherits from AbstractModel.
-
-    This model includes the full torch pipeline (TabNet) and the internal Transformer embeddings (TabTransformer).
-    This file serves as the connection of all these internal models and architectures to AutoGluon.
-
-    TabTransformer uses modifications to the typical Transformer architecture and the pretraining in BERT
-    and applies them to the use case of tabular data. Specifically, this makes TabTransformer suitable for unsupervised
-    training of Tabular data with a subsequent fine-tuning step on labeled data.
-    """
-    ag_key = "TRANSF"
-    ag_name = "Transformer"
-
-    params_file_name = "tab_trans_params.pth"
-
-    def __init__(self, **kwargs):
-        try_import_torch()
-        super().__init__(**kwargs)
-        self._verbosity = None
-        self._temp_file_name = "tab_trans_temp.pth"
-        self._period_columns_mapping = None
-
-    def _set_default_params(self):
-        default_params = get_default_param()
-        for param, val in default_params.items():
-            self._set_default_param_value(param, val)
-
-    def _get_default_auxiliary_params(self) -> dict:
-        default_auxiliary_params = super()._get_default_auxiliary_params()
-        extra_auxiliary_params = dict(
-            ignored_type_group_raw=[R_OBJECT],
-            ignored_type_group_special=[S_TEXT_NGRAM, S_TEXT_AS_CATEGORY],
-        )
-        default_auxiliary_params.update(extra_auxiliary_params)
-        return default_auxiliary_params
-
-    def _get_model(self):
-        from .tab_model_base import TabNet
-
-        # If we have already initialized the model, we don't need to do it again.
-        model = TabNet(self.params["n_classes"], self.params["feature_dim"], self.params["num_output_layers"], self.device, self.params)
-        if self.device.type == "cuda":
-            model = model.cuda()
-
-        return model
-
-    # NOTE: Making an assumption that X_unlabeled will not have a different schema. Otherwise, we would need two
-    # period_columns_mapping fields. One for X/X_val, another for X_unlabeled, which may have different columns.
-    @staticmethod
-    def _get_no_period_columns(columns):
-        # Latest pytorch does not support . in module names. Therefore, we must replace the ".".
-        rename_columns = dict()
-        for col in columns:
-            new_col_name = col
-            if "." in col:
-                new_col_name = col.replace(".", "_")
-
-            if new_col_name in rename_columns:
-                for i in range(1, 100):
-                    append_col_name = new_col_name + "_" + str(i)
-                    if append_col_name not in rename_columns:
-                        new_col_name = append_col_name
-                        break
-                else:
-                    raise RuntimeError("Tried 100 column renames to eliminate duplicates.\n" "Please check similar columns with . or _ in them.")
-
-            # Mapping for every column
-            rename_columns[col] = new_col_name
-
-        return rename_columns
-
-    def _preprocess(self, X, **kwargs):
-        from .utils import TabTransformerDataset
-
-        X = super()._preprocess(X=X, **kwargs)
-
-        X = X.rename(columns=self._period_columns_mapping)
-        encoders = self.params["encoders"]
-        data = TabTransformerDataset(X, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-        data.encode(self.fe)
-
-        return data
-
-    def _preprocess_train(self, X, X_val=None, X_unlabeled=None, fe=None):
-        """
-        Pre-processing specific to TabTransformer. Setting up feature encoders, renaming columns with periods in
-        them (torch), and converting X, X_val, X_unlabeled into TabTransformerDataset's.
-        """
-        from .utils import TabTransformerDataset
-
-        X = self._preprocess_nonadaptive(X)
-        if X_val is not None:
-            X_val = self._preprocess_nonadaptive(X_val)
-        if X_unlabeled is not None:
-            X_unlabeled = self._preprocess_nonadaptive(X_unlabeled)
-
-        self._period_columns_mapping = self._get_no_period_columns(X.columns)
-        X = X.rename(columns=self._period_columns_mapping)
-
-        if X_val is not None:
-            X_val = X_val.rename(columns=self._period_columns_mapping)
-        if X_unlabeled is not None:
-            X_unlabeled = X_unlabeled.rename(columns=self._period_columns_mapping)
-
-        self._types_of_features, _ = self._get_types_of_features(X, needs_extra_types=False)
-
-        # Also need to rename the feature names in the types_of_features dictionary.
-        for feature_dict in self._types_of_features:
-            # Need to check that the value is in the mapping. Otherwise, we could be updating columns that have been dropped.
-            feature_dict.update(("name", self._period_columns_mapping[v]) for k, v in feature_dict.items() if k == "name" and v in self._period_columns_mapping)
-
-        encoders = self.params["encoders"]
-        data = TabTransformerDataset(X, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-        self.fe = fe
-        if self.fe is not None:
-            if X_unlabeled is None:
-                unlab_data = None
-            elif X_unlabeled is not None:
-                unlab_data = TabTransformerDataset(X_unlabeled, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-        if self.fe is None:
-            if X_unlabeled is None:
-                data.fit_feat_encoders()
-                self.fe = data.feature_encoders
-                unlab_data = None
-            elif X_unlabeled is not None:
-                unlab_data = TabTransformerDataset(X_unlabeled, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-                unlab_data.fit_feat_encoders()
-                self.fe = unlab_data.feature_encoders
-
-        data.encode(self.fe)
-
-        if X_val is not None:
-            val_data = TabTransformerDataset(X_val, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-            val_data.encode(self.fe)
-        else:
-            val_data = None
-
-        if unlab_data is not None:
-            unlab_data.encode(self.fe)
-
-        return data, val_data, unlab_data
-
-    def _epoch(
-        self, net, loader_train, loader_val, y_val, optimizers, loss_criterion, pretext, state, scheduler, epoch, epochs, databar_disable, reporter, params
-    ):
-        """
-        Helper function to run one epoch of training, essentially the "inner loop" of training.
-        """
-        import torch
-
-        from .utils import augmentation
-
-        is_train = optimizers is not None
-        net.train() if is_train else net.eval()
-        total_loss, total_correct, total_num = 0.0, 0.0, 0
-        data_bar = tqdm(loader_train, disable=databar_disable) if is_train else tqdm(loader_val, disable=databar_disable)
-
-        with torch.enable_grad() if is_train else torch.no_grad():
-            for data, target in data_bar:
-                data, target = pretext.get(data, target)
-
-                if self.device.type == "cuda":
-                    data, target = data.cuda(), target.cuda()
-                    pretext = pretext.cuda()
-
-                if state in [None, "finetune"]:
-                    if self.params["num_augs"] > 0:
-                        data, target = augmentation(data, target, **params)
-                    out, _ = net(data)
-                elif state == "pretrain":
-                    _, out = net(data)
-                else:
-                    raise NotImplementedError("state must be one of [None, 'pretrain', 'finetune']")
-
-                loss, correct = pretext(out, target)
-
-                if is_train:
-                    for optimizer in optimizers:
-                        optimizer.zero_grad()
-                    loss.backward()
-                    for optimizer in optimizers:
-                        optimizer.step()
-
-                total_num += 1
-                total_loss += loss.item()
-
-                if epochs == 1:
-                    train_test = "Test"
-                else:
-                    train_test = "Train"
-
-                val_metric = None
-                if loader_val is not None and state != "pretrain":
-                    val_metric = self.score(X=loader_val, y=y_val, metric=self.stopping_metric)
-                    data_bar.set_description(
-                        "{} Epoch: [{}/{}] Train Loss: {:.4f} Validation {}: {:.2f}".format(
-                            train_test, epoch, epochs, total_loss / total_num, self.stopping_metric.name, val_metric
-                        )
-                    )
-
-                    if reporter is not None:
-                        reporter(epoch=epoch + 1, validation_performance=val_metric, train_loss=total_loss)
-
-                else:
-                    data_bar.set_description("{} Epoch: [{}/{}] Loss: {:.4f}".format(train_test, epoch, epochs, total_loss / total_num))
-
-            return total_loss / total_num, val_metric
-
-        if scheduler is not None:
-            scheduler.step()
-        return total_loss / total_num
-
-    def tt_fit(self, loader_train, loader_val=None, y_val=None, state=None, time_limit=None, reporter=None):
-        """
-        Main training function for TabTransformer
-        "state" must be one of [None, 'pretrain', 'finetune']
-        None: corresponds to purely supervised learning
-        pretrain: discriminative task will be a pretext task
-        finetune: same as supervised learning except that the model base has
-        exponentially decaying learning rate.
-        """
-        import torch
-        import torch.nn as nn
-        import torch.optim as optim
-
-        from . import pretexts
-
-        start_time = time.time()
-        pretext_tasks = pretexts.__dict__
-        optimizers = []
-        lr = self.params["lr"]
-        weight_decay = self.params["weight_decay"]
-        epochs = self.params["pretrain_epochs"] if state == "pretrain" else self.params["epochs"]
-        epochs_wo_improve = self.params["epochs_wo_improve"]
-
-        if state is None:
-            optimizers = [optim.Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay)]
-            pretext = pretext_tasks["SupervisedPretext"](self.problem_type, self.device)
-        elif state == "pretrain":
-            optimizers = [optim.Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay)]
-            pretext = pretext_tasks["BERTPretext"](self.cat_feat_origin_cards, self.device, self.params["hidden_dim"])
-        elif state == "finetune":
-            base_exp_decay = self.params["base_exp_decay"]
-            optimizer_fc = [optim.Adam(fc_layer.parameters(), lr=lr, weight_decay=weight_decay) for fc_layer in self.model.fc]
-            optimizer_embeds = optim.Adam(self.model.embed.parameters(), lr=lr, weight_decay=weight_decay)
-            scheduler = optim.lr_scheduler.ExponentialLR(optimizer_embeds, gamma=base_exp_decay)  # TODO: Should we be using this in _epoch()?
-            optimizers.extend(optimizer_fc)
-            optimizers.append(optimizer_embeds)
-
-            pretext = pretext_tasks["SupervisedPretext"](self.problem_type, self.device)
-
-        else:
-            raise NotImplementedError("state must be one of [None, 'pretrain', 'finetune']")
-
-        if self.problem_type == REGRESSION:
-            loss_criterion = nn.MSELoss()
-        else:
-            loss_criterion = nn.CrossEntropyLoss()
-
-        best_val_metric = -np.inf  # higher = better
-        best_val_epoch = 0
-        best_loss = np.inf
-
-        if self._verbosity <= 1:
-            verbose_eval = -1
-        elif self._verbosity == 2:
-            verbose_eval = 50
-        elif self._verbosity == 3:
-            verbose_eval = 10
-        else:
-            verbose_eval = 1
-
-        if verbose_eval <= 0:
-            databar_disable = True  # Whether or not we want to suppress output based on our verbosity
-        else:
-            databar_disable = False
-
-        for e in range(epochs):
-            if e == 0:
-                logger.log(15, "TabTransformer architecture:")
-                logger.log(15, str(self.model))
-
-            train_loss, val_metric = self._epoch(
-                net=self.model,
-                loader_train=loader_train,
-                loader_val=loader_val,
-                y_val=y_val,
-                optimizers=optimizers,
-                loss_criterion=loss_criterion,
-                pretext=pretext,
-                state=state,
-                scheduler=None,
-                epoch=e,
-                epochs=epochs,
-                databar_disable=databar_disable,
-                reporter=reporter,
-                params=self.params,
-            )
-
-            # Early stopping for pretrain'ing based on loss.
-            if state == "pretrain":
-                if train_loss < best_loss or e == 0:
-                    if train_loss < best_loss:
-                        best_loss = train_loss
-                    best_val_epoch = e
-            else:
-                if val_metric >= best_val_metric or e == 0:
-                    if loader_val is not None:
-                        if not np.isnan(val_metric):
-                            best_val_metric = val_metric
-
-                    best_val_epoch = e
-                    os.makedirs(os.path.dirname(self.path), exist_ok=True)
-                    torch.save(self.model, os.path.join(self.path, self._temp_file_name))  # nosec B614
-
-            # If time limit has exceeded or we haven't improved in some number of epochs, stop early.
-            if e - best_val_epoch > epochs_wo_improve:
-                break
-            if time_limit:
-                time_elapsed = time.time() - start_time
-                time_left = time_limit - time_elapsed
-                if time_left <= 0:
-                    logger.log(20, "\tRan out of time, stopping training early.")
-                    break
-
-        if loader_val is not None:
-            try:
-                self.model = torch.load(os.path.join(self.path, self._temp_file_name))  # nosec B614
-                os.remove(os.path.join(self.path, self._temp_file_name))
-            except:
-                pass
-            logger.log(15, "Best model found in epoch %d" % best_val_epoch)
-
-    def _fit(self, X, y, X_val=None, y_val=None, X_unlabeled=None, time_limit=None, sample_weight=None, reporter=None, **kwargs):
-        import torch
-
-        self._verbosity = kwargs.get("verbosity", 2)
-        num_gpus = kwargs.get("num_gpus", None)
-        if num_gpus is None:
-            if torch.cuda.is_available():
-                self.device = torch.device("cuda")
-            else:
-                self.device = torch.device("cpu")
-        elif num_gpus == 0:
-            self.device = torch.device("cpu")
-        else:
-            self.device = torch.device("cuda")
-
-            if num_gpus > 1:
-                logger.warning("TabTransformer not yet configured to use more than 1 GPU. 'num_gpus' set to >1, but we will be using only 1 GPU.")
-
-        if sample_weight is not None:
-            logger.log(15, "sample_weight not yet supported for TabTransformerModel, this model will ignore them in training.")
-
-        if self.problem_type == REGRESSION:
-            self.params["n_classes"] = 1
-        elif self.problem_type == BINARY:
-            self.params["n_classes"] = 2
-        elif self.problem_type == MULTICLASS:
-            self.params["n_classes"] = y.nunique()
-
-        train, val, unlab = self._preprocess_train(X, X_val, X_unlabeled)
-
-        num_cols = len(train.columns)
-        if num_cols > self.params["max_columns"]:
-            raise NotImplementedError(
-                f"This dataset has {num_cols} columns and exceeds 'max_columns' == {self.params['max_columns']}.\n"
-                f"Which is set by default to ensure the TabTransformer model will not run out of memory.\n"
-                f"If you are confident you will have enough memory, set the 'max_columns' hyperparameter higher and try again.\n"
-            )
-
-        if self.problem_type == REGRESSION:
-            train.targets = torch.FloatTensor(list(y))
-            val.targets = torch.FloatTensor(list(y_val))
-        else:
-            train.targets = torch.LongTensor(list(y))
-            val.targets = torch.LongTensor(list(y_val))
-
-        batch_size = self.params["batch_size"]
-        num_workers = self.params["num_workers"]
-
-        loader_train = train.build_loader(batch_size, num_workers, shuffle=True)
-        loader_val = val.build_loader(batch_size, num_workers)
-        loader_unlab = unlab.build_loader(batch_size, num_workers) if unlab is not None else None
-
-        self.cat_feat_origin_cards = loader_train.cat_feat_origin_cards
-        self.params["cat_feat_origin_cards"] = self.cat_feat_origin_cards
-
-        self.model = self._get_model()
-
-        if X_unlabeled is not None:
-            # Can't spend all the time in pretraining, have to split it up.
-            pretrain_time_limit = time_limit / 2 if time_limit is not None else time_limit
-            pretrain_before_time = time.time()
-            self.tt_fit(loader_unlab, loader_val, y_val, state="pretrain", time_limit=pretrain_time_limit, reporter=reporter)
-            finetune_time_limit = time_limit - (time.time() - pretrain_before_time) if time_limit is not None else time_limit
-            self.tt_fit(loader_train, loader_val, y_val, state="finetune", time_limit=finetune_time_limit, reporter=reporter)
-        else:
-            self.tt_fit(loader_train, loader_val, y_val, time_limit=time_limit, reporter=reporter)
-
-    def _predict_proba(self, X, **kwargs):
-        """
-        X (torch.tensor or pd.dataframe): data for model to give prediction probabilities
-        returns: np.array of k-probabilities for each of the k classes. If k=2 we drop the second probability.
-        """
-        import torch
-        import torch.nn as nn
-        from torch.autograd import Variable
-        from torch.utils.data import DataLoader
-
-        if isinstance(X, pd.DataFrame):
-            # Preprocess here also calls our _preprocess, which creates a TTDataset.
-            X = self.preprocess(X, **kwargs)
-            loader = X.build_loader(self.params["batch_size"], self.params["num_workers"])
-        elif isinstance(X, DataLoader):
-            loader = X
-        elif isinstance(X, torch.Tensor):
-            X = X.rename(columns=self._get_no_period_columns(X))
-            loader = X.build_loader(self.params["batch_size"], self.params["num_workers"])
-        else:
-            raise NotImplementedError(
-                "Attempting to predict against a non-supported data type. \nNeeds to be a pandas DataFrame, torch DataLoader or torch Tensor."
-            )
-
-        self.model.eval()
-        softmax = nn.Softmax(dim=1)
-
-        if self.problem_type == REGRESSION:
-            outputs = torch.zeros([len(loader.dataset), 1])
-        else:
-            outputs = torch.zeros([len(loader.dataset), self.num_classes])
-
-        iter = 0
-        for data, _ in loader:
-            if self.device.type == "cuda":
-                data = data.cuda()
-            with torch.no_grad():
-                data = Variable(data)
-                prob, _ = self.model(data)
-                batch_size = len(prob)
-                if self.problem_type != REGRESSION:
-                    prob = softmax(prob)
-
-            outputs[iter : (iter + batch_size)] = prob
-            iter += batch_size
-
-        if self.problem_type == BINARY:
-            return outputs[:, 1].cpu().numpy()
-        elif self.problem_type == REGRESSION:
-            outputs = outputs.flatten()
-
-        return outputs.cpu().numpy()
-
-    def _get_default_searchspace(self):
-        return get_default_searchspace()
-
-    @classmethod
-    def supported_problem_types(cls) -> list[str] | None:
-        return ["binary", "multiclass", "regression"]
-
-    def save(self, path: str = None, verbose=True) -> str:
-        import torch
-
-        if path is None:
-            path = self.path
-
-        params_filepath = os.path.join(path, self.params_file_name)
-
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-
-        temp_model = self.model
-        if self.model is not None:
-            torch.save(self.model, params_filepath)  # nosec B614
-
-        self.model = None  # Avoiding pickling the weights.
-        modelobj_filepath = super().save(path=path, verbose=verbose)
-
-        self.model = temp_model
-
-        return modelobj_filepath
-
-    @classmethod
-    def load(cls, path: str, reset_paths=False, verbose=True):
-        import torch
-
-        obj: TabTransformerModel = load_pkl.load(path=os.path.join(path, cls.model_file_name), verbose=verbose)
-        if reset_paths:
-            obj.set_contexts(path)
-
-        obj.model = torch.load(os.path.join(path, cls.params_file_name))  # nosec B614
-
-        return obj
-
-    """
-    List of features to add (Updated by Anthony Galczak 11-19-20):
-
-    1) Allow for saving of pretrained model for future use. This will be done in a future PR as the
-    "pretrain API change".
-
-    2) Investigate options for when the unlabeled schema does not match the training schema. Currently,
-    we do not allow such mismatches and the schemas must match exactly. We can investigate ways to use
-    less or more columns from the unlabeled data. This will likely require a design meeting.

-    3) Bug where HPO doesn't work when cuda is enabled.
-    "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method"
-    Update: This will likely be fixed in a future change to HPO in AutoGluon.
-    """
autogluon/tabular/models/tab_transformer/utils.py
@@ -1,124 +0,0 @@
-import logging
-
-import torch
-from torch.utils.data import DataLoader, Dataset
-
-from autogluon.core.constants import REGRESSION
-
-from . import tab_transformer_encoder
-from .tab_transformer_encoder import NullEnc, WontEncodeError
-
-logger = logging.getLogger(__name__)
-
-
-def augmentation(data, target, **params):
-    shape = data.shape
-    cat_data = torch.cat([data for _ in range(params["num_augs"])])
-    target = torch.cat([target for _ in range(params["num_augs"])]).view(-1)
-    locs_to_mask = torch.empty_like(cat_data, dtype=float).uniform_() < params["aug_mask_prob"]
-    cat_data[locs_to_mask] = 0
-    cat_data = cat_data.view(-1, shape[-1])
-    return cat_data, target
-
-
-def get_col_info(X):
-    """
-    If we somehow do not get col_info when creating a TT dataset, then set every feature type to CATEGORICAL.
-    """
-    cols = list(X.columns)
-    col_info = []
-    for c in cols:
-        col_info.append({"name": c, "type": "CATEGORICAL"})
-    return col_info
-
-
-class TabTransformerDataset(Dataset):
-    def __init__(self, X, encoders, problem_type, y=None, col_info=None):
-        self.encoders = encoders
-        self.col_info = col_info
-
-        self.raw_data = X
-
-        if y is None:
-            self.targets = None
-        elif problem_type == REGRESSION:
-            self.targets = torch.FloatTensor(y)
-        else:
-            self.targets = torch.LongTensor(y)
-
-        if col_info is None:
-            self.columns = get_col_info(X)
-        else:
-            self.columns = self.col_info
-
-        """must be a list of dicts, each dict is of the form {"name": col_name, "type": col_type}
-        where col_name is obtained from the df X, and col_type is CATEGORICAL, TEXT or SCALAR
-        """
-        self.cat_feat_origin_cards = None
-        self.cont_feat_origin = None
-        self.feature_encoders = None
-
-    @property
-    def n_cont_features(self):
-        return len(self.cont_feat_origin) if self.encoders is not None else None
-
-    def fit_feat_encoders(self):
-        if self.encoders is not None:
-            self.feature_encoders = {}
-            for c in self.columns:
-                col = self.raw_data[c["name"]]
-                enc = tab_transformer_encoder.__dict__[self.encoders[c["type"]]]()
-
-                if c["type"] == "SCALAR" and col.nunique() < 32:
-                    logger.log(15, f"Column {c['name']} shouldn't be encoded as SCALAR. Switching to CATEGORICAL.")
-                    enc = tab_transformer_encoder.__dict__[self.encoders["CATEGORICAL"]]()
-                try:
-                    enc.fit(col)
-                except WontEncodeError as e:
-                    logger.log(15, f"Not encoding column '{c['name']}': {e}")
-                    enc = NullEnc()
-                self.feature_encoders[c["name"]] = enc
-
-    def encode(self, feature_encoders):
-        if self.encoders is not None:
-            self.feature_encoders = feature_encoders
-
-            self.cat_feat_origin_cards = []
-            cat_features = []
-            self.cont_feat_origin = []
-            cont_features = []
-            for c in self.columns:
-                enc = feature_encoders[c["name"]]
-                col = self.raw_data[c["name"]]
-                cat_feats = enc.enc_cat(col)
-                if cat_feats is not None:
-                    self.cat_feat_origin_cards += [(f'{c["name"]}_{i}_{c["type"]}', card) for i, card in enumerate(enc.cat_cards)]
-                    cat_features.append(cat_feats)
-                cont_feats = enc.enc_cont(col)
-                if cont_feats is not None:
-                    self.cont_feat_origin += [c["name"]] * enc.cont_dim
-                    cont_features.append(cont_feats)
-            if cat_features:
-                self.cat_data = torch.cat(cat_features, dim=1)
-            else:
-                self.cat_data = None
-            if cont_features:
-                self.cont_data = torch.cat(cont_features, dim=1)
-            else:
-                self.cont_data = None
-
-    def build_loader(self, batch_size, num_workers, shuffle=False):
-        loader = DataLoader(self, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=True)
-        loader.cat_feat_origin_cards = self.cat_feat_origin_cards
-        return loader
-
-    def __len__(self):
-        return len(self.raw_data)
-
-    def __getitem__(self, idx):
-        target = self.targets[idx] if self.targets is not None else []
-        input = self.cat_data[idx] if self.cat_data is not None else []
-        return input, target
-
-    def data(self):
-        return self.raw_data