autogluon.tabular 1.2.1b20250407__py3-none-any.whl → 1.2.1b20250409__py3-none-any.whl
This diff compares two publicly available package versions released to one of the supported registries. The information is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- autogluon/tabular/register/_ag_model_register.py +0 -2
- autogluon/tabular/version.py +1 -1
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/METADATA +13 -13
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/RECORD +11 -22
- autogluon/tabular/models/tab_transformer/__init__.py +0 -1
- autogluon/tabular/models/tab_transformer/hyperparameters/__init__.py +0 -1
- autogluon/tabular/models/tab_transformer/hyperparameters/parameters.py +0 -66
- autogluon/tabular/models/tab_transformer/hyperparameters/searchspaces.py +0 -17
- autogluon/tabular/models/tab_transformer/modified_transformer.py +0 -494
- autogluon/tabular/models/tab_transformer/pretexts.py +0 -150
- autogluon/tabular/models/tab_transformer/tab_model_base.py +0 -86
- autogluon/tabular/models/tab_transformer/tab_transformer.py +0 -183
- autogluon/tabular/models/tab_transformer/tab_transformer_encoder.py +0 -668
- autogluon/tabular/models/tab_transformer/tab_transformer_model.py +0 -540
- autogluon/tabular/models/tab_transformer/utils.py +0 -124
- /autogluon.tabular-1.2.1b20250407-py3.9-nspkg.pth → /autogluon.tabular-1.2.1b20250409-py3.9-nspkg.pth +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/LICENSE +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/NOTICE +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/WHEEL +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/zip-safe +0 -0
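The content diffs below remove the legacy TabTransformer model, which the removed source registers under the key "TRANSF" (display name "Transformer"). For context only, a minimal, hypothetical sketch of how a model with that key could be requested through TabularPredictor hyperparameters in pre-removal versions; the dataset path and label column are illustrative assumptions, not part of this diff:

# Hypothetical usage sketch (pre-removal versions only): select the TabTransformer
# model via its registered key "TRANSF". "train.csv" and "label" are made-up examples.
from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset("train.csv")
predictor = TabularPredictor(label="label").fit(
    train_data,
    hyperparameters={"TRANSF": {}},  # matches ag_key in the removed tab_transformer_model.py
)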
autogluon/tabular/models/tab_transformer/tab_transformer_model.py
DELETED
@@ -1,540 +0,0 @@
-"""TabTransformer model"""
-from __future__ import annotations
-
-import logging
-import os
-import time
-
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-
-from autogluon.common.features.types import R_OBJECT, S_TEXT_AS_CATEGORY, S_TEXT_NGRAM
-from autogluon.common.utils.try_import import try_import_torch
-from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
-from autogluon.core.models.abstract.abstract_nn_model import AbstractNeuralNetworkModel
-from autogluon.core.utils.loaders import load_pkl
-
-from .hyperparameters.parameters import get_default_param
-from .hyperparameters.searchspaces import get_default_searchspace
-
-logger = logging.getLogger(__name__)
-
-"""
-TODO: Fix Mac OS X warning spam.
-The error message is:
-Cannot set number of intraop threads after parallel work has started or after set_num_threads call when using native parallel backend (function set_num_threads)
-This has been investigated to be a harmless warning for training and running inference on TabTransformer.
-This warning can occur with a very specific environment: torch 1.7, Mac OS X, Python 3.6/3.7, when using torch DataLoader.
-https://github.com/pytorch/pytorch/issues/46409
-"""
-
-
-class TabTransformerModel(AbstractNeuralNetworkModel):
-    """
-    Main TabTransformer model that inherits from AbstractModel.
-
-    This model includes the full torch pipeline (TabNet) and the internal Transformer embeddings (TabTransformer).
-    This file serves as the connection of all these internal models and architectures to AutoGluon.
-
-    TabTransformer uses modifications to the typical Transformer architecture and the pretraining in BERT
-    and applies them to the use case of tabular data. Specifically, this makes TabTransformer suitable for unsupervised
-    training of Tabular data with a subsequent fine-tuning step on labeled data.
-    """
-    ag_key = "TRANSF"
-    ag_name = "Transformer"
-
-    params_file_name = "tab_trans_params.pth"
-
-    def __init__(self, **kwargs):
-        try_import_torch()
-        super().__init__(**kwargs)
-        self._verbosity = None
-        self._temp_file_name = "tab_trans_temp.pth"
-        self._period_columns_mapping = None
-
-    def _set_default_params(self):
-        default_params = get_default_param()
-        for param, val in default_params.items():
-            self._set_default_param_value(param, val)
-
-    def _get_default_auxiliary_params(self) -> dict:
-        default_auxiliary_params = super()._get_default_auxiliary_params()
-        extra_auxiliary_params = dict(
-            ignored_type_group_raw=[R_OBJECT],
-            ignored_type_group_special=[S_TEXT_NGRAM, S_TEXT_AS_CATEGORY],
-        )
-        default_auxiliary_params.update(extra_auxiliary_params)
-        return default_auxiliary_params
-
-    def _get_model(self):
-        from .tab_model_base import TabNet
-
-        # If we have already initialized the model, we don't need to do it again.
-        model = TabNet(self.params["n_classes"], self.params["feature_dim"], self.params["num_output_layers"], self.device, self.params)
-        if self.device.type == "cuda":
-            model = model.cuda()
-
-        return model
-
-    # NOTE: Making an assumption that X_unlabeled will not have a different schema. Otherwise, we would need two
-    # period_columns_mapping fields. One for X/X_val, another for X_unlabeled, which may have different columns.
-    @staticmethod
-    def _get_no_period_columns(columns):
-        # Latest pytorch does not support . in module names. Therefore, we must replace the ".".
-        rename_columns = dict()
-        for col in columns:
-            new_col_name = col
-            if "." in col:
-                new_col_name = col.replace(".", "_")
-
-            if new_col_name in rename_columns:
-                for i in range(1, 100):
-                    append_col_name = new_col_name + "_" + str(i)
-                    if append_col_name not in rename_columns:
-                        new_col_name = append_col_name
-                        break
-                else:
-                    raise RuntimeError("Tried 100 column renames to eliminate duplicates.\n" "Please check similar columns with . or _ in them.")
-
-            # Mapping for every column
-            rename_columns[col] = new_col_name
-
-        return rename_columns
-
-    def _preprocess(self, X, **kwargs):
-        from .utils import TabTransformerDataset
-
-        X = super()._preprocess(X=X, **kwargs)
-
-        X = X.rename(columns=self._period_columns_mapping)
-        encoders = self.params["encoders"]
-        data = TabTransformerDataset(X, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-        data.encode(self.fe)
-
-        return data
-
-    def _preprocess_train(self, X, X_val=None, X_unlabeled=None, fe=None):
-        """
-        Pre-processing specific to TabTransformer. Setting up feature encoders, renaming columns with periods in
-        them (torch), and converting X, X_val, X_unlabeled into TabTransformerDataset's.
-        """
-        from .utils import TabTransformerDataset
-
-        X = self._preprocess_nonadaptive(X)
-        if X_val is not None:
-            X_val = self._preprocess_nonadaptive(X_val)
-        if X_unlabeled is not None:
-            X_unlabeled = self._preprocess_nonadaptive(X_unlabeled)
-
-        self._period_columns_mapping = self._get_no_period_columns(X.columns)
-        X = X.rename(columns=self._period_columns_mapping)
-
-        if X_val is not None:
-            X_val = X_val.rename(columns=self._period_columns_mapping)
-        if X_unlabeled is not None:
-            X_unlabeled = X_unlabeled.rename(columns=self._period_columns_mapping)
-
-        self._types_of_features, _ = self._get_types_of_features(X, needs_extra_types=False)
-
-        # Also need to rename the feature names in the types_of_features dictionary.
-        for feature_dict in self._types_of_features:
-            # Need to check that the value is in the mapping. Otherwise, we could be updating columns that have been dropped.
-            feature_dict.update(("name", self._period_columns_mapping[v]) for k, v in feature_dict.items() if k == "name" and v in self._period_columns_mapping)
-
-        encoders = self.params["encoders"]
-        data = TabTransformerDataset(X, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-        self.fe = fe
-        if self.fe is not None:
-            if X_unlabeled is None:
-                unlab_data = None
-            elif X_unlabeled is not None:
-                unlab_data = TabTransformerDataset(X_unlabeled, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-        if self.fe is None:
-            if X_unlabeled is None:
-                data.fit_feat_encoders()
-                self.fe = data.feature_encoders
-                unlab_data = None
-            elif X_unlabeled is not None:
-                unlab_data = TabTransformerDataset(X_unlabeled, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-                unlab_data.fit_feat_encoders()
-                self.fe = unlab_data.feature_encoders
-
-        data.encode(self.fe)
-
-        if X_val is not None:
-            val_data = TabTransformerDataset(X_val, encoders=encoders, problem_type=self.problem_type, col_info=self._types_of_features)
-            val_data.encode(self.fe)
-        else:
-            val_data = None
-
-        if unlab_data is not None:
-            unlab_data.encode(self.fe)
-
-        return data, val_data, unlab_data
-
-    def _epoch(
-        self, net, loader_train, loader_val, y_val, optimizers, loss_criterion, pretext, state, scheduler, epoch, epochs, databar_disable, reporter, params
-    ):
-        """
-        Helper function to run one epoch of training, essentially the "inner loop" of training.
-        """
-        import torch
-
-        from .utils import augmentation
-
-        is_train = optimizers is not None
-        net.train() if is_train else net.eval()
-        total_loss, total_correct, total_num = 0.0, 0.0, 0
-        data_bar = tqdm(loader_train, disable=databar_disable) if is_train else tqdm(loader_val, disable=databar_disable)
-
-        with torch.enable_grad() if is_train else torch.no_grad():
-            for data, target in data_bar:
-                data, target = pretext.get(data, target)
-
-                if self.device.type == "cuda":
-                    data, target = data.cuda(), target.cuda()
-                    pretext = pretext.cuda()
-
-                if state in [None, "finetune"]:
-                    if self.params["num_augs"] > 0:
-                        data, target = augmentation(data, target, **params)
-                    out, _ = net(data)
-                elif state == "pretrain":
-                    _, out = net(data)
-                else:
-                    raise NotImplementedError("state must be one of [None, 'pretrain', 'finetune']")
-
-                loss, correct = pretext(out, target)
-
-                if is_train:
-                    for optimizer in optimizers:
-                        optimizer.zero_grad()
-                    loss.backward()
-                    for optimizer in optimizers:
-                        optimizer.step()
-
-                total_num += 1
-                total_loss += loss.item()
-
-                if epochs == 1:
-                    train_test = "Test"
-                else:
-                    train_test = "Train"
-
-                val_metric = None
-                if loader_val is not None and state != "pretrain":
-                    val_metric = self.score(X=loader_val, y=y_val, metric=self.stopping_metric)
-                    data_bar.set_description(
-                        "{} Epoch: [{}/{}] Train Loss: {:.4f} Validation {}: {:.2f}".format(
-                            train_test, epoch, epochs, total_loss / total_num, self.stopping_metric.name, val_metric
-                        )
-                    )
-
-                    if reporter is not None:
-                        reporter(epoch=epoch + 1, validation_performance=val_metric, train_loss=total_loss)
-
-                else:
-                    data_bar.set_description("{} Epoch: [{}/{}] Loss: {:.4f}".format(train_test, epoch, epochs, total_loss / total_num))
-
-            return total_loss / total_num, val_metric
-
-        if scheduler is not None:
-            scheduler.step()
-        return total_loss / total_num
-
-    def tt_fit(self, loader_train, loader_val=None, y_val=None, state=None, time_limit=None, reporter=None):
-        """
-        Main training function for TabTransformer
-        "state" must be one of [None, 'pretrain', 'finetune']
-        None: corresponds to purely supervised learning
-        pretrain: discriminative task will be a pretext task
-        finetune: same as supervised learning except that the model base has
-        exponentially decaying learning rate.
-        """
-        import torch
-        import torch.nn as nn
-        import torch.optim as optim
-
-        from . import pretexts
-
-        start_time = time.time()
-        pretext_tasks = pretexts.__dict__
-        optimizers = []
-        lr = self.params["lr"]
-        weight_decay = self.params["weight_decay"]
-        epochs = self.params["pretrain_epochs"] if state == "pretrain" else self.params["epochs"]
-        epochs_wo_improve = self.params["epochs_wo_improve"]
-
-        if state is None:
-            optimizers = [optim.Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay)]
-            pretext = pretext_tasks["SupervisedPretext"](self.problem_type, self.device)
-        elif state == "pretrain":
-            optimizers = [optim.Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay)]
-            pretext = pretext_tasks["BERTPretext"](self.cat_feat_origin_cards, self.device, self.params["hidden_dim"])
-        elif state == "finetune":
-            base_exp_decay = self.params["base_exp_decay"]
-            optimizer_fc = [optim.Adam(fc_layer.parameters(), lr=lr, weight_decay=weight_decay) for fc_layer in self.model.fc]
-            optimizer_embeds = optim.Adam(self.model.embed.parameters(), lr=lr, weight_decay=weight_decay)
-            scheduler = optim.lr_scheduler.ExponentialLR(optimizer_embeds, gamma=base_exp_decay)  # TODO: Should we be using this in _epoch()?
-            optimizers.extend(optimizer_fc)
-            optimizers.append(optimizer_embeds)
-
-            pretext = pretext_tasks["SupervisedPretext"](self.problem_type, self.device)
-
-        else:
-            raise NotImplementedError("state must be one of [None, 'pretrain', 'finetune']")
-
-        if self.problem_type == REGRESSION:
-            loss_criterion = nn.MSELoss()
-        else:
-            loss_criterion = nn.CrossEntropyLoss()
-
-        best_val_metric = -np.inf  # higher = better
-        best_val_epoch = 0
-        best_loss = np.inf
-
-        if self._verbosity <= 1:
-            verbose_eval = -1
-        elif self._verbosity == 2:
-            verbose_eval = 50
-        elif self._verbosity == 3:
-            verbose_eval = 10
-        else:
-            verbose_eval = 1
-
-        if verbose_eval <= 0:
-            databar_disable = True  # Whether or not we want to suppress output based on our verbosity
-        else:
-            databar_disable = False
-
-        for e in range(epochs):
-            if e == 0:
-                logger.log(15, "TabTransformer architecture:")
-                logger.log(15, str(self.model))
-
-            train_loss, val_metric = self._epoch(
-                net=self.model,
-                loader_train=loader_train,
-                loader_val=loader_val,
-                y_val=y_val,
-                optimizers=optimizers,
-                loss_criterion=loss_criterion,
-                pretext=pretext,
-                state=state,
-                scheduler=None,
-                epoch=e,
-                epochs=epochs,
-                databar_disable=databar_disable,
-                reporter=reporter,
-                params=self.params,
-            )
-
-            # Early stopping for pretrain'ing based on loss.
-            if state == "pretrain":
-                if train_loss < best_loss or e == 0:
-                    if train_loss < best_loss:
-                        best_loss = train_loss
-                    best_val_epoch = e
-            else:
-                if val_metric >= best_val_metric or e == 0:
-                    if loader_val is not None:
-                        if not np.isnan(val_metric):
-                            best_val_metric = val_metric
-
-                    best_val_epoch = e
-                    os.makedirs(os.path.dirname(self.path), exist_ok=True)
-                    torch.save(self.model, os.path.join(self.path, self._temp_file_name))  # nosec B614
-
-            # If time limit has exceeded or we haven't improved in some number of epochs, stop early.
-            if e - best_val_epoch > epochs_wo_improve:
-                break
-            if time_limit:
-                time_elapsed = time.time() - start_time
-                time_left = time_limit - time_elapsed
-                if time_left <= 0:
-                    logger.log(20, "\tRan out of time, stopping training early.")
-                    break
-
-        if loader_val is not None:
-            try:
-                self.model = torch.load(os.path.join(self.path, self._temp_file_name))  # nosec B614
-                os.remove(os.path.join(self.path, self._temp_file_name))
-            except:
-                pass
-            logger.log(15, "Best model found in epoch %d" % best_val_epoch)
-
-    def _fit(self, X, y, X_val=None, y_val=None, X_unlabeled=None, time_limit=None, sample_weight=None, reporter=None, **kwargs):
-        import torch
-
-        self._verbosity = kwargs.get("verbosity", 2)
-        num_gpus = kwargs.get("num_gpus", None)
-        if num_gpus is None:
-            if torch.cuda.is_available():
-                self.device = torch.device("cuda")
-            else:
-                self.device = torch.device("cpu")
-        elif num_gpus == 0:
-            self.device = torch.device("cpu")
-        else:
-            self.device = torch.device("cuda")
-
-            if num_gpus > 1:
-                logger.warning("TabTransformer not yet configured to use more than 1 GPU. 'num_gpus' set to >1, but we will be using only 1 GPU.")
-
-        if sample_weight is not None:
-            logger.log(15, "sample_weight not yet supported for TabTransformerModel, this model will ignore them in training.")
-
-        if self.problem_type == REGRESSION:
-            self.params["n_classes"] = 1
-        elif self.problem_type == BINARY:
-            self.params["n_classes"] = 2
-        elif self.problem_type == MULTICLASS:
-            self.params["n_classes"] = y.nunique()
-
-        train, val, unlab = self._preprocess_train(X, X_val, X_unlabeled)
-
-        num_cols = len(train.columns)
-        if num_cols > self.params["max_columns"]:
-            raise NotImplementedError(
-                f"This dataset has {num_cols} columns and exceeds 'max_columns' == {self.params['max_columns']}.\n"
-                f"Which is set by default to ensure the TabTransformer model will not run out of memory.\n"
-                f"If you are confident you will have enough memory, set the 'max_columns' hyperparameter higher and try again.\n"
-            )
-
-        if self.problem_type == REGRESSION:
-            train.targets = torch.FloatTensor(list(y))
-            val.targets = torch.FloatTensor(list(y_val))
-        else:
-            train.targets = torch.LongTensor(list(y))
-            val.targets = torch.LongTensor(list(y_val))
-
-        batch_size = self.params["batch_size"]
-        num_workers = self.params["num_workers"]
-
-        loader_train = train.build_loader(batch_size, num_workers, shuffle=True)
-        loader_val = val.build_loader(batch_size, num_workers)
-        loader_unlab = unlab.build_loader(batch_size, num_workers) if unlab is not None else None
-
-        self.cat_feat_origin_cards = loader_train.cat_feat_origin_cards
-        self.params["cat_feat_origin_cards"] = self.cat_feat_origin_cards
-
-        self.model = self._get_model()
-
-        if X_unlabeled is not None:
-            # Can't spend all the time in pretraining, have to split it up.
-            pretrain_time_limit = time_limit / 2 if time_limit is not None else time_limit
-            pretrain_before_time = time.time()
-            self.tt_fit(loader_unlab, loader_val, y_val, state="pretrain", time_limit=pretrain_time_limit, reporter=reporter)
-            finetune_time_limit = time_limit - (time.time() - pretrain_before_time) if time_limit is not None else time_limit
-            self.tt_fit(loader_train, loader_val, y_val, state="finetune", time_limit=finetune_time_limit, reporter=reporter)
-        else:
-            self.tt_fit(loader_train, loader_val, y_val, time_limit=time_limit, reporter=reporter)
-
-    def _predict_proba(self, X, **kwargs):
-        """
-        X (torch.tensor or pd.dataframe): data for model to give prediction probabilities
-        returns: np.array of k-probabilities for each of the k classes. If k=2 we drop the second probability.
-        """
-        import torch
-        import torch.nn as nn
-        from torch.autograd import Variable
-        from torch.utils.data import DataLoader
-
-        if isinstance(X, pd.DataFrame):
-            # Preprocess here also calls our _preprocess, which creates a TTDataset.
-            X = self.preprocess(X, **kwargs)
-            loader = X.build_loader(self.params["batch_size"], self.params["num_workers"])
-        elif isinstance(X, DataLoader):
-            loader = X
-        elif isinstance(X, torch.Tensor):
-            X = X.rename(columns=self._get_no_period_columns(X))
-            loader = X.build_loader(self.params["batch_size"], self.params["num_workers"])
-        else:
-            raise NotImplementedError(
-                "Attempting to predict against a non-supported data type. \nNeeds to be a pandas DataFrame, torch DataLoader or torch Tensor."
-            )
-
-        self.model.eval()
-        softmax = nn.Softmax(dim=1)
-
-        if self.problem_type == REGRESSION:
-            outputs = torch.zeros([len(loader.dataset), 1])
-        else:
-            outputs = torch.zeros([len(loader.dataset), self.num_classes])
-
-        iter = 0
-        for data, _ in loader:
-            if self.device.type == "cuda":
-                data = data.cuda()
-            with torch.no_grad():
-                data = Variable(data)
-                prob, _ = self.model(data)
-                batch_size = len(prob)
-                if self.problem_type != REGRESSION:
-                    prob = softmax(prob)
-
-            outputs[iter : (iter + batch_size)] = prob
-            iter += batch_size
-
-        if self.problem_type == BINARY:
-            return outputs[:, 1].cpu().numpy()
-        elif self.problem_type == REGRESSION:
-            outputs = outputs.flatten()
-
-        return outputs.cpu().numpy()
-
-    def _get_default_searchspace(self):
-        return get_default_searchspace()
-
-    @classmethod
-    def supported_problem_types(cls) -> list[str] | None:
-        return ["binary", "multiclass", "regression"]
-
-    def save(self, path: str = None, verbose=True) -> str:
-        import torch
-
-        if path is None:
-            path = self.path
-
-        params_filepath = os.path.join(path, self.params_file_name)
-
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-
-        temp_model = self.model
-        if self.model is not None:
-            torch.save(self.model, params_filepath)  # nosec B614
-
-        self.model = None  # Avoiding pickling the weights.
-        modelobj_filepath = super().save(path=path, verbose=verbose)
-
-        self.model = temp_model
-
-        return modelobj_filepath
-
-    @classmethod
-    def load(cls, path: str, reset_paths=False, verbose=True):
-        import torch
-
-        obj: TabTransformerModel = load_pkl.load(path=os.path.join(path, cls.model_file_name), verbose=verbose)
-        if reset_paths:
-            obj.set_contexts(path)
-
-        obj.model = torch.load(os.path.join(path, cls.params_file_name))  # nosec B614
-
-        return obj
-
-"""
-List of features to add (Updated by Anthony Galczak 11-19-20):
-
-1) Allow for saving of pretrained model for future use. This will be done in a future PR as the
-"pretrain API change".
-
-2) Investigate options for when the unlabeled schema does not match the training schema. Currently,
-we do not allow such mismatches and the schemas must match exactly. We can investigate ways to use
-less or more columns from the unlabeled data. This will likely require a design meeting.
-
-3) Bug where HPO doesn't work when cuda is enabled.
-"RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method"
-Update: This will likely be fixed in a future change to HPO in AutoGluon.
-"""
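The removed trainer distinguishes three states in tt_fit (None, "pretrain", "finetune"), and _fit chains the last two when unlabeled data is supplied. A compressed sketch of that schedule, assuming a model object exposing tt_fit with the signature shown above and DataLoaders built elsewhere:

# Sketch of the pretrain-then-finetune schedule in the removed _fit().
# `model`, the loaders, and y_val are assumed to exist; only the control flow is shown.
import time

def fit_with_optional_pretraining(model, loader_train, loader_val, y_val, loader_unlab=None, time_limit=None):
    if loader_unlab is not None:
        # Spend at most half of the time budget on the BERT-style pretext task.
        pretrain_budget = time_limit / 2 if time_limit is not None else None
        start = time.time()
        model.tt_fit(loader_unlab, loader_val, y_val, state="pretrain", time_limit=pretrain_budget)
        # Use whatever time remains for supervised fine-tuning.
        remaining = time_limit - (time.time() - start) if time_limit is not None else None
        model.tt_fit(loader_train, loader_val, y_val, state="finetune", time_limit=remaining)
    else:
        # Purely supervised training corresponds to state=None.
        model.tt_fit(loader_train, loader_val, y_val, time_limit=time_limit)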
autogluon/tabular/models/tab_transformer/utils.py
DELETED
@@ -1,124 +0,0 @@
-import logging
-
-import torch
-from torch.utils.data import DataLoader, Dataset
-
-from autogluon.core.constants import REGRESSION
-
-from . import tab_transformer_encoder
-from .tab_transformer_encoder import NullEnc, WontEncodeError
-
-logger = logging.getLogger(__name__)
-
-
-def augmentation(data, target, **params):
-    shape = data.shape
-    cat_data = torch.cat([data for _ in range(params["num_augs"])])
-    target = torch.cat([target for _ in range(params["num_augs"])]).view(-1)
-    locs_to_mask = torch.empty_like(cat_data, dtype=float).uniform_() < params["aug_mask_prob"]
-    cat_data[locs_to_mask] = 0
-    cat_data = cat_data.view(-1, shape[-1])
-    return cat_data, target
-
-
-def get_col_info(X):
-    """
-    If we somehow do not get col_info when creating a TT dataset, then set every feature type to CATEGORICAL.
-    """
-    cols = list(X.columns)
-    col_info = []
-    for c in cols:
-        col_info.append({"name": c, "type": "CATEGORICAL"})
-    return col_info
-
-
-class TabTransformerDataset(Dataset):
-    def __init__(self, X, encoders, problem_type, y=None, col_info=None):
-        self.encoders = encoders
-        self.col_info = col_info
-
-        self.raw_data = X
-
-        if y is None:
-            self.targets = None
-        elif problem_type == REGRESSION:
-            self.targets = torch.FloatTensor(y)
-        else:
-            self.targets = torch.LongTensor(y)
-
-        if col_info is None:
-            self.columns = get_col_info(X)
-        else:
-            self.columns = self.col_info
-
-        """must be a list of dicts, each dict is of the form {"name": col_name, "type": col_type}
-        where col_name is obtained from the df X, and col_type is CATEGORICAL, TEXT or SCALAR
-        """
-        self.cat_feat_origin_cards = None
-        self.cont_feat_origin = None
-        self.feature_encoders = None
-
-    @property
-    def n_cont_features(self):
-        return len(self.cont_feat_origin) if self.encoders is not None else None
-
-    def fit_feat_encoders(self):
-        if self.encoders is not None:
-            self.feature_encoders = {}
-            for c in self.columns:
-                col = self.raw_data[c["name"]]
-                enc = tab_transformer_encoder.__dict__[self.encoders[c["type"]]]()
-
-                if c["type"] == "SCALAR" and col.nunique() < 32:
-                    logger.log(15, f"Column {c['name']} shouldn't be encoded as SCALAR. Switching to CATEGORICAL.")
-                    enc = tab_transformer_encoder.__dict__[self.encoders["CATEGORICAL"]]()
-                try:
-                    enc.fit(col)
-                except WontEncodeError as e:
-                    logger.log(15, f"Not encoding column '{c['name']}': {e}")
-                    enc = NullEnc()
-                self.feature_encoders[c["name"]] = enc
-
-    def encode(self, feature_encoders):
-        if self.encoders is not None:
-            self.feature_encoders = feature_encoders
-
-            self.cat_feat_origin_cards = []
-            cat_features = []
-            self.cont_feat_origin = []
-            cont_features = []
-            for c in self.columns:
-                enc = feature_encoders[c["name"]]
-                col = self.raw_data[c["name"]]
-                cat_feats = enc.enc_cat(col)
-                if cat_feats is not None:
-                    self.cat_feat_origin_cards += [(f'{c["name"]}_{i}_{c["type"]}', card) for i, card in enumerate(enc.cat_cards)]
-                    cat_features.append(cat_feats)
-                cont_feats = enc.enc_cont(col)
-                if cont_feats is not None:
-                    self.cont_feat_origin += [c["name"]] * enc.cont_dim
-                    cont_features.append(cont_feats)
-            if cat_features:
-                self.cat_data = torch.cat(cat_features, dim=1)
-            else:
-                self.cat_data = None
-            if cont_features:
-                self.cont_data = torch.cat(cont_features, dim=1)
-            else:
-                self.cont_data = None
-
-    def build_loader(self, batch_size, num_workers, shuffle=False):
-        loader = DataLoader(self, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=True)
-        loader.cat_feat_origin_cards = self.cat_feat_origin_cards
-        return loader
-
-    def __len__(self):
-        return len(self.raw_data)
-
-    def __getitem__(self, idx):
-        target = self.targets[idx] if self.targets is not None else []
-        input = self.cat_data[idx] if self.cat_data is not None else []
-        return input, target
-
-    def data(self):
-        return self.raw_data
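The augmentation helper above implements the random masking applied during supervised training and mirrors the masking idea behind the BERT-style pretext task. A self-contained sketch of the same idea on a toy tensor, with made-up num_augs and aug_mask_prob values:

# Standalone sketch of the masking augmentation from the removed utils.py.
# Each row is duplicated num_augs times and entries are zeroed with probability aug_mask_prob;
# index 0 plays the role of the "masked"/unknown category. Parameter values are illustrative.
import torch

def mask_augment(data: torch.Tensor, target: torch.Tensor, num_augs: int = 2, aug_mask_prob: float = 0.4):
    shape = data.shape
    aug = torch.cat([data for _ in range(num_augs)])
    tgt = torch.cat([target for _ in range(num_augs)]).view(-1)
    mask = torch.empty_like(aug, dtype=torch.float).uniform_() < aug_mask_prob
    aug[mask] = 0
    return aug.view(-1, shape[-1]), tgt

x = torch.randint(1, 10, (4, 3))   # 4 rows, 3 encoded categorical features
y = torch.randint(0, 2, (4,))
x_aug, y_aug = mask_augment(x, y)  # 8 rows, same 3 features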
/autogluon.tabular-1.2.1b20250407-py3.9-nspkg.pth → /autogluon.tabular-1.2.1b20250409-py3.9-nspkg.pth
RENAMED
File without changes

{autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/LICENSE
RENAMED
File without changes

{autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/NOTICE
RENAMED
File without changes

{autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/WHEEL
RENAMED
File without changes

{autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/namespace_packages.txt
RENAMED
File without changes

{autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/top_level.txt
RENAMED
File without changes

{autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/zip-safe
RENAMED
File without changes