autogluon.tabular 1.2.1b20250407__py3-none-any.whl → 1.2.1b20250409__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autogluon/tabular/register/_ag_model_register.py +0 -2
- autogluon/tabular/version.py +1 -1
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/METADATA +13 -13
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/RECORD +11 -22
- autogluon/tabular/models/tab_transformer/__init__.py +0 -1
- autogluon/tabular/models/tab_transformer/hyperparameters/__init__.py +0 -1
- autogluon/tabular/models/tab_transformer/hyperparameters/parameters.py +0 -66
- autogluon/tabular/models/tab_transformer/hyperparameters/searchspaces.py +0 -17
- autogluon/tabular/models/tab_transformer/modified_transformer.py +0 -494
- autogluon/tabular/models/tab_transformer/pretexts.py +0 -150
- autogluon/tabular/models/tab_transformer/tab_model_base.py +0 -86
- autogluon/tabular/models/tab_transformer/tab_transformer.py +0 -183
- autogluon/tabular/models/tab_transformer/tab_transformer_encoder.py +0 -668
- autogluon/tabular/models/tab_transformer/tab_transformer_model.py +0 -540
- autogluon/tabular/models/tab_transformer/utils.py +0 -124
- /autogluon.tabular-1.2.1b20250407-py3.9-nspkg.pth → /autogluon.tabular-1.2.1b20250409-py3.9-nspkg.pth +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/LICENSE +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/NOTICE +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/WHEEL +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/zip-safe +0 -0
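In short, this nightly bump removes the entire TabTransformer implementation (every file under autogluon/tabular/models/tab_transformer/) and drops its two registration lines from _ag_model_register.py; the remaining entries are the routine version, METADATA, and RECORD updates. A minimal sketch, not part of the diff, for checking whether an installed build still ships the removed module; the dotted module path is inferred from the file paths above and should be treated as an assumption:

# Hypothetical check, not part of this diff: does an installed autogluon.tabular build
# still ship the tab_transformer module that this release removes?
import importlib.util
from importlib.metadata import version

print(version("autogluon.tabular"))  # e.g. 1.2.1b20250409

spec = importlib.util.find_spec("autogluon.tabular.models.tab_transformer.tab_transformer_model")
print("TabTransformer sources present:", spec is not None)

On a 1.2.1b20250407 install the find_spec call should resolve to a module spec; on 1.2.1b20250409 it should return None.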
autogluon/tabular/models/tab_transformer/modified_transformer.py
@@ -1,494 +0,0 @@
-"""
-This code is a modification of the official PyTorch Transformer code found at:
-https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/transformer.py
-
-The modification allows the option of fixing the attention map
-"""
-
-import math
-from typing import Optional
-
-import torch
-import torch.nn.functional as F
-
-# Needed for pytorch 1.7 and 1.2
-try:
-    from torch.overrides import handle_torch_function, has_torch_function
-# Needed for pytorch 1.6
-except ImportError:
-    from torch._overrides import handle_torch_function, has_torch_function
-
-from torch.nn import Module, init
-from torch.nn.functional import dropout, linear, softmax
-from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
-from torch.nn.modules.dropout import Dropout
-from torch.nn.modules.normalization import LayerNorm
-from torch.nn.parameter import Parameter
-
-
-def multi_head_attention_forward(
-    self,
-    query,  # type: Tensor
-    key,  # type: Tensor
-    value,  # type: Tensor
-    embed_dim_to_check,  # type: int
-    num_heads,  # type: int
-    in_proj_weight,  # type: Tensor
-    in_proj_bias,  # type: Tensor
-    bias_k,  # type: Optional[Tensor]
-    bias_v,  # type: Optional[Tensor]
-    add_zero_attn,  # type: bool
-    dropout_p,  # type: float
-    out_proj_weight,  # type: Tensor
-    out_proj_bias,
-    fixed_k=None,  # type: Tensor
-    fixed_q=None,  # type: Tensor
-    training=True,  # type: bool
-    key_padding_mask=None,  # type: Optional[Tensor]
-    need_weights=True,  # type: bool
-    attn_mask=None,  # type: Optional[Tensor]
-    use_separate_proj_weight=False,  # type: bool
-    q_proj_weight=None,  # type: Optional[Tensor]
-    k_proj_weight=None,  # type: Optional[Tensor]
-    v_proj_weight=None,  # type: Optional[Tensor]
-    static_k=None,  # type: Optional[Tensor]
-    static_v=None,  # type: Optional[Tensor]
-):
-    # type: (...) -> tuple[Tensor, Optional[Tensor]]
-    """
-    Args:
-        query, key, value: map a query and a set of key-value pairs to an output.
-            See "Attention Is All You Need" for more details.
-        embed_dim_to_check: total dimension of the model.
-        num_heads: parallel attention heads.
-        in_proj_weight, in_proj_bias: input projection weight and bias.
-        bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
-        add_zero_attn: add a new batch of zeros to the key and
-                       value sequences at dim=1.
-        dropout_p: probability of an element to be zeroed.
-        out_proj_weight, out_proj_bias: the output projection weight and bias.
-        training: apply dropout if is ``True``.
-        key_padding_mask: if provided, specified padding elements in the key will
-            be ignored by the attention. This is an binary mask. When the value is True,
-            the corresponding value on the attention layer will be filled with -inf.
-        need_weights: output attn_output_weights.
-        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
-            the batches while a 3D mask allows to specify a different mask for the entries of each batch.
-        use_separate_proj_weight: the function accept the proj. weights for query, key,
-            and value in different forms. If false, in_proj_weight will be used, which is
-            a combination of q_proj_weight, k_proj_weight, v_proj_weight.
-        q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
-        static_k, static_v: static key and value used for attention operators.
-    Shape:
-        Inputs:
-        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
-          the embedding dimension.
-        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
-          the embedding dimension.
-        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
-          the embedding dimension.
-        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
-          If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
-          will be unchanged. If a BoolTensor is provided, the positions with the
-          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
-        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
-          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
-          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
-          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
-          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
-          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
-          is provided, it will be added to the attention weight.
-        - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
-          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
-        - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
-          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
-        Outputs:
-        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
-          E is the embedding dimension.
-        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
-          L is the target sequence length, S is the source sequence length.
-    """
-
-    if not torch.jit.is_scripting():
-        tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
-        if any([type(t) is not torch.Tensor for t in tens_ops]) and has_torch_function(tens_ops):
-            return handle_torch_function(
-                self.multi_head_attention_forward,
-                tens_ops,
-                query,
-                key,
-                value,
-                embed_dim_to_check,
-                num_heads,
-                in_proj_weight,
-                in_proj_bias,
-                bias_k,
-                bias_v,
-                add_zero_attn,
-                dropout_p,
-                out_proj_weight,
-                out_proj_bias,
-                training=training,
-                key_padding_mask=key_padding_mask,
-                need_weights=need_weights,
-                attn_mask=attn_mask,
-                use_separate_proj_weight=use_separate_proj_weight,
-                q_proj_weight=q_proj_weight,
-                k_proj_weight=k_proj_weight,
-                v_proj_weight=v_proj_weight,
-                static_k=static_k,
-                static_v=static_v,
-            )
-    tgt_len, bsz, embed_dim = query.size()
-
-    assert embed_dim == embed_dim_to_check
-    # allow MHA to have different sizes for the feature dimension
-    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
-
-    head_dim = embed_dim // num_heads
-    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
-    scaling = float(head_dim) ** -0.5
-
-    # self-attention
-    # q = linear(query, in_proj_weight, in_proj_bias) #.chunk(1, dim=-1)
-    # we assume we are in the case key==value==query
-    v = linear(query, in_proj_weight, in_proj_bias)  # .chunk(2, dim=-1)
-
-    k = torch.cat([fixed_k.unsqueeze(1) for _ in range(key.shape[1])], dim=1)
-    q = torch.cat([fixed_q.unsqueeze(1) for _ in range(key.shape[1])], dim=1)
-    q = q * scaling
-
-    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
-    k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
-    v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
-
-    src_len = k.size(1)
-
-    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
-
-    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
-
-    attn_output_weights = softmax(attn_output_weights, dim=-1)
-    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)
-    attn_output = torch.bmm(attn_output_weights, v)
-
-    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
-    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
-    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
-
-    if need_weights:
-        # average attention weights over heads
-        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
-        return attn_output, attn_output_weights.sum(dim=1) / num_heads
-    else:
-        return attn_output, None
-
-
-class MultiheadAttention(Module):
-    """Allows the model to jointly attend to information
-    from different representation subspaces.
-    See reference: Attention Is All You Need
-
-    .. math::
-        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
-        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
-
-    Args:
-        embed_dim: total dimension of the model.
-        num_heads: parallel attention heads.
-        dropout: a Dropout layer on attn_output_weights. Default: 0.0.
-        bias: add bias as module parameter. Default: True.
-        add_bias_kv: add bias to the key and value sequences at dim=0.
-        add_zero_attn: add a new batch of zeros to the key and
-                       value sequences at dim=1.
-        kdim: total number of features in key. Default: None.
-        vdim: total number of features in value. Default: None.
-
-        Note: if kdim and vdim are None, they will be set to embed_dim such that
-        query, key, and value have the same number of features.
-
-    Examples::
-
-        >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
-        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
-    """
-
-    __annotations__ = {
-        "bias_k": torch._jit_internal.Optional[torch.Tensor],
-        "bias_v": torch._jit_internal.Optional[torch.Tensor],
-    }
-
-    def __init__(self, embed_dim, n_cat_embeddings, num_heads, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
-        super().__init__()
-
-        self.embed_dim = embed_dim
-        self.kdim = kdim if kdim is not None else embed_dim
-        self.vdim = vdim if vdim is not None else embed_dim
-        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
-
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
-
-        if self._qkv_same_embed_dim is False:
-            self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
-            self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
-            self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
-            self.register_parameter("in_proj_weight", None)
-            self.register_parameter("fixed_k", None)
-        else:
-            self.in_proj_weight = Parameter(torch.empty(embed_dim, embed_dim))
-            # self.in_proj_weight = Parameter(torch.empty(2 * embed_dim, embed_dim))
-            self.fixed_k = Parameter(torch.empty(n_cat_embeddings, embed_dim))
-            self.fixed_q = Parameter(torch.empty(n_cat_embeddings, embed_dim))
-            self.register_parameter("q_proj_weight", None)
-            self.register_parameter("k_proj_weight", None)
-            self.register_parameter("v_proj_weight", None)
-
-        if bias:
-            self.in_proj_bias = Parameter(torch.empty(embed_dim))
-        else:
-            self.register_parameter("in_proj_bias", None)
-        self.out_proj = _LinearWithBias(embed_dim, embed_dim)
-
-        if add_bias_kv:
-            self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
-            self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
-        else:
-            self.bias_k = self.bias_v = None
-
-        self.add_zero_attn = add_zero_attn
-
-        self._reset_parameters()
-
-    def _reset_parameters(self):
-        if self._qkv_same_embed_dim:
-            xavier_uniform_(self.in_proj_weight)
-            xavier_uniform_(self.fixed_k)
-            xavier_uniform_(self.fixed_q)
-        else:
-            xavier_uniform_(self.q_proj_weight)
-            xavier_uniform_(self.k_proj_weight)
-            xavier_uniform_(self.v_proj_weight)
-
-        if self.in_proj_bias is not None:
-            constant_(self.in_proj_bias, 0.0)
-            constant_(self.out_proj.bias, 0.0)
-        if self.bias_k is not None:
-            xavier_normal_(self.bias_k)
-        if self.bias_v is not None:
-            xavier_normal_(self.bias_v)
-
-    def __setstate__(self, state):
-        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
-        if "_qkv_same_embed_dim" not in state:
-            state["_qkv_same_embed_dim"] = True
-
-        super().__setstate__(state)
-
-    def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None):
-        # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> tuple[Tensor, Optional[Tensor]]
-        """
-        Args:
-            query, key, value: map a query and a set of key-value pairs to an output.
-                See "Attention Is All You Need" for more details.
-            key_padding_mask: if provided, specified padding elements in the key will
-                be ignored by the attention. When given a binary mask and a value is True,
-                the corresponding value on the attention layer will be ignored. When given
-                a byte mask and a value is non-zero, the corresponding value on the attention
-                layer will be ignored
-            need_weights: output attn_output_weights.
-            attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
-                the batches while a 3D mask allows to specify a different mask for the entries of each batch.
-
-        Shape:
-            - Inputs:
-            - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
-              the embedding dimension.
-            - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
-              the embedding dimension.
-            - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
-              the embedding dimension.
-            - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
-              If a ByteTensor is provided, the non-zero positions will be ignored while the position
-              with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the
-              value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
-            - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
-              3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
-              S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked
-              positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
-              while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
-              is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
-              is provided, it will be added to the attention weight.
-
-            - Outputs:
-            - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
-              E is the embedding dimension.
-            - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
-              L is the target sequence length, S is the source sequence length.
-        """
-
-        return multi_head_attention_forward(
-            self,
-            query=query,
-            key=key,
-            value=value,
-            embed_dim_to_check=self.embed_dim,
-            num_heads=self.num_heads,
-            in_proj_weight=self.in_proj_weight,
-            in_proj_bias=self.in_proj_bias,
-            bias_k=self.bias_k,
-            bias_v=self.bias_v,
-            add_zero_attn=self.add_zero_attn,
-            dropout_p=self.dropout,
-            out_proj_weight=self.out_proj.weight,
-            out_proj_bias=self.out_proj.bias,
-            fixed_k=self.fixed_k,
-            fixed_q=self.fixed_q,
-            training=self.training,
-            key_padding_mask=key_padding_mask,
-            need_weights=need_weights,
-            attn_mask=attn_mask,
-        )
-
-
-class Linear(Module):
-    r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
-    Args:
-        in_features: size of each input sample
-        out_features: size of each output sample
-        bias: If set to ``False``, the layer will not learn an additive bias.
-            Default: ``True``
-    Shape:
-        - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
-          additional dimensions and :math:`H_{in} = \text{in\_features}`
-        - Output: :math:`(N, *, H_{out})` where all but the last dimension
-          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.
-    Attributes:
-        weight: the learnable weights of the module of shape
-            :math:`(\text{out\_features}, \text{in\_features})`. The values are
-            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
-            :math:`k = \frac{1}{\text{in\_features}}`
-        bias: the learnable bias of the module of shape :math:`(\text{out\_features})`.
-            If :attr:`bias` is ``True``, the values are initialized from
-            :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
-            :math:`k = \frac{1}{\text{in\_features}}`
-    Examples::
-        >>> m = nn.Linear(20, 30)
-        >>> input = torch.randn(128, 20)
-        >>> output = m(input)
-        >>> print(output.size())
-        torch.Size([128, 30])
-    """
-
-    __constants__ = ["in_features", "out_features"]
-    in_features: int
-    out_features: int
-    weight: torch.Tensor
-
-    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
-        super().__init__()
-
-        self.in_features = in_features
-        self.out_features = out_features
-        self.weight = Parameter(torch.Tensor(out_features, in_features))
-        if bias:
-            self.bias = Parameter(torch.Tensor(out_features))
-        else:
-            self.register_parameter("bias", None)
-        self.reset_parameters()
-
-    def reset_parameters(self) -> None:
-        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
-        if self.bias is not None:
-            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
-            bound = 1 / math.sqrt(fan_in)
-            init.uniform_(self.bias, -bound, bound)
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        return F.linear(input, self.weight, self.bias)
-
-    def extra_repr(self) -> str:
-        return "in_features={}, out_features={}, bias={}".format(self.in_features, self.out_features, self.bias is not None)
-
-
-# This class exists solely for Transformer; it has an annotation stating
-# that bias is never None, which appeases TorchScript
-class _LinearWithBias(Linear):
-    bias: torch.Tensor
-
-    def __init__(self, in_features: int, out_features: int) -> None:
-        super().__init__(in_features, out_features, bias=True)
-
-
-class TransformerEncoderLayerModified(Module):
-    """TransformerEncoderLayer is made up of self-attn and feedforward network.
-    This standard encoder layer is based on the paper "Attention Is All You Need".
-    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
-    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
-    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
-    in a different way during application.
-
-    Args:
-        d_model: the number of expected features in the input (required).
-        nhead: the number of heads in the multiheadattention models (required).
-        dim_feedforward: the dimension of the feedforward network model (default=2048).
-        dropout: the dropout value (default=0.1).
-        activation: the activation function of intermediate layer, relu or gelu (default=relu).
-
-    Examples::
-        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
-        >>> src = torch.rand(10, 32, 512)
-        >>> out = encoder_layer(src)
-    """
-
-    def __init__(self, d_model, n_cat_embeddings, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
-        super().__init__()
-
-        self.self_attn = MultiheadAttention(d_model, n_cat_embeddings, nhead, dropout=dropout)
-
-        # Implementation of Feedforward model
-        self.linear1 = Linear(d_model, dim_feedforward)
-        self.dropout = Dropout(dropout)
-        self.linear2 = Linear(dim_feedforward, d_model)
-
-        self.norm1 = LayerNorm(d_model)
-        self.norm2 = LayerNorm(d_model)
-        self.dropout1 = Dropout(dropout)
-        self.dropout2 = Dropout(dropout)
-
-        self.activation = _get_activation_fn(activation)
-
-    def __setstate__(self, state):
-        if "activation" not in state:
-            state["activation"] = F.relu
-        super().__setstate__(state)
-
-    def forward(self, src: torch.Tensor, src_mask: Optional[torch.Tensor] = None, src_key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Pass the input through the encoder layer.
-
-        Args:
-            src: the sequence to the encoder layer (required).
-            src_mask: the mask for the src sequence (optional).
-            src_key_padding_mask: the mask for the src keys per batch (optional).
-
-        Shape:
-            see the docs in Transformer class.
-        """
-        src2 = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
-        src = src + self.dropout1(src2)
-        src = self.norm1(src)
-        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-        src = src + self.dropout2(src2)
-        src = self.norm2(src)
-        return src
-
-
-def _get_activation_fn(activation):
-    if activation == "relu":
-        return F.relu
-    elif activation == "gelu":
-        return F.gelu
-
-    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
autogluon/tabular/models/tab_transformer/pretexts.py
@@ -1,150 +0,0 @@
-from copy import deepcopy
-
-import numpy as np
-import torch
-import torch.nn as nn
-
-from autogluon.core.constants import REGRESSION
-
-"""
-possible TODO: although there is a supervised pretext option below, i.e. pretrain using
-labeled data presumably from a different but related task, currently this cannot be handled
-by the AG API. One simple way to handle it would be to have an optional "pretext label"
-argument to task.fit that tells the pretraining module to use the supervised pretext task
-with that particular column as the label.
-"""
-
-
-class SupervisedPretext(nn.Module):
-    # holder class to handle supervised pretraining.
-    def __init__(self, problem_type, device):
-        super().__init__()
-        self.device = device
-        self.loss_funct = nn.MSELoss() if problem_type == REGRESSION else nn.CrossEntropyLoss()
-
-    def forward(self, out, target):
-        loss = self.loss_funct(out, target)
-        pred = out.max(dim=1, keepdim=True)[1]
-        correct = pred.eq(target.view_as(pred)).sum()
-        correct = correct.float()
-        correct = correct / out.shape[0]
-        return loss, correct
-
-    def get(self, data, target):
-        data = data.to(self.device, non_blocking=True)
-        target = target.to(self.device, non_blocking=True)
-
-        return data, target
-
-
-class BERTPretext(nn.Module):
-    """
-    This is the current default pretext task module.
-
-    Functionality:
-
-    self.get:
-        inputs: (data, target) (target will often be None)
-        outputs: (pretext_data, pretext_target)
-
-        called before the forward pass through the TabTransformer to create the
-        input and label for a BERT-style pretext task.
-
-    self.forward:
-        inputs: out (embedding for TabTransformer), target (pretext task label)
-        outputs: loss, % accuracy on pretext task
-
-        given the embedding it passes it through a classifier which learns to
-        predict the pretext label.
-    """
-
-    def __init__(self, cat_feat_origin_cards, device, hidden_dim, replacement_noise="random", p_replace=0.3):
-        super().__init__()
-        self.cat_feat_origin_cards = cat_feat_origin_cards
-        self.device = device
-        self.hidden_dim = hidden_dim
-        self.loss_funct = nn.CrossEntropyLoss()
-        self.p_replace = p_replace
-        self.predicters = nn.ModuleList()
-        self.n_cat_feats = len(cat_feat_origin_cards)
-        self.replacement_noise = replacement_noise
-
-        for col in range(self.n_cat_feats):
-            lin = nn.Linear(self.hidden_dim, 2)
-            self.predicters.append(lin)
-
-    def forward(self, out, target):
-        prob = torch.cat([self.predicters[col](out[:, col, :]).unsqueeze(1) for col in range(self.n_cat_feats)], dim=1)
-        prob = prob.view(-1, 2)
-        target = target.view(-1)
-
-        loss = self.loss_funct(prob, target)
-        pred = prob.max(dim=1, keepdim=True)[1]
-        correct = pred.eq(target.view_as(pred)).sum()
-        correct = correct.float()
-        correct = correct / pred.shape[0]
-
-        return loss, correct
-
-    def get(self, data, target):
-        cat_feats = data
-
-        orig_cat_feats = deepcopy(cat_feats.detach())
-
-        if self.replacement_noise == "swap":
-            n_cat = cat_feats.shape[1]
-            cols_to_shuffle = np.random.choice(n_cat, int(self.p_replace * n_cat), replace=False)
-            for col in cols_to_shuffle:
-                cat_feats[:, col] = cat_feats[:, col][torch.randperm(cat_feats.shape[0])]
-
-        elif self.replacement_noise == "random":
-            locs_to_replace = torch.empty_like(cat_feats, dtype=float).uniform_() < self.p_replace
-            col_cardinalities = torch.LongTensor([i[1] for i in self.cat_feat_origin_cards]).to(cat_feats)
-            col_cardinalities = col_cardinalities.unsqueeze(0).expand_as(cat_feats)
-
-            unif = torch.rand(cat_feats.shape, device=col_cardinalities.device)
-            random_feats = (unif * col_cardinalities).floor().to(torch.int64) + 1  # + 1 since 0 is the padding value
-
-            extra_replace = torch.mul((cat_feats == random_feats).to(int), locs_to_replace.to(int)).to(torch.bool)
-            cat_feats[locs_to_replace] = random_feats[locs_to_replace]
-
-            assert torch.all(cat_feats[extra_replace] == orig_cat_feats[extra_replace]).item() is True
-            extra_plus1 = cat_feats[extra_replace] + 1
-            extra_minus1 = cat_feats[extra_replace] - 1
-            extra_zero_pad_idx = extra_minus1 == 0
-            extra_minus1[extra_zero_pad_idx] = extra_plus1[extra_zero_pad_idx]
-
-            cat_feats[extra_replace] = extra_minus1
-            assert torch.all(~(cat_feats[extra_replace] == orig_cat_feats[extra_replace])).item() is True
-
-        elif self.replacement_noise == "low_rank":
-            assert self.p_replace + 0.2 <= 1, "p_replace too big, lower it!"
-            weights = torch.tensor([self.p_replace, 0.1, 0.9 - self.p_replace], dtype=torch.float)  # 0=pad, 1=replace with random value, 2=dont change
-
-            locs_to_change = torch.multinomial(weights, np.prod(cat_feats.shape), replacement=True).view(cat_feats.shape)
-            col_cardinalities = torch.LongTensor([i[1] for i in self.cat_feat_origin_cards]).to(cat_feats)
-            col_cardinalities = col_cardinalities.unsqueeze(0).expand_as(cat_feats)
-
-            unif = torch.rand(cat_feats.shape, device=col_cardinalities.device)
-            random_feats = (unif * col_cardinalities).floor().to(torch.int64) + 1  # + 1 since 0 is the padding value
-
-            extra_replace = torch.mul((cat_feats == random_feats).to(int), (locs_to_change == 1).to(int)).to(torch.bool)
-            cat_feats[locs_to_change == 1] = random_feats[locs_to_change == 1]
-            cat_feats[locs_to_change == 0] = 0
-
-            assert torch.all(cat_feats[extra_replace] == orig_cat_feats[extra_replace]).item() is True
-            extra_plus1 = cat_feats[extra_replace] + 1
-            extra_minus1 = cat_feats[extra_replace] - 1
-            extra_zero_pad_idx = extra_minus1 == 0
-            extra_minus1[extra_zero_pad_idx] = extra_plus1[extra_zero_pad_idx]
-
-            cat_feats[extra_replace] = extra_minus1
-            assert torch.all(~(cat_feats[extra_replace] == orig_cat_feats[extra_replace])).item() is True
-
-        pretext_label = (cat_feats != orig_cat_feats).long()
-        pretext_data = cat_feats
-
-        pretext_data = pretext_data.to(self.device, non_blocking=True)
-        pretext_label = pretext_label.to(self.device, non_blocking=True)
-
-        return pretext_data, pretext_label