autogluon.tabular 1.2.1b20250407__py3-none-any.whl → 1.2.1b20250409__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
Files changed (22)
  1. autogluon/tabular/register/_ag_model_register.py +0 -2
  2. autogluon/tabular/version.py +1 -1
  3. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/METADATA +13 -13
  4. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/RECORD +11 -22
  5. autogluon/tabular/models/tab_transformer/__init__.py +0 -1
  6. autogluon/tabular/models/tab_transformer/hyperparameters/__init__.py +0 -1
  7. autogluon/tabular/models/tab_transformer/hyperparameters/parameters.py +0 -66
  8. autogluon/tabular/models/tab_transformer/hyperparameters/searchspaces.py +0 -17
  9. autogluon/tabular/models/tab_transformer/modified_transformer.py +0 -494
  10. autogluon/tabular/models/tab_transformer/pretexts.py +0 -150
  11. autogluon/tabular/models/tab_transformer/tab_model_base.py +0 -86
  12. autogluon/tabular/models/tab_transformer/tab_transformer.py +0 -183
  13. autogluon/tabular/models/tab_transformer/tab_transformer_encoder.py +0 -668
  14. autogluon/tabular/models/tab_transformer/tab_transformer_model.py +0 -540
  15. autogluon/tabular/models/tab_transformer/utils.py +0 -124
  16. /autogluon.tabular-1.2.1b20250407-py3.9-nspkg.pth → /autogluon.tabular-1.2.1b20250409-py3.9-nspkg.pth +0 -0
  17. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/LICENSE +0 -0
  18. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/NOTICE +0 -0
  19. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/WHEEL +0 -0
  20. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/namespace_packages.txt +0 -0
  21. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/top_level.txt +0 -0
  22. {autogluon.tabular-1.2.1b20250407.dist-info → autogluon.tabular-1.2.1b20250409.dist-info}/zip-safe +0 -0
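The functional change in this release is the removal of the TabTransformer model code (autogluon/tabular/models/tab_transformer/) together with two lines in _ag_model_register.py (presumably its registration entry), plus the usual version and metadata bumps. A quick way to confirm which build is installed locally, using only the standard library (this snippet is illustrative and not part of the diff):

    from importlib.metadata import version

    # The distribution name matches the wheel name shown above.
    print(version("autogluon.tabular"))  # expected to print 1.2.1b20250409 for the newer wheel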
autogluon/tabular/models/tab_transformer/modified_transformer.py
@@ -1,494 +0,0 @@
- """
- This code is a modification of the official PyTorch Transformer code found at:
- https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/transformer.py
-
- The modification allows the option of fixing the attention map
- """
-
- import math
- from typing import Optional
-
- import torch
- import torch.nn.functional as F
-
- # Needed for pytorch 1.7 and 1.2
- try:
-     from torch.overrides import handle_torch_function, has_torch_function
- # Needed for pytorch 1.6
- except ImportError:
-     from torch._overrides import handle_torch_function, has_torch_function
-
- from torch.nn import Module, init
- from torch.nn.functional import dropout, linear, softmax
- from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
- from torch.nn.modules.dropout import Dropout
- from torch.nn.modules.normalization import LayerNorm
- from torch.nn.parameter import Parameter
-
-
- def multi_head_attention_forward(
-     self,
-     query, # type: Tensor
-     key, # type: Tensor
-     value, # type: Tensor
-     embed_dim_to_check, # type: int
-     num_heads, # type: int
-     in_proj_weight, # type: Tensor
-     in_proj_bias, # type: Tensor
-     bias_k, # type: Optional[Tensor]
-     bias_v, # type: Optional[Tensor]
-     add_zero_attn, # type: bool
-     dropout_p, # type: float
-     out_proj_weight, # type: Tensor
-     out_proj_bias,
-     fixed_k=None, # type: Tensor
-     fixed_q=None, # type: Tensor
-     training=True, # type: bool
-     key_padding_mask=None, # type: Optional[Tensor]
-     need_weights=True, # type: bool
-     attn_mask=None, # type: Optional[Tensor]
-     use_separate_proj_weight=False, # type: bool
-     q_proj_weight=None, # type: Optional[Tensor]
-     k_proj_weight=None, # type: Optional[Tensor]
-     v_proj_weight=None, # type: Optional[Tensor]
-     static_k=None, # type: Optional[Tensor]
-     static_v=None, # type: Optional[Tensor]
- ):
-     # type: (...) -> tuple[Tensor, Optional[Tensor]]
-     """
-     Args:
-         query, key, value: map a query and a set of key-value pairs to an output.
-             See "Attention Is All You Need" for more details.
-         embed_dim_to_check: total dimension of the model.
-         num_heads: parallel attention heads.
-         in_proj_weight, in_proj_bias: input projection weight and bias.
-         bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
-         add_zero_attn: add a new batch of zeros to the key and
-             value sequences at dim=1.
-         dropout_p: probability of an element to be zeroed.
-         out_proj_weight, out_proj_bias: the output projection weight and bias.
-         training: apply dropout if is ``True``.
-         key_padding_mask: if provided, specified padding elements in the key will
-             be ignored by the attention. This is an binary mask. When the value is True,
-             the corresponding value on the attention layer will be filled with -inf.
-         need_weights: output attn_output_weights.
-         attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
-             the batches while a 3D mask allows to specify a different mask for the entries of each batch.
-         use_separate_proj_weight: the function accept the proj. weights for query, key,
-             and value in different forms. If false, in_proj_weight will be used, which is
-             a combination of q_proj_weight, k_proj_weight, v_proj_weight.
-         q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
-         static_k, static_v: static key and value used for attention operators.
-     Shape:
-         Inputs:
-         - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
-             the embedding dimension.
-         - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
-             the embedding dimension.
-         - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
-             the embedding dimension.
-         - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
-             If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
-             will be unchanged. If a BoolTensor is provided, the positions with the
-             value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
-         - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
-             3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
-             S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
-             positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
-             while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
-             are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
-             is provided, it will be added to the attention weight.
-         - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
-             N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
-         - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
-             N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
-         Outputs:
-         - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
-             E is the embedding dimension.
-         - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
-             L is the target sequence length, S is the source sequence length.
-     """
-
-     if not torch.jit.is_scripting():
-         tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
-         if any([type(t) is not torch.Tensor for t in tens_ops]) and has_torch_function(tens_ops):
-             return handle_torch_function(
-                 self.multi_head_attention_forward,
-                 tens_ops,
-                 query,
-                 key,
-                 value,
-                 embed_dim_to_check,
-                 num_heads,
-                 in_proj_weight,
-                 in_proj_bias,
-                 bias_k,
-                 bias_v,
-                 add_zero_attn,
-                 dropout_p,
-                 out_proj_weight,
-                 out_proj_bias,
-                 training=training,
-                 key_padding_mask=key_padding_mask,
-                 need_weights=need_weights,
-                 attn_mask=attn_mask,
-                 use_separate_proj_weight=use_separate_proj_weight,
-                 q_proj_weight=q_proj_weight,
-                 k_proj_weight=k_proj_weight,
-                 v_proj_weight=v_proj_weight,
-                 static_k=static_k,
-                 static_v=static_v,
-             )
-     tgt_len, bsz, embed_dim = query.size()
-
-     assert embed_dim == embed_dim_to_check
-     # allow MHA to have different sizes for the feature dimension
-     assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
-
-     head_dim = embed_dim // num_heads
-     assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
-     scaling = float(head_dim) ** -0.5
-
-     # self-attention
-     # q = linear(query, in_proj_weight, in_proj_bias) #.chunk(1, dim=-1)
-     # we assume we are in the case key==value==query
-     v = linear(query, in_proj_weight, in_proj_bias) # .chunk(2, dim=-1)
-
-     k = torch.cat([fixed_k.unsqueeze(1) for _ in range(key.shape[1])], dim=1)
-     q = torch.cat([fixed_q.unsqueeze(1) for _ in range(key.shape[1])], dim=1)
-     q = q * scaling
-
-     q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
-     k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
-     v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
-
-     src_len = k.size(1)
-
-     attn_output_weights = torch.bmm(q, k.transpose(1, 2))
-
-     assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
-
-     attn_output_weights = softmax(attn_output_weights, dim=-1)
-     attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)
-     attn_output = torch.bmm(attn_output_weights, v)
-
-     assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
-     attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
-     attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
-
-     if need_weights:
-         # average attention weights over heads
-         attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
-         return attn_output, attn_output_weights.sum(dim=1) / num_heads
-     else:
-         return attn_output, None
-
-
- class MultiheadAttention(Module):
-     """Allows the model to jointly attend to information
-     from different representation subspaces.
-     See reference: Attention Is All You Need
-
-     .. math::
-         text{MultiHead}(Q, K, V) = text{Concat}(head_1,dots,head_h)W^O
-         text{where} head_i = text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
-
-     Args:
-         embed_dim: total dimension of the model.
-         num_heads: parallel attention heads.
-         dropout: a Dropout layer on attn_output_weights. Default: 0.0.
-         bias: add bias as module parameter. Default: True.
-         add_bias_kv: add bias to the key and value sequences at dim=0.
-         add_zero_attn: add a new batch of zeros to the key and
-             value sequences at dim=1.
-         kdim: total number of features in key. Default: None.
-         vdim: total number of features in value. Default: None.
-
-     Note: if kdim and vdim are None, they will be set to embed_dim such that
-         query, key, and value have the same number of features.
-
-     Examples::
-
-         >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
-         >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
-     """
-
-     __annotations__ = {
-         "bias_k": torch._jit_internal.Optional[torch.Tensor],
-         "bias_v": torch._jit_internal.Optional[torch.Tensor],
-     }
-
-     def __init__(self, embed_dim, n_cat_embeddings, num_heads, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
-         super().__init__()
-
-         self.embed_dim = embed_dim
-         self.kdim = kdim if kdim is not None else embed_dim
-         self.vdim = vdim if vdim is not None else embed_dim
-         self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
-
-         self.num_heads = num_heads
-         self.dropout = dropout
-         self.head_dim = embed_dim // num_heads
-         assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
-
-         if self._qkv_same_embed_dim is False:
-             self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
-             self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
-             self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
-             self.register_parameter("in_proj_weight", None)
-             self.register_parameter("fixed_k", None)
-         else:
-             self.in_proj_weight = Parameter(torch.empty(embed_dim, embed_dim))
-             # self.in_proj_weight = Parameter(torch.empty(2 * embed_dim, embed_dim))
-             self.fixed_k = Parameter(torch.empty(n_cat_embeddings, embed_dim))
-             self.fixed_q = Parameter(torch.empty(n_cat_embeddings, embed_dim))
-             self.register_parameter("q_proj_weight", None)
-             self.register_parameter("k_proj_weight", None)
-             self.register_parameter("v_proj_weight", None)
-
-         if bias:
-             self.in_proj_bias = Parameter(torch.empty(embed_dim))
-         else:
-             self.register_parameter("in_proj_bias", None)
-         self.out_proj = _LinearWithBias(embed_dim, embed_dim)
-
-         if add_bias_kv:
-             self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
-             self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
-         else:
-             self.bias_k = self.bias_v = None
-
-         self.add_zero_attn = add_zero_attn
-
-         self._reset_parameters()
-
-     def _reset_parameters(self):
-         if self._qkv_same_embed_dim:
-             xavier_uniform_(self.in_proj_weight)
-             xavier_uniform_(self.fixed_k)
-             xavier_uniform_(self.fixed_q)
-         else:
-             xavier_uniform_(self.q_proj_weight)
-             xavier_uniform_(self.k_proj_weight)
-             xavier_uniform_(self.v_proj_weight)
-
-         if self.in_proj_bias is not None:
-             constant_(self.in_proj_bias, 0.0)
-             constant_(self.out_proj.bias, 0.0)
-         if self.bias_k is not None:
-             xavier_normal_(self.bias_k)
-         if self.bias_v is not None:
-             xavier_normal_(self.bias_v)
-
-     def __setstate__(self, state):
-         # Support loading old MultiheadAttention checkpoints generated by v1.1.0
-         if "_qkv_same_embed_dim" not in state:
-             state["_qkv_same_embed_dim"] = True
-
-         super().__setstate__(state)
-
-     def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None):
-         # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> tuple[Tensor, Optional[Tensor]]
-         """
-         Args:
-             query, key, value: map a query and a set of key-value pairs to an output.
-                 See "Attention Is All You Need" for more details.
-             key_padding_mask: if provided, specified padding elements in the key will
-                 be ignored by the attention. When given a binary mask and a value is True,
-                 the corresponding value on the attention layer will be ignored. When given
-                 a byte mask and a value is non-zero, the corresponding value on the attention
-                 layer will be ignored
-             need_weights: output attn_output_weights.
-             attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
-                 the batches while a 3D mask allows to specify a different mask for the entries of each batch.
-
-         Shape:
-             - Inputs:
-             - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
-                 the embedding dimension.
-             - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
-                 the embedding dimension.
-             - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
-                 the embedding dimension.
-             - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
-                 If a ByteTensor is provided, the non-zero positions will be ignored while the position
-                 with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the
-                 value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
-             - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
-                 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
-                 S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked
-                 positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
-                 while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
-                 is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
-                 is provided, it will be added to the attention weight.
-
-             - Outputs:
-             - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
-                 E is the embedding dimension.
-             - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
-                 L is the target sequence length, S is the source sequence length.
-         """
-
-         return multi_head_attention_forward(
-             self,
-             query=query,
-             key=key,
-             value=value,
-             embed_dim_to_check=self.embed_dim,
-             num_heads=self.num_heads,
-             in_proj_weight=self.in_proj_weight,
-             in_proj_bias=self.in_proj_bias,
-             bias_k=self.bias_k,
-             bias_v=self.bias_v,
-             add_zero_attn=self.add_zero_attn,
-             dropout_p=self.dropout,
-             out_proj_weight=self.out_proj.weight,
-             out_proj_bias=self.out_proj.bias,
-             fixed_k=self.fixed_k,
-             fixed_q=self.fixed_q,
-             training=self.training,
-             key_padding_mask=key_padding_mask,
-             need_weights=need_weights,
-             attn_mask=attn_mask,
-         )
-
-
- class Linear(Module):
-     r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
-     Args:
-         in_features: size of each input sample
-         out_features: size of each output sample
-         bias: If set to ``False``, the layer will not learn an additive bias.
-             Default: ``True``
-     Shape:
-         - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
-           additional dimensions and :math:`H_{in} = \text{in\_features}`
-         - Output: :math:`(N, *, H_{out})` where all but the last dimension
-           are the same shape as the input and :math:`H_{out} = \text{out\_features}`.
-     Attributes:
-         weight: the learnable weights of the module of shape
-             :math:`(\text{out\_features}, \text{in\_features})`. The values are
-             initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
-             :math:`k = \frac{1}{\text{in\_features}}`
-         bias: the learnable bias of the module of shape :math:`(\text{out\_features})`.
-             If :attr:`bias` is ``True``, the values are initialized from
-             :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
-             :math:`k = \frac{1}{\text{in\_features}}`
-     Examples::
-         >>> m = nn.Linear(20, 30)
-         >>> input = torch.randn(128, 20)
-         >>> output = m(input)
-         >>> print(output.size())
-         torch.Size([128, 30])
-     """
-
-     __constants__ = ["in_features", "out_features"]
-     in_features: int
-     out_features: int
-     weight: torch.Tensor
-
-     def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
-         super().__init__()
-
-         self.in_features = in_features
-         self.out_features = out_features
-         self.weight = Parameter(torch.Tensor(out_features, in_features))
-         if bias:
-             self.bias = Parameter(torch.Tensor(out_features))
-         else:
-             self.register_parameter("bias", None)
-         self.reset_parameters()
-
-     def reset_parameters(self) -> None:
-         init.kaiming_uniform_(self.weight, a=math.sqrt(5))
-         if self.bias is not None:
-             fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
-             bound = 1 / math.sqrt(fan_in)
-             init.uniform_(self.bias, -bound, bound)
-
-     def forward(self, input: torch.Tensor) -> torch.Tensor:
-         return F.linear(input, self.weight, self.bias)
-
-     def extra_repr(self) -> str:
-         return "in_features={}, out_features={}, bias={}".format(self.in_features, self.out_features, self.bias is not None)
-
-
- # This class exists solely for Transformer; it has an annotation stating
- # that bias is never None, which appeases TorchScript
- class _LinearWithBias(Linear):
-     bias: torch.Tensor
-
-     def __init__(self, in_features: int, out_features: int) -> None:
-         super().__init__(in_features, out_features, bias=True)
-
-
- class TransformerEncoderLayerModified(Module):
-     """TransformerEncoderLayer is made up of self-attn and feedforward network.
-     This standard encoder layer is based on the paper "Attention Is All You Need".
-     Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
-     Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
-     Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
-     in a different way during application.
-
-     Args:
-         d_model: the number of expected features in the input (required).
-         nhead: the number of heads in the multiheadattention models (required).
-         dim_feedforward: the dimension of the feedforward network model (default=2048).
-         dropout: the dropout value (default=0.1).
-         activation: the activation function of intermediate layer, relu or gelu (default=relu).
-
-     Examples::
-         >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
-         >>> src = torch.rand(10, 32, 512)
-         >>> out = encoder_layer(src)
-     """
-
-     def __init__(self, d_model, n_cat_embeddings, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
-         super().__init__()
-
-         self.self_attn = MultiheadAttention(d_model, n_cat_embeddings, nhead, dropout=dropout)
-
-         # Implementation of Feedforward model
-         self.linear1 = Linear(d_model, dim_feedforward)
-         self.dropout = Dropout(dropout)
-         self.linear2 = Linear(dim_feedforward, d_model)
-
-         self.norm1 = LayerNorm(d_model)
-         self.norm2 = LayerNorm(d_model)
-         self.dropout1 = Dropout(dropout)
-         self.dropout2 = Dropout(dropout)
-
-         self.activation = _get_activation_fn(activation)
-
-     def __setstate__(self, state):
-         if "activation" not in state:
-             state["activation"] = F.relu
-         super().__setstate__(state)
-
-     def forward(self, src: torch.Tensor, src_mask: Optional[torch.Tensor] = None, src_key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-         """Pass the input through the encoder layer.
-
-         Args:
-             src: the sequence to the encoder layer (required).
-             src_mask: the mask for the src sequence (optional).
-             src_key_padding_mask: the mask for the src keys per batch (optional).
-
-         Shape:
-             see the docs in Transformer class.
-         """
-         src2 = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
-         src = src + self.dropout1(src2)
-         src = self.norm1(src)
-         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-         src = src + self.dropout2(src2)
-         src = self.norm2(src)
-         return src
-
-
- def _get_activation_fn(activation):
-     if activation == "relu":
-         return F.relu
-     elif activation == "gelu":
-         return F.gelu
-
-     raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
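The removed multi_head_attention_forward differs from stock PyTorch attention in one respect: the keys and queries come from learned parameters (fixed_k, fixed_q) that are broadcast over the batch, so the attention map is fixed rather than input-dependent, and only the values are projected from the input. A minimal, self-contained sketch of that idea (illustrative only, not the AutoGluon API; all names and sizes below are made up):

    import torch
    import torch.nn.functional as F

    # Sketch of the "fixed attention map" trick from the removed code: K and Q are
    # learned parameters shared across the batch, so the attention pattern does not
    # depend on the input; only V is computed from the data.
    L_seq, N, E, n_heads = 10, 32, 64, 8      # sequence length, batch size, embed dim, heads
    head_dim = E // n_heads
    scaling = head_dim ** -0.5

    src = torch.rand(L_seq, N, E)             # (L, N, E), the layout used in the docstrings above
    in_proj = torch.nn.Linear(E, E)           # value projection (the removed code reuses in_proj_weight for this)
    fixed_k = torch.nn.Parameter(torch.empty(L_seq, E))
    fixed_q = torch.nn.Parameter(torch.empty(L_seq, E))
    torch.nn.init.xavier_uniform_(fixed_k)
    torch.nn.init.xavier_uniform_(fixed_q)

    v = in_proj(src)                                            # (L, N, E)
    k = fixed_k.unsqueeze(1).expand(L_seq, N, E)                # broadcast the fixed K/Q over the batch
    q = fixed_q.unsqueeze(1).expand(L_seq, N, E) * scaling

    q = q.contiguous().view(L_seq, N * n_heads, head_dim).transpose(0, 1)
    k = k.contiguous().view(-1, N * n_heads, head_dim).transpose(0, 1)
    v = v.contiguous().view(-1, N * n_heads, head_dim).transpose(0, 1)

    attn = F.softmax(torch.bmm(q, k.transpose(1, 2)), dim=-1)   # (N*n_heads, L, L), identical for every example
    out = torch.bmm(attn, v).transpose(0, 1).contiguous().view(L_seq, N, E)
    print(out.shape)                                            # torch.Size([10, 32, 64])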
autogluon/tabular/models/tab_transformer/pretexts.py
@@ -1,150 +0,0 @@
- from copy import deepcopy
-
- import numpy as np
- import torch
- import torch.nn as nn
-
- from autogluon.core.constants import REGRESSION
-
- """
- possible TODO: although there is a supervised pretext option below, i.e. pretrain using
- labeled data presumably from a different but related task, currently this cannot be handled
- by the AG API. One simple way to handle it would be to have an optional "pretext label"
- argument to task.fit that tells the pretraining module to use the supervised pretext task
- with that particular column as the label.
- """
-
-
- class SupervisedPretext(nn.Module):
-     # holder class to handle supervised pretraining.
-     def __init__(self, problem_type, device):
-         super().__init__()
-         self.device = device
-         self.loss_funct = nn.MSELoss() if problem_type == REGRESSION else nn.CrossEntropyLoss()
-
-     def forward(self, out, target):
-         loss = self.loss_funct(out, target)
-         pred = out.max(dim=1, keepdim=True)[1]
-         correct = pred.eq(target.view_as(pred)).sum()
-         correct = correct.float()
-         correct = correct / out.shape[0]
-         return loss, correct
-
-     def get(self, data, target):
-         data = data.to(self.device, non_blocking=True)
-         target = target.to(self.device, non_blocking=True)
-
-         return data, target
-
-
- class BERTPretext(nn.Module):
-     """
-     This is the current default pretext task module.
-
-     Functionality:
-
-     self.get:
-         inputs: (data, target) (target will often be None)
-         outputs: (pretext_data, pretext_target)
-
-         called before the forward pass through the TabTransformer to create the
-         input and label for a BERT-style pretext task.
-
-     self.forward:
-         inputs: out (embedding for TabTransformer), target (pretext task label)
-         outputs: loss, % accuracy on pretext task
-
-         given the embedding it passes it through a classifier which learns to
-         predict the pretext label.
-     """
-
-     def __init__(self, cat_feat_origin_cards, device, hidden_dim, replacement_noise="random", p_replace=0.3):
-         super().__init__()
-         self.cat_feat_origin_cards = cat_feat_origin_cards
-         self.device = device
-         self.hidden_dim = hidden_dim
-         self.loss_funct = nn.CrossEntropyLoss()
-         self.p_replace = p_replace
-         self.predicters = nn.ModuleList()
-         self.n_cat_feats = len(cat_feat_origin_cards)
-         self.replacement_noise = replacement_noise
-
-         for col in range(self.n_cat_feats):
-             lin = nn.Linear(self.hidden_dim, 2)
-             self.predicters.append(lin)
-
-     def forward(self, out, target):
-         prob = torch.cat([self.predicters[col](out[:, col, :]).unsqueeze(1) for col in range(self.n_cat_feats)], dim=1)
-         prob = prob.view(-1, 2)
-         target = target.view(-1)
-
-         loss = self.loss_funct(prob, target)
-         pred = prob.max(dim=1, keepdim=True)[1]
-         correct = pred.eq(target.view_as(pred)).sum()
-         correct = correct.float()
-         correct = correct / pred.shape[0]
-
-         return loss, correct
-
-     def get(self, data, target):
-         cat_feats = data
-
-         orig_cat_feats = deepcopy(cat_feats.detach())
-
-         if self.replacement_noise == "swap":
-             n_cat = cat_feats.shape[1]
-             cols_to_shuffle = np.random.choice(n_cat, int(self.p_replace * n_cat), replace=False)
-             for col in cols_to_shuffle:
-                 cat_feats[:, col] = cat_feats[:, col][torch.randperm(cat_feats.shape[0])]
-
-         elif self.replacement_noise == "random":
-             locs_to_replace = torch.empty_like(cat_feats, dtype=float).uniform_() < self.p_replace
-             col_cardinalities = torch.LongTensor([i[1] for i in self.cat_feat_origin_cards]).to(cat_feats)
-             col_cardinalities = col_cardinalities.unsqueeze(0).expand_as(cat_feats)
-
-             unif = torch.rand(cat_feats.shape, device=col_cardinalities.device)
-             random_feats = (unif * col_cardinalities).floor().to(torch.int64) + 1 # + 1 since 0 is the padding value
-
-             extra_replace = torch.mul((cat_feats == random_feats).to(int), locs_to_replace.to(int)).to(torch.bool)
-             cat_feats[locs_to_replace] = random_feats[locs_to_replace]
-
-             assert torch.all(cat_feats[extra_replace] == orig_cat_feats[extra_replace]).item() is True
-             extra_plus1 = cat_feats[extra_replace] + 1
-             extra_minus1 = cat_feats[extra_replace] - 1
-             extra_zero_pad_idx = extra_minus1 == 0
-             extra_minus1[extra_zero_pad_idx] = extra_plus1[extra_zero_pad_idx]
-
-             cat_feats[extra_replace] = extra_minus1
-             assert torch.all(~(cat_feats[extra_replace] == orig_cat_feats[extra_replace])).item() is True
-
-         elif self.replacement_noise == "low_rank":
-             assert self.p_replace + 0.2 <= 1, "p_replace too big, lower it!"
-             weights = torch.tensor([self.p_replace, 0.1, 0.9 - self.p_replace], dtype=torch.float) # 0=pad, 1=replace with random value, 2=dont change
-
-             locs_to_change = torch.multinomial(weights, np.prod(cat_feats.shape), replacement=True).view(cat_feats.shape)
-             col_cardinalities = torch.LongTensor([i[1] for i in self.cat_feat_origin_cards]).to(cat_feats)
-             col_cardinalities = col_cardinalities.unsqueeze(0).expand_as(cat_feats)
-
-             unif = torch.rand(cat_feats.shape, device=col_cardinalities.device)
-             random_feats = (unif * col_cardinalities).floor().to(torch.int64) + 1 # + 1 since 0 is the padding value
-
-             extra_replace = torch.mul((cat_feats == random_feats).to(int), (locs_to_change == 1).to(int)).to(torch.bool)
-             cat_feats[locs_to_change == 1] = random_feats[locs_to_change == 1]
-             cat_feats[locs_to_change == 0] = 0
-
-             assert torch.all(cat_feats[extra_replace] == orig_cat_feats[extra_replace]).item() is True
-             extra_plus1 = cat_feats[extra_replace] + 1
-             extra_minus1 = cat_feats[extra_replace] - 1
-             extra_zero_pad_idx = extra_minus1 == 0
-             extra_minus1[extra_zero_pad_idx] = extra_plus1[extra_zero_pad_idx]
-
-             cat_feats[extra_replace] = extra_minus1
-             assert torch.all(~(cat_feats[extra_replace] == orig_cat_feats[extra_replace])).item() is True
-
-         pretext_label = (cat_feats != orig_cat_feats).long()
-         pretext_data = cat_feats
-
-         pretext_data = pretext_data.to(self.device, non_blocking=True)
-         pretext_label = pretext_label.to(self.device, non_blocking=True)
-
-         return pretext_data, pretext_label
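For orientation, BERTPretext.get corrupts roughly p_replace of the categorical cells and returns a per-cell binary label marking which cells were changed; BERTPretext.forward then trains one nn.Linear(hidden_dim, 2) head per column against that label. A rough, self-contained sketch of the label construction for the default replacement_noise="random" branch (illustrative values only; the removed code additionally nudges replacements that happen to equal the original value so that every sampled cell really changes):

    import torch

    p_replace = 0.3
    cat_feats = torch.randint(1, 5, (8, 3))        # (batch, n_cat_features); 0 is reserved as the padding value
    col_cardinalities = torch.tensor([4, 6, 3])    # hypothetical per-column cardinalities

    orig_cat_feats = cat_feats.clone()
    locs_to_replace = torch.rand(cat_feats.shape) < p_replace
    random_feats = (torch.rand(cat_feats.shape) * col_cardinalities).floor().long() + 1  # +1: 0 is padding

    corrupted = torch.where(locs_to_replace, random_feats, cat_feats)
    pretext_label = (corrupted != orig_cat_feats).long()   # 1 where a value was replaced, else 0

    # The transformer embeds `corrupted`; the per-column linear heads are then trained
    # with cross-entropy against `pretext_label`, as in BERTPretext.forward above.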