autogluon.multimodal 1.2.1b20250303__py3-none-any.whl → 1.2.1b20250305__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autogluon/multimodal/__init__.py +4 -2
- autogluon/multimodal/configs/data/default.yaml +4 -2
- autogluon/multimodal/configs/{environment → env}/default.yaml +2 -3
- autogluon/multimodal/configs/model/default.yaml +58 -11
- autogluon/multimodal/configs/{optimization → optim}/default.yaml +21 -4
- autogluon/multimodal/constants.py +16 -5
- autogluon/multimodal/data/__init__.py +14 -2
- autogluon/multimodal/data/dataset.py +2 -2
- autogluon/multimodal/data/infer_types.py +16 -2
- autogluon/multimodal/data/label_encoder.py +3 -3
- autogluon/multimodal/{utils → data}/nlpaug.py +4 -4
- autogluon/multimodal/data/preprocess_dataframe.py +55 -38
- autogluon/multimodal/data/process_categorical.py +35 -6
- autogluon/multimodal/data/process_document.py +59 -33
- autogluon/multimodal/data/process_image.py +198 -163
- autogluon/multimodal/data/process_label.py +7 -3
- autogluon/multimodal/data/process_mmlab/process_mmdet.py +1 -8
- autogluon/multimodal/data/process_mmlab/process_mmlab_base.py +2 -9
- autogluon/multimodal/data/process_mmlab/process_mmocr.py +1 -9
- autogluon/multimodal/data/process_ner.py +192 -4
- autogluon/multimodal/data/process_numerical.py +32 -5
- autogluon/multimodal/data/process_semantic_seg_img.py +23 -28
- autogluon/multimodal/data/process_text.py +95 -58
- autogluon/multimodal/data/template_engine.py +7 -9
- autogluon/multimodal/data/templates.py +0 -2
- autogluon/multimodal/data/trivial_augmenter.py +2 -2
- autogluon/multimodal/data/utils.py +564 -338
- autogluon/multimodal/learners/__init__.py +2 -1
- autogluon/multimodal/learners/base.py +189 -189
- autogluon/multimodal/learners/ensemble.py +748 -0
- autogluon/multimodal/learners/few_shot_svm.py +6 -15
- autogluon/multimodal/learners/matching.py +59 -84
- autogluon/multimodal/learners/ner.py +23 -22
- autogluon/multimodal/learners/object_detection.py +26 -21
- autogluon/multimodal/learners/semantic_segmentation.py +16 -18
- autogluon/multimodal/models/__init__.py +12 -3
- autogluon/multimodal/models/augmenter.py +175 -0
- autogluon/multimodal/models/categorical_mlp.py +13 -8
- autogluon/multimodal/models/clip.py +92 -18
- autogluon/multimodal/models/custom_transformer.py +75 -75
- autogluon/multimodal/models/document_transformer.py +23 -9
- autogluon/multimodal/models/ft_transformer.py +40 -35
- autogluon/multimodal/models/fusion/base.py +2 -4
- autogluon/multimodal/models/fusion/fusion_mlp.py +82 -18
- autogluon/multimodal/models/fusion/fusion_ner.py +1 -1
- autogluon/multimodal/models/fusion/fusion_transformer.py +23 -23
- autogluon/multimodal/models/{huggingface_text.py → hf_text.py} +21 -2
- autogluon/multimodal/models/meta_transformer.py +336 -0
- autogluon/multimodal/models/mlp.py +6 -6
- autogluon/multimodal/models/mmocr_text_detection.py +1 -1
- autogluon/multimodal/models/mmocr_text_recognition.py +0 -1
- autogluon/multimodal/models/ner_text.py +1 -8
- autogluon/multimodal/models/numerical_mlp.py +14 -8
- autogluon/multimodal/models/sam.py +12 -2
- autogluon/multimodal/models/t_few.py +21 -5
- autogluon/multimodal/models/timm_image.py +74 -32
- autogluon/multimodal/models/utils.py +877 -16
- autogluon/multimodal/optim/__init__.py +17 -0
- autogluon/multimodal/{optimization → optim}/lit_distiller.py +2 -1
- autogluon/multimodal/{optimization → optim}/lit_matcher.py +4 -10
- autogluon/multimodal/{optimization → optim}/lit_mmdet.py +2 -10
- autogluon/multimodal/{optimization → optim}/lit_module.py +139 -14
- autogluon/multimodal/{optimization → optim}/lit_ner.py +3 -3
- autogluon/multimodal/{optimization → optim}/lit_semantic_seg.py +1 -1
- autogluon/multimodal/optim/losses/__init__.py +14 -0
- autogluon/multimodal/optim/losses/bce_loss.py +25 -0
- autogluon/multimodal/optim/losses/focal_loss.py +81 -0
- autogluon/multimodal/optim/losses/lemda_loss.py +39 -0
- autogluon/multimodal/optim/losses/rkd_loss.py +103 -0
- autogluon/multimodal/optim/losses/softmax_losses.py +177 -0
- autogluon/multimodal/optim/losses/structure_loss.py +26 -0
- autogluon/multimodal/optim/losses/utils.py +313 -0
- autogluon/multimodal/optim/lr/__init__.py +1 -0
- autogluon/multimodal/optim/lr/utils.py +332 -0
- autogluon/multimodal/optim/metrics/__init__.py +4 -0
- autogluon/multimodal/optim/metrics/coverage_metrics.py +42 -0
- autogluon/multimodal/optim/metrics/hit_rate_metrics.py +78 -0
- autogluon/multimodal/optim/metrics/ranking_metrics.py +231 -0
- autogluon/multimodal/optim/metrics/utils.py +359 -0
- autogluon/multimodal/optim/utils.py +284 -0
- autogluon/multimodal/predictor.py +51 -12
- autogluon/multimodal/utils/__init__.py +19 -45
- autogluon/multimodal/utils/cache.py +23 -2
- autogluon/multimodal/utils/checkpoint.py +58 -5
- autogluon/multimodal/utils/config.py +127 -55
- autogluon/multimodal/utils/device.py +120 -0
- autogluon/multimodal/utils/distillation.py +8 -8
- autogluon/multimodal/utils/download.py +1 -1
- autogluon/multimodal/utils/env.py +22 -0
- autogluon/multimodal/utils/export.py +3 -3
- autogluon/multimodal/utils/hpo.py +5 -5
- autogluon/multimodal/utils/inference.py +37 -4
- autogluon/multimodal/utils/install.py +91 -0
- autogluon/multimodal/utils/load.py +52 -47
- autogluon/multimodal/utils/log.py +6 -41
- autogluon/multimodal/utils/matcher.py +3 -2
- autogluon/multimodal/utils/onnx.py +0 -4
- autogluon/multimodal/utils/path.py +10 -0
- autogluon/multimodal/utils/precision.py +130 -0
- autogluon/multimodal/{presets.py → utils/presets.py} +259 -66
- autogluon/multimodal/{problem_types.py → utils/problem_types.py} +30 -1
- autogluon/multimodal/utils/save.py +47 -29
- autogluon/multimodal/utils/strategy.py +24 -0
- autogluon/multimodal/version.py +1 -1
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/METADATA +5 -5
- autogluon.multimodal-1.2.1b20250305.dist-info/RECORD +163 -0
- autogluon/multimodal/optimization/__init__.py +0 -16
- autogluon/multimodal/optimization/losses.py +0 -394
- autogluon/multimodal/optimization/utils.py +0 -1054
- autogluon/multimodal/utils/cloud_io.py +0 -80
- autogluon/multimodal/utils/data.py +0 -701
- autogluon/multimodal/utils/environment.py +0 -395
- autogluon/multimodal/utils/metric.py +0 -500
- autogluon/multimodal/utils/model.py +0 -558
- autogluon.multimodal-1.2.1b20250303.dist-info/RECORD +0 -145
- /autogluon/multimodal/{optimization → optim}/deepspeed.py +0 -0
- /autogluon/multimodal/{optimization/lr_scheduler.py → optim/lr/lr_schedulers.py} +0 -0
- /autogluon/multimodal/{optimization → optim/metrics}/semantic_seg_metrics.py +0 -0
- /autogluon/multimodal/{registry.py → utils/registry.py} +0 -0
- /autogluon.multimodal-1.2.1b20250303-py3.9-nspkg.pth → /autogluon.multimodal-1.2.1b20250305-py3.9-nspkg.pth +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/LICENSE +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/NOTICE +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/WHEEL +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/namespace_packages.txt +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/top_level.txt +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/zip-safe +0 -0
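
The renames in the listing above move several Python modules: the optimization subpackage becomes optim, models/huggingface_text.py becomes models/hf_text.py, and presets.py / problem_types.py move under utils/. A minimal sketch of the corresponding import-path updates, assuming the moved modules keep their public symbols (not verified here):

    # 1.2.1b20250303 layout (old):
    #   from autogluon.multimodal.optimization import lit_module
    #   from autogluon.multimodal.models import huggingface_text
    #   from autogluon.multimodal import presets

    # 1.2.1b20250305 layout (new):
    from autogluon.multimodal.optim import lit_module    # optimization/ -> optim/
    from autogluon.multimodal.models import hf_text      # huggingface_text.py -> hf_text.py
    from autogluon.multimodal.utils import presets       # presets.py -> utils/presets.py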

autogluon/multimodal/models/custom_transformer.py

@@ -108,10 +108,10 @@ class CLSToken(nn.Module):
     [1] Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" 2018
     """
 
-    def __init__(self, d_token: int, initialization: str) -> None:
+    def __init__(self, token_dim: int, initialization: str) -> None:
         """
         Args:
-            d_token: the size of token
+            token_dim: the size of token
             initialization: initialization policy for parameters. Must be one of
                 :code:`['uniform', 'normal']`. Let :code:`s = d ** -0.5`. Then, the
                 corresponding distributions are :code:`Uniform(-s, s)` and :code:`Normal(0, s)`. In
@@ -123,8 +123,8 @@ class CLSToken(nn.Module):
         """
         super().__init__()
         initialization_ = _TokenInitialization.from_str(initialization)
-        self.weight = nn.Parameter(Tensor(d_token))
-        initialization_.apply(self.weight, d_token)
+        self.weight = nn.Parameter(Tensor(token_dim))
+        initialization_.apply(self.weight, token_dim)
 
     def expand(self, *leading_dimensions: int) -> Tensor:
         """Expand (repeat) the underlying [CLS]-token to a tensor with the given leading dimensions.
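
Below is a brief usage sketch of the renamed CLSToken constructor; the expand()/concatenation pattern follows the docstring above and the upstream FT-Transformer implementation this module derives from, so treat the exact call sequence as an assumption rather than documented API:

    import torch
    from autogluon.multimodal.models.custom_transformer import CLSToken

    # token_dim must match the last dimension of the token sequence it is appended to.
    cls_token = CLSToken(token_dim=64, initialization="uniform")
    x = torch.randn(8, 16, 64)                # (n_objects, num_tokens, token_dim)
    cls = cls_token.expand(len(x), 1)         # repeat the [CLS] weight -> (8, 1, 64)
    x_with_cls = torch.cat([x, cls], dim=1)   # (8, 17, 64)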

@@ -192,8 +192,8 @@ class MultiheadAttention(nn.Module):
     def __init__(
         self,
         *,
-        d_token: int,
-        n_heads: int,
+        token_dim: int,
+        num_heads: int,
         dropout: float,
         bias: bool,
         initialization: str,
@@ -201,9 +201,9 @@ class MultiheadAttention(nn.Module):
         """
         Parameters
         ----------
-        d_token:
-            the token size. Must be a multiple of :code:`n_heads`.
-        n_heads:
+        token_dim:
+            the token size. Must be a multiple of :code:`num_heads`.
+        num_heads:
             the number of heads. If greater than 1, then the module will have
             an addition output layer (so called "mixing" layer).
         dropout:
@@ -221,15 +221,15 @@ class MultiheadAttention(nn.Module):
             AssertionError: if requirements for the inputs are not met.
         """
         super().__init__()
-        if n_heads > 1:
-            assert d_token % n_heads == 0, "d_token must be a multiple of n_heads"
+        if num_heads > 1:
+            assert token_dim % num_heads == 0, "token_dim must be a multiple of num_heads"
         assert initialization in ["kaiming", "xavier"]
 
-        self.W_q = nn.Linear(d_token, d_token, bias)
-        self.W_k = nn.Linear(d_token, d_token, bias)
-        self.W_v = nn.Linear(d_token, d_token, bias)
-        self.W_out = nn.Linear(d_token, d_token, bias) if n_heads > 1 else None
-        self.n_heads = n_heads
+        self.W_q = nn.Linear(token_dim, token_dim, bias)
+        self.W_k = nn.Linear(token_dim, token_dim, bias)
+        self.W_v = nn.Linear(token_dim, token_dim, bias)
+        self.W_out = nn.Linear(token_dim, token_dim, bias) if num_heads > 1 else None
+        self.num_heads = num_heads
         self.dropout = nn.Dropout(dropout) if dropout else None
 
         for m in [self.W_q, self.W_k, self.W_v]:
@@ -246,12 +246,12 @@ class MultiheadAttention(nn.Module):
             nn.init.zeros_(self.W_out.bias)
 
     def _reshape(self, x: Tensor) -> Tensor:
-        batch_size, n_tokens, d = x.shape
-        d_head = d // self.n_heads
+        batch_size, num_tokens, d = x.shape
+        head_dim = d // self.num_heads
         return (
-            x.reshape(batch_size, n_tokens, self.n_heads, d_head)
+            x.reshape(batch_size, num_tokens, self.num_heads, head_dim)
             .transpose(1, 2)
-            .reshape(batch_size * self.n_heads, n_tokens, d_head)
+            .reshape(batch_size * self.num_heads, num_tokens, head_dim)
         )
 
     def forward(
@@ -283,27 +283,27 @@ class MultiheadAttention(nn.Module):
         ), "If key_compression is (not) None, then value_compression must (not) be None"
         q, k, v = self.W_q(x_q), self.W_k(x_kv), self.W_v(x_kv)
         for tensor in [q, k, v]:
-            assert tensor.shape[-1] % self.n_heads == 0, _INTERNAL_ERROR_MESSAGE
+            assert tensor.shape[-1] % self.num_heads == 0, _INTERNAL_ERROR_MESSAGE
         if key_compression is not None:
             k = key_compression(k.transpose(1, 2)).transpose(1, 2)
             v = value_compression(v.transpose(1, 2)).transpose(1, 2)  # type: ignore
 
         batch_size = len(q)
-        d_head_key = k.shape[-1] // self.n_heads
-        d_head_value = v.shape[-1] // self.n_heads
+        head_dim_key = k.shape[-1] // self.num_heads
+        head_dim_value = v.shape[-1] // self.num_heads
         n_q_tokens = q.shape[1]
 
         q = self._reshape(q)
         k = self._reshape(k)
-        attention_logits = q @ k.transpose(1, 2) / math.sqrt(d_head_key)
+        attention_logits = q @ k.transpose(1, 2) / math.sqrt(head_dim_key)
         attention_probs = F.softmax(attention_logits, dim=-1)
         if self.dropout is not None:
             attention_probs = self.dropout(attention_probs)
         x = attention_probs @ self._reshape(v)
         x = (
-            x.reshape(batch_size, self.n_heads, n_q_tokens, d_head_value)
+            x.reshape(batch_size, self.num_heads, n_q_tokens, head_dim_value)
             .transpose(1, 2)
-            .reshape(batch_size, n_q_tokens, self.n_heads * d_head_value)
+            .reshape(batch_size, n_q_tokens, self.num_heads * head_dim_value)
         )
         if self.W_out is not None:
             x = self.W_out(x)
@@ -328,8 +328,8 @@ class AdditiveAttention(nn.Module):
     def __init__(
         self,
         *,
-        d_token: int,
-        n_heads: int,
+        token_dim: int,
+        num_heads: int,
         dropout: float,
         bias: bool,
         share_qv_weights: bool,
@@ -338,9 +338,9 @@ class AdditiveAttention(nn.Module):
         """
         Parameters
         ----------
-        d_token:
-            the token size. Must be a multiple of :code:`n_heads`.
-        n_heads:
+        token_dim:
+            the token size. Must be a multiple of :code:`num_heads`.
+        num_heads:
             the number of heads. If greater than 1, then the module will have
             an addition output layer (so called "mixing" layer).
         dropout:
@@ -357,26 +357,26 @@ class AdditiveAttention(nn.Module):
         """
         super().__init__()
 
-        assert d_token % n_heads == 0, "d_token must be a multiple of n_heads"
+        assert token_dim % num_heads == 0, "token_dim must be a multiple of num_heads"
         assert initialization in ["kaiming", "xavier"]
 
-        self.head_dim = d_token // n_heads
-        self.n_heads = n_heads
+        self.head_dim = token_dim // num_heads
+        self.num_heads = num_heads
         self.share_qv_weights = share_qv_weights
         self.dropout = nn.Dropout(dropout)
         trainable = []
         if share_qv_weights:
-            self.qv_proj = nn.Linear(d_token, d_token, bias=bias)
+            self.qv_proj = nn.Linear(token_dim, token_dim, bias=bias)
             trainable.extend([self.qv_proj])
         else:
-            self.q_proj = nn.Linear(d_token, d_token, bias=bias)
-            self.v_proj = nn.Linear(d_token, d_token, bias=bias)
+            self.q_proj = nn.Linear(token_dim, token_dim, bias=bias)
+            self.v_proj = nn.Linear(token_dim, token_dim, bias=bias)
             trainable.extend([self.q_proj, self.v_proj])
 
-        self.k_proj = nn.Linear(d_token, d_token, bias=bias)
-        self.W_q = nn.Linear(d_token, n_heads)
-        self.W_k = nn.Linear(d_token, n_heads)
-        self.r_out = nn.Linear(d_token, d_token)
+        self.k_proj = nn.Linear(token_dim, token_dim, bias=bias)
+        self.W_q = nn.Linear(token_dim, num_heads)
+        self.W_k = nn.Linear(token_dim, num_heads)
+        self.r_out = nn.Linear(token_dim, token_dim)
         trainable.extend([self.k_proj, self.W_q, self.W_k, self.r_out])
 
         if initialization == "xavier":
@@ -392,24 +392,24 @@ class AdditiveAttention(nn.Module):
         x_kv: Tensor,
         *args,  # Not used. just to make the input consistent with MultiheadAttention.
     ) -> Tuple[Tensor, Dict[str, Tensor]]:
-        batch_size, n_q_tokens, d_token = x_q.shape
-        batch_size, n_k_tokens, d_token = x_kv.shape
+        batch_size, n_q_tokens, token_dim = x_q.shape
+        batch_size, n_k_tokens, token_dim = x_kv.shape
 
         q = self.qv_proj(x_q) if self.share_qv_weights else self.q_proj(x_q)
         v = self.qv_proj(x_kv) if self.share_qv_weights else self.v_proj(x_kv)
         k = self.k_proj(x_kv)
 
         alphas = (self.W_q(q) / math.sqrt(self.head_dim)).softmax(dim=1)
-        q_r = q.reshape(batch_size, n_q_tokens, self.n_heads, self.head_dim)
+        q_r = q.reshape(batch_size, n_q_tokens, self.num_heads, self.head_dim)
         global_query = torch.einsum(" b s h, b s h d -> b h d", alphas, q_r)
-        global_query = global_query.reshape(batch_size, self.n_heads * self.head_dim).unsqueeze(1)
+        global_query = global_query.reshape(batch_size, self.num_heads * self.head_dim).unsqueeze(1)
 
         p = k * global_query
 
         betas = (self.W_k(p) / math.sqrt(self.head_dim)).softmax(dim=1)
-        p_r = p.reshape(batch_size, n_k_tokens, self.n_heads, self.head_dim)
+        p_r = p.reshape(batch_size, n_k_tokens, self.num_heads, self.head_dim)
         global_key = torch.einsum(" b s h, b s h d -> b h d", betas, p_r)
-        global_key = global_key.reshape(batch_size, self.n_heads * self.head_dim).unsqueeze(1)
+        global_key = global_key.reshape(batch_size, self.num_heads * self.head_dim).unsqueeze(1)
 
         u = v * global_key
         output = q + self.dropout(self.r_out(u))
@@ -433,7 +433,7 @@ class Custom_Transformer(nn.Module):
         def __init__(
             self,
             *,
-            d_token: int,
+            token_dim: int,
             d_hidden: int,
             bias_first: bool,
             bias_second: bool,
@@ -442,13 +442,13 @@ class Custom_Transformer(nn.Module):
         ):
             super().__init__()
             self.linear_first = nn.Linear(
-                d_token,
+                token_dim,
                 d_hidden * (2 if _is_glu_activation(activation) else 1),
                 bias_first,
             )
             self.activation = _make_nn_module(activation)
             self.dropout = nn.Dropout(dropout)
-            self.linear_second = nn.Linear(d_hidden, d_token, bias_second)
+            self.linear_second = nn.Linear(d_hidden, token_dim, bias_second)
 
         def forward(self, x: Tensor) -> Tensor:
             x = self.linear_first(x)
@@ -484,13 +484,13 @@ class Custom_Transformer(nn.Module):
     def __init__(
         self,
         *,
-        d_token: int,
-        n_blocks: int,
-        attention_n_heads: int,
+        token_dim: int,
+        num_blocks: int,
+        attention_num_heads: int,
         attention_dropout: float,
         attention_initialization: str,
         attention_normalization: str,
-        ffn_d_hidden: int,
+        ffn_hidden_size: int,
         ffn_dropout: float,
         ffn_activation: str,
         ffn_normalization: str,
@@ -498,7 +498,7 @@ class Custom_Transformer(nn.Module):
         prenormalization: bool,
         first_prenormalization: bool,
         last_layer_query_idx: Union[None, List[int], slice],
-        n_tokens: Optional[int],
+        num_tokens: Optional[int],
        kv_compression_ratio: Optional[float],
        kv_compression_sharing: Optional[str],
        head_activation: ModuleType,
@@ -511,11 +511,11 @@ class Custom_Transformer(nn.Module):
         """
         Parameters
         ----------
-        d_token
+        token_dim
             The size of one token for `_CategoricalFeatureTokenizer`.
-        n_blocks
+        num_blocks
             Number of the `FT_Transformer` blocks, which should be non-negative.
-        attention_n_heads
+        attention_num_heads
             Number of attention heads in each `FT_Transformer` block, which should be positive.
         attention_dropout
             Dropout ratio for the Multi Headed Attention module.
@@ -523,7 +523,7 @@ class Custom_Transformer(nn.Module):
             Weights initialization scheme for Multi Headed Attention module.
         attention_normalization
             Normalization policy for attention layers. "layer_norm" is a good default.
-        ffn_d_hidden
+        ffn_hidden_size
             Number of the hidden nodes of the linear layers in the Feed-Forward Network module.
         ffn_dropout
             Dropout ratio of the hidden nodes of the linear layers in the Feed-Forward Network module.
@@ -535,7 +535,7 @@ class Custom_Transformer(nn.Module):
             Dropout ratio for the linear layers in FT_Transformer block.
         prenormalization, first_prenormalization
             Prenormalization to stabilize the training.
-        n_tokens
+        num_tokens
             Number of tokens of the input sequence.
         kv_compression_ratio
             The compression ration to reduce the input sequence length.
@@ -564,9 +564,9 @@ class Custom_Transformer(nn.Module):
             assert (
                 not first_prenormalization
             ), "If `prenormalization` is False, then `first_prenormalization` must be False"
-        assert _all_or_none([n_tokens, kv_compression_ratio, kv_compression_sharing]), (
+        assert _all_or_none([num_tokens, kv_compression_ratio, kv_compression_sharing]), (
            "If any of the following arguments is (not) None, then all of them must (not) be None: "
-            "n_tokens, kv_compression_ratio, kv_compression_sharing"
+            "num_tokens, kv_compression_ratio, kv_compression_sharing"
        )
        assert (
            additive_attention or not share_qv_weights
@@ -595,9 +595,9 @@ class Custom_Transformer(nn.Module):
        )

        def make_kv_compression():
-            assert n_tokens and kv_compression_ratio, _INTERNAL_ERROR_MESSAGE  # for mypy
+            assert num_tokens and kv_compression_ratio, _INTERNAL_ERROR_MESSAGE  # for mypy
            # https://github.com/pytorch/fairseq/blob/1bba712622b8ae4efb3eb793a8a40da386fe11d0/examples/linformer/linformer_src/modules/multihead_linear_attention.py#L83
-            return nn.Linear(n_tokens, int(n_tokens * kv_compression_ratio), bias=False)
+            return nn.Linear(num_tokens, int(num_tokens * kv_compression_ratio), bias=False)

        self.shared_kv_compression = (
            make_kv_compression() if kv_compression_ratio and kv_compression_sharing == "layerwise" else None
@@ -607,12 +607,12 @@ class Custom_Transformer(nn.Module):
        self.last_layer_query_idx = last_layer_query_idx

        self.blocks = nn.ModuleList([])
-        for layer_idx in range(n_blocks):
+        for layer_idx in range(num_blocks):
            layer = nn.ModuleDict(
                {
                    "attention": AdditiveAttention(
-                        d_token=d_token,
-                        n_heads=attention_n_heads,
+                        token_dim=token_dim,
+                        num_heads=attention_num_heads,
                        dropout=attention_dropout,
                        bias=True,
                        share_qv_weights=share_qv_weights,
@@ -620,15 +620,15 @@ class Custom_Transformer(nn.Module):
                    )
                    if additive_attention
                    else MultiheadAttention(
-                        d_token=d_token,
-                        n_heads=attention_n_heads,
+                        token_dim=token_dim,
+                        num_heads=attention_num_heads,
                        dropout=attention_dropout,
                        bias=True,
                        initialization=attention_initialization,
                    ),
                    "ffn": Custom_Transformer.FFN(
-                        d_token=d_token,
-                        d_hidden=ffn_d_hidden,
+                        token_dim=token_dim,
+                        d_hidden=ffn_hidden_size,
                        bias_first=True,
                        bias_second=True,
                        dropout=ffn_dropout,
@@ -640,8 +640,8 @@ class Custom_Transformer(nn.Module):
                }
            )
            if layer_idx or not prenormalization or first_prenormalization:
-                layer["attention_normalization"] = _make_nn_module(attention_normalization, d_token)
-                layer["ffn_normalization"] = _make_nn_module(ffn_normalization, d_token)
+                layer["attention_normalization"] = _make_nn_module(attention_normalization, token_dim)
+                layer["ffn_normalization"] = _make_nn_module(ffn_normalization, token_dim)
            if kv_compression_ratio and self.shared_kv_compression is None:
                layer["key_compression"] = make_kv_compression()
                if kv_compression_sharing == "headwise":
@@ -652,7 +652,7 @@ class Custom_Transformer(nn.Module):

        self.head = (
            Custom_Transformer.Head(
-                d_in=d_token,
+                d_in=token_dim,
                d_out=d_out,
                bias=True,
                activation=head_activation,  # type: ignore
@@ -691,7 +691,7 @@ class Custom_Transformer(nn.Module):
        return x

    def forward(self, x: Tensor) -> Tensor:
-        assert x.ndim == 3, "The input must have 3 dimensions: (n_objects, n_tokens, d_token)"
+        assert x.ndim == 3, "The input must have 3 dimensions: (n_objects, num_tokens, token_dim)"
        for layer_idx, layer in enumerate(self.blocks):
            layer = cast(nn.ModuleDict, layer)

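
A short usage sketch of the renamed MultiheadAttention signature above (token_dim, num_heads, dropout, bias, initialization); the self-attention call and the tuple return value mirror the q/k/v handling shown in these hunks and the upstream FT-Transformer code, so they are assumptions about the public interface rather than documented behavior:

    import torch
    from autogluon.multimodal.models.custom_transformer import MultiheadAttention

    attn = MultiheadAttention(token_dim=64, num_heads=8, dropout=0.1, bias=True, initialization="kaiming")
    x = torch.randn(2, 16, 64)         # (batch_size, num_tokens, token_dim)
    out, _ = attn(x, x, None, None)    # self-attention, no key/value compression; assumed (output, stats) return
    assert out.shape == (2, 16, 64)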

autogluon/multimodal/models/document_transformer.py

@@ -6,24 +6,18 @@ from transformers import logging as hf_logging
 
 from ..constants import (
     ATTENTION_MASK,
-    AUTOMM,
     BBOX,
-    COLUMN,
     COLUMN_FEATURES,
     FEATURES,
     IMAGE,
     INPUT_IDS,
-    LABEL,
     LOGITS,
     MASKS,
     PIXEL_VALUES,
-    TEXT_SEGMENT_IDS,
-    TEXT_TOKEN_IDS,
-    TEXT_VALID_LENGTH,
     TOKEN_TYPE_IDS,
 )
-from .huggingface_text import HFAutoModelForTextPrediction
-from .utils import get_column_features
+from .hf_text import HFAutoModelForTextPrediction
+from .utils import get_column_features, get_image_size_mean_std
 
 hf_logging.set_verbosity_error()
 
@@ -45,6 +39,8 @@ class DocumentTransformer(HFAutoModelForTextPrediction):
         low_cpu_mem_usage: Optional[bool] = False,
         pretrained: Optional[bool] = True,
         tokenizer_name: Optional[str] = "hf_auto",
+        image_size: Optional[int] = None,
+        image_norm: Optional[str] = None,
     ):
         """
         Load a pretrained huggingface layout-aware document transformer backbone.
@@ -77,8 +73,20 @@ class DocumentTransformer(HFAutoModelForTextPrediction):
             Whether using the pretrained weights. If pretrained=True, download the pretrained model.
         tokenizer_name
             Name of the huggingface tokenizer type.
+        image_norm
+            How to normalize an image. We now support:
+            - inception
+                Normalize image by IMAGENET_INCEPTION_MEAN and IMAGENET_INCEPTION_STD from timm
+            - imagenet
+                Normalize image by IMAGENET_DEFAULT_MEAN and IMAGENET_DEFAULT_STD from timm
+            - clip
+                Normalize image by mean (0.48145466, 0.4578275, 0.40821073) and
+                std (0.26862954, 0.26130258, 0.27577711), used for CLIP.
+        image_size
+            The provided width / height of a square image.
         """
-        logger.debug(f"initializing {checkpoint_name}")
+        logger.debug(f"initializing {prefix} (DocumentTransformer)")
+        logger.debug(f"model checkpoint: {checkpoint_name}")
         super().__init__(
             prefix=prefix,
             checkpoint_name=checkpoint_name,
@@ -89,6 +97,12 @@ class DocumentTransformer(HFAutoModelForTextPrediction):
             pretrained=pretrained,
             tokenizer_name=tokenizer_name,
         )
+        self.image_size, self.image_mean, self.image_std = get_image_size_mean_std(
+            model_name=self.prefix,
+            config=self.config,
+            provided_size=image_size,
+            provided_norm_type=image_norm,
+        )
         self.is_text_only_flag = self.is_text_only()
 
         if self.is_text_only_flag: