autogluon.multimodal 1.2.1b20250303__py3-none-any.whl → 1.2.1b20250304__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. autogluon/multimodal/__init__.py +4 -2
  2. autogluon/multimodal/configs/data/default.yaml +4 -2
  3. autogluon/multimodal/configs/{environment → env}/default.yaml +2 -3
  4. autogluon/multimodal/configs/model/default.yaml +58 -11
  5. autogluon/multimodal/configs/{optimization → optim}/default.yaml +21 -4
  6. autogluon/multimodal/constants.py +16 -5
  7. autogluon/multimodal/data/__init__.py +14 -2
  8. autogluon/multimodal/data/dataset.py +2 -2
  9. autogluon/multimodal/data/infer_types.py +16 -2
  10. autogluon/multimodal/data/label_encoder.py +3 -3
  11. autogluon/multimodal/{utils → data}/nlpaug.py +4 -4
  12. autogluon/multimodal/data/preprocess_dataframe.py +55 -38
  13. autogluon/multimodal/data/process_categorical.py +35 -6
  14. autogluon/multimodal/data/process_document.py +59 -33
  15. autogluon/multimodal/data/process_image.py +198 -163
  16. autogluon/multimodal/data/process_label.py +7 -3
  17. autogluon/multimodal/data/process_mmlab/process_mmdet.py +1 -8
  18. autogluon/multimodal/data/process_mmlab/process_mmlab_base.py +2 -9
  19. autogluon/multimodal/data/process_mmlab/process_mmocr.py +1 -9
  20. autogluon/multimodal/data/process_ner.py +192 -4
  21. autogluon/multimodal/data/process_numerical.py +32 -5
  22. autogluon/multimodal/data/process_semantic_seg_img.py +23 -28
  23. autogluon/multimodal/data/process_text.py +95 -58
  24. autogluon/multimodal/data/template_engine.py +7 -9
  25. autogluon/multimodal/data/templates.py +0 -2
  26. autogluon/multimodal/data/trivial_augmenter.py +2 -2
  27. autogluon/multimodal/data/utils.py +564 -338
  28. autogluon/multimodal/learners/__init__.py +2 -1
  29. autogluon/multimodal/learners/base.py +189 -189
  30. autogluon/multimodal/learners/ensemble.py +748 -0
  31. autogluon/multimodal/learners/few_shot_svm.py +6 -15
  32. autogluon/multimodal/learners/matching.py +59 -84
  33. autogluon/multimodal/learners/ner.py +23 -22
  34. autogluon/multimodal/learners/object_detection.py +26 -21
  35. autogluon/multimodal/learners/semantic_segmentation.py +16 -18
  36. autogluon/multimodal/models/__init__.py +12 -3
  37. autogluon/multimodal/models/augmenter.py +175 -0
  38. autogluon/multimodal/models/categorical_mlp.py +13 -8
  39. autogluon/multimodal/models/clip.py +92 -18
  40. autogluon/multimodal/models/custom_transformer.py +75 -75
  41. autogluon/multimodal/models/document_transformer.py +23 -9
  42. autogluon/multimodal/models/ft_transformer.py +40 -35
  43. autogluon/multimodal/models/fusion/base.py +2 -4
  44. autogluon/multimodal/models/fusion/fusion_mlp.py +82 -18
  45. autogluon/multimodal/models/fusion/fusion_ner.py +1 -1
  46. autogluon/multimodal/models/fusion/fusion_transformer.py +23 -23
  47. autogluon/multimodal/models/{huggingface_text.py → hf_text.py} +21 -2
  48. autogluon/multimodal/models/meta_transformer.py +336 -0
  49. autogluon/multimodal/models/mlp.py +6 -6
  50. autogluon/multimodal/models/mmocr_text_detection.py +1 -1
  51. autogluon/multimodal/models/mmocr_text_recognition.py +0 -1
  52. autogluon/multimodal/models/ner_text.py +1 -8
  53. autogluon/multimodal/models/numerical_mlp.py +14 -8
  54. autogluon/multimodal/models/sam.py +12 -2
  55. autogluon/multimodal/models/t_few.py +21 -5
  56. autogluon/multimodal/models/timm_image.py +74 -32
  57. autogluon/multimodal/models/utils.py +877 -16
  58. autogluon/multimodal/optim/__init__.py +17 -0
  59. autogluon/multimodal/{optimization → optim}/lit_distiller.py +2 -1
  60. autogluon/multimodal/{optimization → optim}/lit_matcher.py +4 -10
  61. autogluon/multimodal/{optimization → optim}/lit_mmdet.py +2 -10
  62. autogluon/multimodal/{optimization → optim}/lit_module.py +139 -14
  63. autogluon/multimodal/{optimization → optim}/lit_ner.py +3 -3
  64. autogluon/multimodal/{optimization → optim}/lit_semantic_seg.py +1 -1
  65. autogluon/multimodal/optim/losses/__init__.py +14 -0
  66. autogluon/multimodal/optim/losses/bce_loss.py +25 -0
  67. autogluon/multimodal/optim/losses/focal_loss.py +81 -0
  68. autogluon/multimodal/optim/losses/lemda_loss.py +39 -0
  69. autogluon/multimodal/optim/losses/rkd_loss.py +103 -0
  70. autogluon/multimodal/optim/losses/softmax_losses.py +177 -0
  71. autogluon/multimodal/optim/losses/structure_loss.py +26 -0
  72. autogluon/multimodal/optim/losses/utils.py +313 -0
  73. autogluon/multimodal/optim/lr/__init__.py +1 -0
  74. autogluon/multimodal/optim/lr/utils.py +332 -0
  75. autogluon/multimodal/optim/metrics/__init__.py +4 -0
  76. autogluon/multimodal/optim/metrics/coverage_metrics.py +42 -0
  77. autogluon/multimodal/optim/metrics/hit_rate_metrics.py +78 -0
  78. autogluon/multimodal/optim/metrics/ranking_metrics.py +231 -0
  79. autogluon/multimodal/optim/metrics/utils.py +359 -0
  80. autogluon/multimodal/optim/utils.py +284 -0
  81. autogluon/multimodal/predictor.py +51 -12
  82. autogluon/multimodal/utils/__init__.py +19 -45
  83. autogluon/multimodal/utils/cache.py +23 -2
  84. autogluon/multimodal/utils/checkpoint.py +58 -5
  85. autogluon/multimodal/utils/config.py +127 -55
  86. autogluon/multimodal/utils/device.py +120 -0
  87. autogluon/multimodal/utils/distillation.py +8 -8
  88. autogluon/multimodal/utils/download.py +1 -1
  89. autogluon/multimodal/utils/env.py +22 -0
  90. autogluon/multimodal/utils/export.py +3 -3
  91. autogluon/multimodal/utils/hpo.py +5 -5
  92. autogluon/multimodal/utils/inference.py +37 -4
  93. autogluon/multimodal/utils/install.py +91 -0
  94. autogluon/multimodal/utils/load.py +52 -47
  95. autogluon/multimodal/utils/log.py +6 -41
  96. autogluon/multimodal/utils/matcher.py +3 -2
  97. autogluon/multimodal/utils/onnx.py +0 -4
  98. autogluon/multimodal/utils/path.py +10 -0
  99. autogluon/multimodal/utils/precision.py +130 -0
  100. autogluon/multimodal/{presets.py → utils/presets.py} +259 -66
  101. autogluon/multimodal/{problem_types.py → utils/problem_types.py} +30 -1
  102. autogluon/multimodal/utils/save.py +47 -29
  103. autogluon/multimodal/utils/strategy.py +24 -0
  104. autogluon/multimodal/version.py +1 -1
  105. {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/METADATA +5 -5
  106. autogluon.multimodal-1.2.1b20250304.dist-info/RECORD +163 -0
  107. autogluon/multimodal/optimization/__init__.py +0 -16
  108. autogluon/multimodal/optimization/losses.py +0 -394
  109. autogluon/multimodal/optimization/utils.py +0 -1054
  110. autogluon/multimodal/utils/cloud_io.py +0 -80
  111. autogluon/multimodal/utils/data.py +0 -701
  112. autogluon/multimodal/utils/environment.py +0 -395
  113. autogluon/multimodal/utils/metric.py +0 -500
  114. autogluon/multimodal/utils/model.py +0 -558
  115. autogluon.multimodal-1.2.1b20250303.dist-info/RECORD +0 -145
  116. /autogluon/multimodal/{optimization → optim}/deepspeed.py +0 -0
  117. /autogluon/multimodal/{optimization/lr_scheduler.py → optim/lr/lr_schedulers.py} +0 -0
  118. /autogluon/multimodal/{optimization → optim/metrics}/semantic_seg_metrics.py +0 -0
  119. /autogluon/multimodal/{registry.py → utils/registry.py} +0 -0
  120. /autogluon.multimodal-1.2.1b20250303-py3.9-nspkg.pth → /autogluon.multimodal-1.2.1b20250304-py3.9-nspkg.pth +0 -0
  121. {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/LICENSE +0 -0
  122. {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/NOTICE +0 -0
  123. {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/WHEEL +0 -0
  124. {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/namespace_packages.txt +0 -0
  125. {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/top_level.txt +0 -0
  126. {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/zip-safe +0 -0
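Most of the churn in this release is a package-layout refactor: optimization/ becomes optim/ (with losses, metrics, and LR utilities split into subpackages), models/huggingface_text.py becomes models/hf_text.py, and presets.py, problem_types.py, and registry.py move under utils/. As a rough sketch of what the renames above mean for import paths, the module paths come from the rename list, while the exact symbols re-exported from each module (e.g. LitModule) are an assumption:

# Before (1.2.1b20250303) -- hypothetical imports against the old layout
# from autogluon.multimodal.optimization.lit_module import LitModule
# from autogluon.multimodal.models.huggingface_text import HFAutoModelForTextPrediction

# After (1.2.1b20250304) -- the same symbols under the new layout (names assumed unchanged)
from autogluon.multimodal.optim.lit_module import LitModule
from autogluon.multimodal.models.hf_text import HFAutoModelForTextPrediction

The hunks shown below cover two of the 126 files: models/custom_transformer.py (a parameter rename, d_token to token_dim, n_heads to num_heads, and related names) and models/document_transformer.py (new image_size / image_norm arguments).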
--- a/autogluon/multimodal/models/custom_transformer.py
+++ b/autogluon/multimodal/models/custom_transformer.py
@@ -108,10 +108,10 @@ class CLSToken(nn.Module):
     [1] Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" 2018
     """
 
-    def __init__(self, d_token: int, initialization: str) -> None:
+    def __init__(self, token_dim: int, initialization: str) -> None:
         """
         Args:
-            d_token: the size of token
+            token_dim: the size of token
             initialization: initialization policy for parameters. Must be one of
                 :code:`['uniform', 'normal']`. Let :code:`s = d ** -0.5`. Then, the
                 corresponding distributions are :code:`Uniform(-s, s)` and :code:`Normal(0, s)`. In
@@ -123,8 +123,8 @@ class CLSToken(nn.Module):
         """
         super().__init__()
         initialization_ = _TokenInitialization.from_str(initialization)
-        self.weight = nn.Parameter(Tensor(d_token))
-        initialization_.apply(self.weight, d_token)
+        self.weight = nn.Parameter(Tensor(token_dim))
+        initialization_.apply(self.weight, token_dim)
 
     def expand(self, *leading_dimensions: int) -> Tensor:
         """Expand (repeat) the underlying [CLS]-token to a tensor with the given leading dimensions.
@@ -192,8 +192,8 @@ class MultiheadAttention(nn.Module):
     def __init__(
         self,
         *,
-        d_token: int,
-        n_heads: int,
+        token_dim: int,
+        num_heads: int,
         dropout: float,
         bias: bool,
         initialization: str,
@@ -201,9 +201,9 @@ class MultiheadAttention(nn.Module):
         """
         Parameters
         ----------
-        d_token:
-            the token size. Must be a multiple of :code:`n_heads`.
-        n_heads:
+        token_dim:
+            the token size. Must be a multiple of :code:`num_heads`.
+        num_heads:
             the number of heads. If greater than 1, then the module will have
             an addition output layer (so called "mixing" layer).
         dropout:
@@ -221,15 +221,15 @@ class MultiheadAttention(nn.Module):
             AssertionError: if requirements for the inputs are not met.
         """
         super().__init__()
-        if n_heads > 1:
-            assert d_token % n_heads == 0, "d_token must be a multiple of n_heads"
+        if num_heads > 1:
+            assert token_dim % num_heads == 0, "token_dim must be a multiple of num_heads"
         assert initialization in ["kaiming", "xavier"]
 
-        self.W_q = nn.Linear(d_token, d_token, bias)
-        self.W_k = nn.Linear(d_token, d_token, bias)
-        self.W_v = nn.Linear(d_token, d_token, bias)
-        self.W_out = nn.Linear(d_token, d_token, bias) if n_heads > 1 else None
-        self.n_heads = n_heads
+        self.W_q = nn.Linear(token_dim, token_dim, bias)
+        self.W_k = nn.Linear(token_dim, token_dim, bias)
+        self.W_v = nn.Linear(token_dim, token_dim, bias)
+        self.W_out = nn.Linear(token_dim, token_dim, bias) if num_heads > 1 else None
+        self.num_heads = num_heads
         self.dropout = nn.Dropout(dropout) if dropout else None
 
         for m in [self.W_q, self.W_k, self.W_v]:
@@ -246,12 +246,12 @@ class MultiheadAttention(nn.Module):
             nn.init.zeros_(self.W_out.bias)
 
     def _reshape(self, x: Tensor) -> Tensor:
-        batch_size, n_tokens, d = x.shape
-        d_head = d // self.n_heads
+        batch_size, num_tokens, d = x.shape
+        head_dim = d // self.num_heads
         return (
-            x.reshape(batch_size, n_tokens, self.n_heads, d_head)
+            x.reshape(batch_size, num_tokens, self.num_heads, head_dim)
             .transpose(1, 2)
-            .reshape(batch_size * self.n_heads, n_tokens, d_head)
+            .reshape(batch_size * self.num_heads, num_tokens, head_dim)
         )
 
     def forward(
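The custom_transformer.py changes above and below are a pure rename (d_token to token_dim, n_heads to num_heads, and so on); the tensor math is untouched. As a quick sanity check, here is a self-contained sketch of the head-split reshape under the new names; the split_heads helper is illustrative, not part of the package:

import torch

def split_heads(x: torch.Tensor, num_heads: int) -> torch.Tensor:
    # (batch_size, num_tokens, token_dim) -> (batch_size * num_heads, num_tokens, head_dim)
    batch_size, num_tokens, token_dim = x.shape
    head_dim = token_dim // num_heads
    return (
        x.reshape(batch_size, num_tokens, num_heads, head_dim)
        .transpose(1, 2)
        .reshape(batch_size * num_heads, num_tokens, head_dim)
    )

x = torch.randn(2, 5, 8)                   # batch of 2, 5 tokens, token_dim = 8
print(split_heads(x, num_heads=4).shape)   # torch.Size([8, 5, 2])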
@@ -283,27 +283,27 @@ class MultiheadAttention(nn.Module):
         ), "If key_compression is (not) None, then value_compression must (not) be None"
         q, k, v = self.W_q(x_q), self.W_k(x_kv), self.W_v(x_kv)
         for tensor in [q, k, v]:
-            assert tensor.shape[-1] % self.n_heads == 0, _INTERNAL_ERROR_MESSAGE
+            assert tensor.shape[-1] % self.num_heads == 0, _INTERNAL_ERROR_MESSAGE
         if key_compression is not None:
             k = key_compression(k.transpose(1, 2)).transpose(1, 2)
             v = value_compression(v.transpose(1, 2)).transpose(1, 2)  # type: ignore
 
         batch_size = len(q)
-        d_head_key = k.shape[-1] // self.n_heads
-        d_head_value = v.shape[-1] // self.n_heads
+        head_dim_key = k.shape[-1] // self.num_heads
+        head_dim_value = v.shape[-1] // self.num_heads
         n_q_tokens = q.shape[1]
 
         q = self._reshape(q)
         k = self._reshape(k)
-        attention_logits = q @ k.transpose(1, 2) / math.sqrt(d_head_key)
+        attention_logits = q @ k.transpose(1, 2) / math.sqrt(head_dim_key)
         attention_probs = F.softmax(attention_logits, dim=-1)
         if self.dropout is not None:
             attention_probs = self.dropout(attention_probs)
         x = attention_probs @ self._reshape(v)
         x = (
-            x.reshape(batch_size, self.n_heads, n_q_tokens, d_head_value)
+            x.reshape(batch_size, self.num_heads, n_q_tokens, head_dim_value)
             .transpose(1, 2)
-            .reshape(batch_size, n_q_tokens, self.n_heads * d_head_value)
+            .reshape(batch_size, n_q_tokens, self.num_heads * head_dim_value)
         )
         if self.W_out is not None:
             x = self.W_out(x)
@@ -328,8 +328,8 @@ class AdditiveAttention(nn.Module):
     def __init__(
         self,
         *,
-        d_token: int,
-        n_heads: int,
+        token_dim: int,
+        num_heads: int,
         dropout: float,
         bias: bool,
         share_qv_weights: bool,
@@ -338,9 +338,9 @@ class AdditiveAttention(nn.Module):
         """
         Parameters
         ----------
-        d_token:
-            the token size. Must be a multiple of :code:`n_heads`.
-        n_heads:
+        token_dim:
+            the token size. Must be a multiple of :code:`num_heads`.
+        num_heads:
             the number of heads. If greater than 1, then the module will have
             an addition output layer (so called "mixing" layer).
         dropout:
@@ -357,26 +357,26 @@ class AdditiveAttention(nn.Module):
         """
         super().__init__()
 
-        assert d_token % n_heads == 0, "d_token must be a multiple of n_heads"
+        assert token_dim % num_heads == 0, "token_dim must be a multiple of num_heads"
         assert initialization in ["kaiming", "xavier"]
 
-        self.head_dim = d_token // n_heads
-        self.n_heads = n_heads
+        self.head_dim = token_dim // num_heads
+        self.num_heads = num_heads
         self.share_qv_weights = share_qv_weights
         self.dropout = nn.Dropout(dropout)
         trainable = []
         if share_qv_weights:
-            self.qv_proj = nn.Linear(d_token, d_token, bias=bias)
+            self.qv_proj = nn.Linear(token_dim, token_dim, bias=bias)
             trainable.extend([self.qv_proj])
         else:
-            self.q_proj = nn.Linear(d_token, d_token, bias=bias)
-            self.v_proj = nn.Linear(d_token, d_token, bias=bias)
+            self.q_proj = nn.Linear(token_dim, token_dim, bias=bias)
+            self.v_proj = nn.Linear(token_dim, token_dim, bias=bias)
             trainable.extend([self.q_proj, self.v_proj])
 
-        self.k_proj = nn.Linear(d_token, d_token, bias=bias)
-        self.W_q = nn.Linear(d_token, n_heads)
-        self.W_k = nn.Linear(d_token, n_heads)
-        self.r_out = nn.Linear(d_token, d_token)
+        self.k_proj = nn.Linear(token_dim, token_dim, bias=bias)
+        self.W_q = nn.Linear(token_dim, num_heads)
+        self.W_k = nn.Linear(token_dim, num_heads)
+        self.r_out = nn.Linear(token_dim, token_dim)
         trainable.extend([self.k_proj, self.W_q, self.W_k, self.r_out])
 
         if initialization == "xavier":
@@ -392,24 +392,24 @@ class AdditiveAttention(nn.Module):
         x_kv: Tensor,
         *args,  # Not used. just to make the input consistent with MultiheadAttention.
     ) -> Tuple[Tensor, Dict[str, Tensor]]:
-        batch_size, n_q_tokens, d_token = x_q.shape
-        batch_size, n_k_tokens, d_token = x_kv.shape
+        batch_size, n_q_tokens, token_dim = x_q.shape
+        batch_size, n_k_tokens, token_dim = x_kv.shape
 
         q = self.qv_proj(x_q) if self.share_qv_weights else self.q_proj(x_q)
         v = self.qv_proj(x_kv) if self.share_qv_weights else self.v_proj(x_kv)
         k = self.k_proj(x_kv)
 
         alphas = (self.W_q(q) / math.sqrt(self.head_dim)).softmax(dim=1)
-        q_r = q.reshape(batch_size, n_q_tokens, self.n_heads, self.head_dim)
+        q_r = q.reshape(batch_size, n_q_tokens, self.num_heads, self.head_dim)
         global_query = torch.einsum(" b s h, b s h d -> b h d", alphas, q_r)
-        global_query = global_query.reshape(batch_size, self.n_heads * self.head_dim).unsqueeze(1)
+        global_query = global_query.reshape(batch_size, self.num_heads * self.head_dim).unsqueeze(1)
 
         p = k * global_query
 
         betas = (self.W_k(p) / math.sqrt(self.head_dim)).softmax(dim=1)
-        p_r = p.reshape(batch_size, n_k_tokens, self.n_heads, self.head_dim)
+        p_r = p.reshape(batch_size, n_k_tokens, self.num_heads, self.head_dim)
         global_key = torch.einsum(" b s h, b s h d -> b h d", betas, p_r)
-        global_key = global_key.reshape(batch_size, self.n_heads * self.head_dim).unsqueeze(1)
+        global_key = global_key.reshape(batch_size, self.num_heads * self.head_dim).unsqueeze(1)
 
         u = v * global_key
         output = q + self.dropout(self.r_out(u))
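For orientation, the AdditiveAttention forward pass above (a Fastformer-style variant) pools the whole query sequence into one global query per head, which keeps the cost linear in sequence length. A minimal, illustrative restatement of just the global-query step with concrete shapes (not the package's code):

import math
import torch
import torch.nn as nn

batch_size, num_tokens, num_heads, head_dim = 2, 6, 4, 8
token_dim = num_heads * head_dim

q = torch.randn(batch_size, num_tokens, token_dim)
W_q = nn.Linear(token_dim, num_heads)                    # per-token, per-head pooling scores

alphas = (W_q(q) / math.sqrt(head_dim)).softmax(dim=1)   # (b, s, h), softmax over the token axis
q_r = q.reshape(batch_size, num_tokens, num_heads, head_dim)
global_query = torch.einsum("bsh,bshd->bhd", alphas, q_r)
print(global_query.shape)                                # torch.Size([2, 4, 8]) -- one query per head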
@@ -433,7 +433,7 @@ class Custom_Transformer(nn.Module):
         def __init__(
             self,
             *,
-            d_token: int,
+            token_dim: int,
             d_hidden: int,
             bias_first: bool,
             bias_second: bool,
@@ -442,13 +442,13 @@ class Custom_Transformer(nn.Module):
         ):
             super().__init__()
             self.linear_first = nn.Linear(
-                d_token,
+                token_dim,
                 d_hidden * (2 if _is_glu_activation(activation) else 1),
                 bias_first,
             )
             self.activation = _make_nn_module(activation)
             self.dropout = nn.Dropout(dropout)
-            self.linear_second = nn.Linear(d_hidden, d_token, bias_second)
+            self.linear_second = nn.Linear(d_hidden, token_dim, bias_second)
 
         def forward(self, x: Tensor) -> Tensor:
             x = self.linear_first(x)
@@ -484,13 +484,13 @@ class Custom_Transformer(nn.Module):
     def __init__(
         self,
         *,
-        d_token: int,
-        n_blocks: int,
-        attention_n_heads: int,
+        token_dim: int,
+        num_blocks: int,
+        attention_num_heads: int,
         attention_dropout: float,
         attention_initialization: str,
         attention_normalization: str,
-        ffn_d_hidden: int,
+        ffn_hidden_size: int,
         ffn_dropout: float,
         ffn_activation: str,
         ffn_normalization: str,
@@ -498,7 +498,7 @@ class Custom_Transformer(nn.Module):
         prenormalization: bool,
         first_prenormalization: bool,
         last_layer_query_idx: Union[None, List[int], slice],
-        n_tokens: Optional[int],
+        num_tokens: Optional[int],
         kv_compression_ratio: Optional[float],
         kv_compression_sharing: Optional[str],
         head_activation: ModuleType,
@@ -511,11 +511,11 @@ class Custom_Transformer(nn.Module):
         """
         Parameters
         ----------
-        d_token
+        token_dim
             The size of one token for `_CategoricalFeatureTokenizer`.
-        n_blocks
+        num_blocks
             Number of the `FT_Transformer` blocks, which should be non-negative.
-        attention_n_heads
+        attention_num_heads
             Number of attention heads in each `FT_Transformer` block, which should be positive.
         attention_dropout
             Dropout ratio for the Multi Headed Attention module.
@@ -523,7 +523,7 @@ class Custom_Transformer(nn.Module):
             Weights initialization scheme for Multi Headed Attention module.
         attention_normalization
             Normalization policy for attention layers. "layer_norm" is a good default.
-        ffn_d_hidden
+        ffn_hidden_size
             Number of the hidden nodes of the linear layers in the Feed-Forward Network module.
         ffn_dropout
             Dropout ratio of the hidden nodes of the linear layers in the Feed-Forward Network module.
@@ -535,7 +535,7 @@ class Custom_Transformer(nn.Module):
             Dropout ratio for the linear layers in FT_Transformer block.
         prenormalization, first_prenormalization
             Prenormalization to stabilize the training.
-        n_tokens
+        num_tokens
             Number of tokens of the input sequence.
         kv_compression_ratio
             The compression ration to reduce the input sequence length.
@@ -564,9 +564,9 @@ class Custom_Transformer(nn.Module):
         assert (
             not first_prenormalization
         ), "If `prenormalization` is False, then `first_prenormalization` must be False"
-        assert _all_or_none([n_tokens, kv_compression_ratio, kv_compression_sharing]), (
+        assert _all_or_none([num_tokens, kv_compression_ratio, kv_compression_sharing]), (
             "If any of the following arguments is (not) None, then all of them must (not) be None: "
-            "n_tokens, kv_compression_ratio, kv_compression_sharing"
+            "num_tokens, kv_compression_ratio, kv_compression_sharing"
         )
         assert (
             additive_attention or not share_qv_weights
@@ -595,9 +595,9 @@ class Custom_Transformer(nn.Module):
         )
 
         def make_kv_compression():
-            assert n_tokens and kv_compression_ratio, _INTERNAL_ERROR_MESSAGE  # for mypy
+            assert num_tokens and kv_compression_ratio, _INTERNAL_ERROR_MESSAGE  # for mypy
             # https://github.com/pytorch/fairseq/blob/1bba712622b8ae4efb3eb793a8a40da386fe11d0/examples/linformer/linformer_src/modules/multihead_linear_attention.py#L83
-            return nn.Linear(n_tokens, int(n_tokens * kv_compression_ratio), bias=False)
+            return nn.Linear(num_tokens, int(num_tokens * kv_compression_ratio), bias=False)
 
         self.shared_kv_compression = (
             make_kv_compression() if kv_compression_ratio and kv_compression_sharing == "layerwise" else None
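The make_kv_compression hunk above is the Linformer-style trick referenced by the fairseq link: keys and values are projected along the token axis before attention. A tiny worked example with made-up numbers (num_tokens=512, kv_compression_ratio=0.25):

import torch.nn as nn

num_tokens, kv_compression_ratio = 512, 0.25
key_compression = nn.Linear(num_tokens, int(num_tokens * kv_compression_ratio), bias=False)
print(key_compression)   # Linear(in_features=512, out_features=128, bias=False)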
@@ -607,12 +607,12 @@ class Custom_Transformer(nn.Module):
         self.last_layer_query_idx = last_layer_query_idx
 
         self.blocks = nn.ModuleList([])
-        for layer_idx in range(n_blocks):
+        for layer_idx in range(num_blocks):
             layer = nn.ModuleDict(
                 {
                     "attention": AdditiveAttention(
-                        d_token=d_token,
-                        n_heads=attention_n_heads,
+                        token_dim=token_dim,
+                        num_heads=attention_num_heads,
                         dropout=attention_dropout,
                         bias=True,
                         share_qv_weights=share_qv_weights,
@@ -620,15 +620,15 @@ class Custom_Transformer(nn.Module):
                     )
                     if additive_attention
                     else MultiheadAttention(
-                        d_token=d_token,
-                        n_heads=attention_n_heads,
+                        token_dim=token_dim,
+                        num_heads=attention_num_heads,
                         dropout=attention_dropout,
                         bias=True,
                         initialization=attention_initialization,
                     ),
                     "ffn": Custom_Transformer.FFN(
-                        d_token=d_token,
-                        d_hidden=ffn_d_hidden,
+                        token_dim=token_dim,
+                        d_hidden=ffn_hidden_size,
                         bias_first=True,
                         bias_second=True,
                         dropout=ffn_dropout,
@@ -640,8 +640,8 @@ class Custom_Transformer(nn.Module):
                 }
             )
             if layer_idx or not prenormalization or first_prenormalization:
-                layer["attention_normalization"] = _make_nn_module(attention_normalization, d_token)
-                layer["ffn_normalization"] = _make_nn_module(ffn_normalization, d_token)
+                layer["attention_normalization"] = _make_nn_module(attention_normalization, token_dim)
+                layer["ffn_normalization"] = _make_nn_module(ffn_normalization, token_dim)
             if kv_compression_ratio and self.shared_kv_compression is None:
                 layer["key_compression"] = make_kv_compression()
                 if kv_compression_sharing == "headwise":
@@ -652,7 +652,7 @@ class Custom_Transformer(nn.Module):
 
         self.head = (
             Custom_Transformer.Head(
-                d_in=d_token,
+                d_in=token_dim,
                 d_out=d_out,
                 bias=True,
                 activation=head_activation,  # type: ignore
@@ -691,7 +691,7 @@ class Custom_Transformer(nn.Module):
         return x
 
     def forward(self, x: Tensor) -> Tensor:
-        assert x.ndim == 3, "The input must have 3 dimensions: (n_objects, n_tokens, d_token)"
+        assert x.ndim == 3, "The input must have 3 dimensions: (n_objects, num_tokens, token_dim)"
         for layer_idx, layer in enumerate(self.blocks):
             layer = cast(nn.ModuleDict, layer)
 
--- a/autogluon/multimodal/models/document_transformer.py
+++ b/autogluon/multimodal/models/document_transformer.py
@@ -6,24 +6,18 @@ from transformers import logging as hf_logging
 
 from ..constants import (
     ATTENTION_MASK,
-    AUTOMM,
     BBOX,
-    COLUMN,
     COLUMN_FEATURES,
     FEATURES,
     IMAGE,
     INPUT_IDS,
-    LABEL,
     LOGITS,
     MASKS,
     PIXEL_VALUES,
-    TEXT_SEGMENT_IDS,
-    TEXT_TOKEN_IDS,
-    TEXT_VALID_LENGTH,
     TOKEN_TYPE_IDS,
 )
-from .huggingface_text import HFAutoModelForTextPrediction
-from .utils import get_column_features
+from .hf_text import HFAutoModelForTextPrediction
+from .utils import get_column_features, get_image_size_mean_std
 
 hf_logging.set_verbosity_error()
 
@@ -45,6 +39,8 @@ class DocumentTransformer(HFAutoModelForTextPrediction):
         low_cpu_mem_usage: Optional[bool] = False,
         pretrained: Optional[bool] = True,
         tokenizer_name: Optional[str] = "hf_auto",
+        image_size: Optional[int] = None,
+        image_norm: Optional[str] = None,
     ):
         """
         Load a pretrained huggingface layout-aware document transformer backbone.
@@ -77,8 +73,20 @@ class DocumentTransformer(HFAutoModelForTextPrediction):
             Whether using the pretrained weights. If pretrained=True, download the pretrained model.
         tokenizer_name
             Name of the huggingface tokenizer type.
+        image_norm
+            How to normalize an image. We now support:
+                - inception
+                    Normalize image by IMAGENET_INCEPTION_MEAN and IMAGENET_INCEPTION_STD from timm
+                - imagenet
+                    Normalize image by IMAGENET_DEFAULT_MEAN and IMAGENET_DEFAULT_STD from timm
+                - clip
+                    Normalize image by mean (0.48145466, 0.4578275, 0.40821073) and
+                    std (0.26862954, 0.26130258, 0.27577711), used for CLIP.
+        image_size
+            The provided width / height of a square image.
         """
-        logger.debug(f"initializing {checkpoint_name}")
+        logger.debug(f"initializing {prefix} (DocumentTransformer)")
+        logger.debug(f"model checkpoint: {checkpoint_name}")
         super().__init__(
             prefix=prefix,
             checkpoint_name=checkpoint_name,
@@ -89,6 +97,12 @@ class DocumentTransformer(HFAutoModelForTextPrediction):
             pretrained=pretrained,
             tokenizer_name=tokenizer_name,
         )
+        self.image_size, self.image_mean, self.image_std = get_image_size_mean_std(
+            model_name=self.prefix,
+            config=self.config,
+            provided_size=image_size,
+            provided_norm_type=image_norm,
+        )
         self.is_text_only_flag = self.is_text_only()
 
         if self.is_text_only_flag:
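The new image_size / image_norm arguments let DocumentTransformer resolve its image preprocessing statistics explicitly. The mapping sketched below is inferred from the docstring added above (the norm-type names and the CLIP statistics come from the diff; the timm constants are the standard ones) and is not the package's get_image_size_mean_std implementation:

from timm.data.constants import (
    IMAGENET_DEFAULT_MEAN,
    IMAGENET_DEFAULT_STD,
    IMAGENET_INCEPTION_MEAN,
    IMAGENET_INCEPTION_STD,
)

# image_norm -> (mean, std), per the docstring added in this release
NORM_STATS = {
    "imagenet": (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),       # (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)
    "inception": (IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD),  # (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
    "clip": ((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
}

mean, std = NORM_STATS["clip"]   # e.g. for a CLIP-style document backbone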