autogluon.multimodal 1.2.1b20250303__py3-none-any.whl → 1.2.1b20250305__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
- autogluon/multimodal/__init__.py +4 -2
- autogluon/multimodal/configs/data/default.yaml +4 -2
- autogluon/multimodal/configs/{environment → env}/default.yaml +2 -3
- autogluon/multimodal/configs/model/default.yaml +58 -11
- autogluon/multimodal/configs/{optimization → optim}/default.yaml +21 -4
- autogluon/multimodal/constants.py +16 -5
- autogluon/multimodal/data/__init__.py +14 -2
- autogluon/multimodal/data/dataset.py +2 -2
- autogluon/multimodal/data/infer_types.py +16 -2
- autogluon/multimodal/data/label_encoder.py +3 -3
- autogluon/multimodal/{utils → data}/nlpaug.py +4 -4
- autogluon/multimodal/data/preprocess_dataframe.py +55 -38
- autogluon/multimodal/data/process_categorical.py +35 -6
- autogluon/multimodal/data/process_document.py +59 -33
- autogluon/multimodal/data/process_image.py +198 -163
- autogluon/multimodal/data/process_label.py +7 -3
- autogluon/multimodal/data/process_mmlab/process_mmdet.py +1 -8
- autogluon/multimodal/data/process_mmlab/process_mmlab_base.py +2 -9
- autogluon/multimodal/data/process_mmlab/process_mmocr.py +1 -9
- autogluon/multimodal/data/process_ner.py +192 -4
- autogluon/multimodal/data/process_numerical.py +32 -5
- autogluon/multimodal/data/process_semantic_seg_img.py +23 -28
- autogluon/multimodal/data/process_text.py +95 -58
- autogluon/multimodal/data/template_engine.py +7 -9
- autogluon/multimodal/data/templates.py +0 -2
- autogluon/multimodal/data/trivial_augmenter.py +2 -2
- autogluon/multimodal/data/utils.py +564 -338
- autogluon/multimodal/learners/__init__.py +2 -1
- autogluon/multimodal/learners/base.py +189 -189
- autogluon/multimodal/learners/ensemble.py +748 -0
- autogluon/multimodal/learners/few_shot_svm.py +6 -15
- autogluon/multimodal/learners/matching.py +59 -84
- autogluon/multimodal/learners/ner.py +23 -22
- autogluon/multimodal/learners/object_detection.py +26 -21
- autogluon/multimodal/learners/semantic_segmentation.py +16 -18
- autogluon/multimodal/models/__init__.py +12 -3
- autogluon/multimodal/models/augmenter.py +175 -0
- autogluon/multimodal/models/categorical_mlp.py +13 -8
- autogluon/multimodal/models/clip.py +92 -18
- autogluon/multimodal/models/custom_transformer.py +75 -75
- autogluon/multimodal/models/document_transformer.py +23 -9
- autogluon/multimodal/models/ft_transformer.py +40 -35
- autogluon/multimodal/models/fusion/base.py +2 -4
- autogluon/multimodal/models/fusion/fusion_mlp.py +82 -18
- autogluon/multimodal/models/fusion/fusion_ner.py +1 -1
- autogluon/multimodal/models/fusion/fusion_transformer.py +23 -23
- autogluon/multimodal/models/{huggingface_text.py → hf_text.py} +21 -2
- autogluon/multimodal/models/meta_transformer.py +336 -0
- autogluon/multimodal/models/mlp.py +6 -6
- autogluon/multimodal/models/mmocr_text_detection.py +1 -1
- autogluon/multimodal/models/mmocr_text_recognition.py +0 -1
- autogluon/multimodal/models/ner_text.py +1 -8
- autogluon/multimodal/models/numerical_mlp.py +14 -8
- autogluon/multimodal/models/sam.py +12 -2
- autogluon/multimodal/models/t_few.py +21 -5
- autogluon/multimodal/models/timm_image.py +74 -32
- autogluon/multimodal/models/utils.py +877 -16
- autogluon/multimodal/optim/__init__.py +17 -0
- autogluon/multimodal/{optimization → optim}/lit_distiller.py +2 -1
- autogluon/multimodal/{optimization → optim}/lit_matcher.py +4 -10
- autogluon/multimodal/{optimization → optim}/lit_mmdet.py +2 -10
- autogluon/multimodal/{optimization → optim}/lit_module.py +139 -14
- autogluon/multimodal/{optimization → optim}/lit_ner.py +3 -3
- autogluon/multimodal/{optimization → optim}/lit_semantic_seg.py +1 -1
- autogluon/multimodal/optim/losses/__init__.py +14 -0
- autogluon/multimodal/optim/losses/bce_loss.py +25 -0
- autogluon/multimodal/optim/losses/focal_loss.py +81 -0
- autogluon/multimodal/optim/losses/lemda_loss.py +39 -0
- autogluon/multimodal/optim/losses/rkd_loss.py +103 -0
- autogluon/multimodal/optim/losses/softmax_losses.py +177 -0
- autogluon/multimodal/optim/losses/structure_loss.py +26 -0
- autogluon/multimodal/optim/losses/utils.py +313 -0
- autogluon/multimodal/optim/lr/__init__.py +1 -0
- autogluon/multimodal/optim/lr/utils.py +332 -0
- autogluon/multimodal/optim/metrics/__init__.py +4 -0
- autogluon/multimodal/optim/metrics/coverage_metrics.py +42 -0
- autogluon/multimodal/optim/metrics/hit_rate_metrics.py +78 -0
- autogluon/multimodal/optim/metrics/ranking_metrics.py +231 -0
- autogluon/multimodal/optim/metrics/utils.py +359 -0
- autogluon/multimodal/optim/utils.py +284 -0
- autogluon/multimodal/predictor.py +51 -12
- autogluon/multimodal/utils/__init__.py +19 -45
- autogluon/multimodal/utils/cache.py +23 -2
- autogluon/multimodal/utils/checkpoint.py +58 -5
- autogluon/multimodal/utils/config.py +127 -55
- autogluon/multimodal/utils/device.py +120 -0
- autogluon/multimodal/utils/distillation.py +8 -8
- autogluon/multimodal/utils/download.py +1 -1
- autogluon/multimodal/utils/env.py +22 -0
- autogluon/multimodal/utils/export.py +3 -3
- autogluon/multimodal/utils/hpo.py +5 -5
- autogluon/multimodal/utils/inference.py +37 -4
- autogluon/multimodal/utils/install.py +91 -0
- autogluon/multimodal/utils/load.py +52 -47
- autogluon/multimodal/utils/log.py +6 -41
- autogluon/multimodal/utils/matcher.py +3 -2
- autogluon/multimodal/utils/onnx.py +0 -4
- autogluon/multimodal/utils/path.py +10 -0
- autogluon/multimodal/utils/precision.py +130 -0
- autogluon/multimodal/{presets.py → utils/presets.py} +259 -66
- autogluon/multimodal/{problem_types.py → utils/problem_types.py} +30 -1
- autogluon/multimodal/utils/save.py +47 -29
- autogluon/multimodal/utils/strategy.py +24 -0
- autogluon/multimodal/version.py +1 -1
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/METADATA +5 -5
- autogluon.multimodal-1.2.1b20250305.dist-info/RECORD +163 -0
- autogluon/multimodal/optimization/__init__.py +0 -16
- autogluon/multimodal/optimization/losses.py +0 -394
- autogluon/multimodal/optimization/utils.py +0 -1054
- autogluon/multimodal/utils/cloud_io.py +0 -80
- autogluon/multimodal/utils/data.py +0 -701
- autogluon/multimodal/utils/environment.py +0 -395
- autogluon/multimodal/utils/metric.py +0 -500
- autogluon/multimodal/utils/model.py +0 -558
- autogluon.multimodal-1.2.1b20250303.dist-info/RECORD +0 -145
- /autogluon/multimodal/{optimization → optim}/deepspeed.py +0 -0
- /autogluon/multimodal/{optimization/lr_scheduler.py → optim/lr/lr_schedulers.py} +0 -0
- /autogluon/multimodal/{optimization → optim/metrics}/semantic_seg_metrics.py +0 -0
- /autogluon/multimodal/{registry.py → utils/registry.py} +0 -0
- /autogluon.multimodal-1.2.1b20250303-py3.9-nspkg.pth → /autogluon.multimodal-1.2.1b20250305-py3.9-nspkg.pth +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/LICENSE +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/NOTICE +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/WHEEL +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/namespace_packages.txt +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/top_level.txt +0 -0
- {autogluon.multimodal-1.2.1b20250303.dist-info → autogluon.multimodal-1.2.1b20250305.dist-info}/zip-safe +0 -0
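Most of this release is a package-layout refactor: optimization becomes optim, configs/environment becomes configs/env, and the top-level presets.py, problem_types.py, and registry.py move under utils/. As a rough sketch of how import paths shift, read off the rename list above (the exact set of public re-exports may differ; LitModule and get_automm_presets are examples assumed to live at these paths):

    # old layout (1.2.1b20250303):
    #   from autogluon.multimodal.optimization.lit_module import LitModule
    #   from autogluon.multimodal.presets import get_automm_presets

    # new layout (1.2.1b20250305):
    from autogluon.multimodal.optim.lit_module import LitModule
    from autogluon.multimodal.utils.presets import get_automm_presets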
autogluon/multimodal/__init__.py
CHANGED
@@ -1,8 +1,10 @@
+from autogluon.common.utils.log_utils import _add_stream_handler
+
 try:
     from .version import __version__
 except ImportError:
     pass
 
-from . import constants, data, learners, models, optimization, predictor, problem_types, utils
 from .predictor import MultiModalPredictor
-
+
+_add_stream_handler()
autogluon/multimodal/configs/data/default.yaml
CHANGED
@@ -7,6 +7,7 @@ data:
     minimum_cat_count: 100 # The minimum number of occurrences a category must have in the training data to avoid being considered a rare category.
     maximum_num_cat: 20 # The maximum amount of categories that can be considered non-rare.
     convert_to_text: False # Whether to convert the feature to text.
+    convert_to_text_template: "latex" # The template used to convert categorical to text. Choices are: "direct", "list", "text", "latex".
   numerical:
     convert_to_text: False # Whether to convert the feature to text.
     scaler_with_mean: True # Whether to normalize with mean.
@@ -14,7 +15,7 @@ data:
   document:
     missing_value_strategy: "zero" # How to deal with missing documents. By default, we use a zero document image to replace a missing document. We also support "skip", i.e., skipping a sample with missing documents.
   label:
-
+    numerical_preprocessing: "standardscaler" # The mode of numerical label preprocessing. Support "standardscaler" or "minmaxscaler" or None (means no transform).
   pos_label: # The name of binary classification's positive class. It's used in computing some metrics, e.g., roc_auc. If not provided, then use label_encoder.classes_[1],
   column_features_pooling_mode: "concat" # How to pool multi-column features into one feature vector. Currently only support "concat" or "mean" for few shot classification.
   mixup:
@@ -22,11 +23,12 @@ data:
     mixup_alpha: 0.8 # Mixup alpha.
     cutmix_alpha: 1.0 # Cutmix alpha.
     cutmix_minmax: # Cutmix min/max ratio, it will override cutmix alpha if set, a list/tuple with size two.
-    prob: 1.0 # The probability of conducting mixup/cutmix if
+    prob: 1.0 # The probability of conducting mixup/cutmix if enabled.
     switch_prob: 0.5 # The probability of switching mixup to cutmix if both enable.
     mode: "batch" # Perform mixup/cutmix on "batch" or "pair" or "elem".
     turn_off_epoch: 5 # The epoch when the mixup will be turned off.
     label_smoothing: 0.1 # Label smoothing.
+  modality_dropout: 0
   templates:
     turn_on: False
     num_templates: 30 # The number of templates to sample from uniformly.
autogluon/multimodal/configs/{environment → env}/default.yaml
CHANGED
@@ -3,11 +3,10 @@ env:
   num_nodes: 1
   batch_size: 128 # this is a desired batch size; pl trainer will accumulate gradients when per step batch is smaller.
   per_gpu_batch_size: 8 # training per gpu batch size
-
-  per_gpu_batch_size_evaluation: # This is deprecated. Use eval_batch_size_ratio instead.
+  inference_batch_size_ratio: 4 # per_gpu_batch_size_for_inference = per_gpu_batch_size * inference_batch_size_ratio
   precision: "16-mixed" # training precision. Refer to https://lightning.ai/docs/pytorch/stable/common/trainer.html#precision
   num_workers: 2 # pytorch training dataloader workers.
-
+  num_workers_inference: 2 # pytorch prediction/test dataloader workers.
   accelerator: "auto" # "cpu", "gpu", or "auto"
   fast_dev_run: False
   deterministic: False
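The deleted keys are the old evaluation-oriented names; inference_batch_size_ratio and num_workers_inference take their place. A minimal sketch of overriding them through hyperparameters, assuming the usual dotted env.* override mechanism still applies (train_df is a placeholder DataFrame):

    from autogluon.multimodal import MultiModalPredictor

    predictor = MultiModalPredictor(label="label")
    predictor.fit(
        train_df,  # placeholder: a pandas DataFrame with a "label" column
        hyperparameters={
            "env.per_gpu_batch_size": 8,
            "env.inference_batch_size_ratio": 4,  # inference uses 8 * 4 = 32 samples per GPU
            "env.num_workers_inference": 2,
        },
    )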
autogluon/multimodal/configs/model/default.yaml
CHANGED
@@ -4,21 +4,23 @@ model:
     hidden_size: 64
     activation: "leaky_relu"
     num_layers: 1
-
+    dropout: 0.1
     normalization: "layer_norm"
     data_types:
       - "categorical"
+
   numerical_mlp:
     hidden_size: 128
     activation: "leaky_relu"
     num_layers: 1
-
+    dropout: 0.1
     normalization: "layer_norm"
-
+    token_dim: 8
     embedding_arch:
     data_types:
       - "numerical"
     merge: "concat"
+
   hf_text:
     checkpoint_name: "google/electra-base-discriminator"
     gradient_checkpointing: False
@@ -39,6 +41,7 @@ model:
     # - "random_delete(0.05)" # less than 0.1 based on eda paper
     # - "syn_replacement(0.05)" # less than 0.1 based on eda paper
     # - "insert_punc(0.05)"
+
   ner_text:
     checkpoint_name: "bert-base-cased"
     max_text_len: 512
@@ -53,6 +56,7 @@ model:
     special_tags:
       - X # CLS, SEP, and non-first tokens of a word will be labelled as X
      - O # Outside of a named entity
+
   document_transformer:
     checkpoint_name: "microsoft/layoutlmv3-base" # document foundation models
     gradient_checkpointing: False
@@ -75,6 +79,7 @@ model:
     stochastic_chunk: False
     text_aug_detect_length: 10 # We perform text augmentation only if a text has more than text_detection_length words. It is used to differentiate text columns versus tabular columns that are treated as text.
     text_trivial_aug_maxscale: 0.0 # augmentation magnitude randomly drawn from [0, text_trivial_aug_maxscale]
+
   t_few:
     checkpoint_name: "t5-small" #"bigscience/T0_3B"
     gradient_checkpointing: False
@@ -91,6 +96,8 @@ model:
     stochastic_chunk: False
     text_aug_detect_length: 10 # We perform text augmentation only if a text has more than text_detection_length words. It is used to differentiate text columns versus tabular columns that are treated as text.
     text_trivial_aug_maxscale: 0.0 # augmentation magnititude randomly drawn from [0, text_trivial_aug_maxscale]
+    text_train_augment_types:
+
   timm_image:
     checkpoint_name: "swin_base_patch4_window7_224"
     mix_choice: "all_logits"
@@ -105,7 +112,9 @@ model:
       - "center_crop"
     image_norm: "imagenet"
     image_size: null
-
+    image_chan_num: 3
+    use_learnable_image: False
+    max_image_num_per_column: 1
 
   mmdet_image:
     checkpoint_name: "yolov3_mobilenetv2_8xb24-320-300e_coco"
@@ -161,7 +170,9 @@ model:
       - "center_crop"
     image_norm: "clip"
     image_size: 224
-
+    image_chan_num: 3
+    use_learnable_image: False
+    max_image_num_per_column: 1
     tokenizer_name: "clip"
     max_text_len: 77 # The maximum possible length.
     insert_sep: False
@@ -174,15 +185,17 @@ model:
     # - "random_delete(0.05)" # less than 0.1 based on eda paper
     # - "syn_replacement(0.05)" # less than 0.1 based on eda paper
     # - "insert_punc(0.05)"
+
   fusion_mlp:
-
+    aux_loss_weight:
     adapt_in_features: "max"
     hidden_sizes:
       - 128
     activation: "leaky_relu"
-
+    dropout: 0.1
     normalization: "layer_norm"
     data_types:
+
   fusion_ner:
     weight:
     adapt_in_features: "max"
@@ -192,21 +205,24 @@ model:
     drop_rate: 0.1
     normalization: "layer_norm"
     data_types:
+
   fusion_transformer:
+    aux_loss_weight:
     hidden_size: 192
-
-
+    num_blocks: 3
+    attention_num_heads: 8
     adapt_in_features: "max"
     attention_dropout: 0.2
     residual_dropout: 0.0
     ffn_dropout: 0.1
-
+    ffn_hidden_size: 192
     normalization: "layer_norm"
     ffn_activation: "geglu"
     head_activation: "relu"
     data_types:
     additive_attention: False # Whether to use lightweight additive attention, can be True, False or "auto"
     share_qv_weights: False # Whether to share weight for query and value, can be True, False or "auto"
+
   ft_transformer:
     data_types:
       - "categorical"
@@ -216,7 +232,7 @@ model:
     token_dim: 192
     hidden_size: 192
     num_blocks: 3
-
+    attention_num_heads: 8
     attention_dropout: 0.2
     residual_dropout: 0.0
     ffn_dropout: 0.1
@@ -247,3 +263,34 @@ model:
     frozen_layers: ["mask_decoder.iou_prediction_head", "prompt_encoder"]
     num_mask_tokens: 1
     ignore_label: 255
+
+  meta_transformer:
+    data_types:
+      - "image"
+      - "text"
+      - "categorical"
+      - "numerical"
+    checkpoint_path: null
+    model_version: "base"
+    requires_all_dtypes: False
+    train_transforms:
+      - "resize_shorter_side"
+      - "center_crop"
+      - "trivial_augment"
+    val_transforms:
+      - "resize_shorter_side"
+      - "center_crop"
+    image_norm: "imagenet"
+    image_size: 224
+    image_chan_num: 3
+    use_learnable_image: False
+    max_image_num_per_column: 1
+    tokenizer_name: "hf_auto"
+    max_text_len: 512 # If None or <=0, then use the max length of pretrained models.
+    insert_sep: True
+    text_segment_num: 2
+    stochastic_chunk: False
+    text_aug_detect_length: 10 # We perform text augmentation only if a text has more than text_detection_length words. It is used to differentiate text columns versus tabular columns that are treated as text.
+    text_trivial_aug_maxscale: 0.1 # augmentation magnitude randomly drawn from [0, text_trivial_aug_maxscale]
+    text_train_augment_types:
+    merge: "concat"
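The new meta_transformer block combines the image settings of timm_image with the text settings of hf_text, matching the new models/meta_transformer.py above. A sketch of opting into it, assuming model.names remains the usual backbone selector (the checkpoint path is a placeholder, since this config defaults it to null):

    predictor = MultiModalPredictor(label="label")
    predictor.fit(
        train_df,  # placeholder DataFrame
        hyperparameters={
            "model.names": ["meta_transformer"],
            "model.meta_transformer.checkpoint_path": "/path/to/meta_transformer_base.pth",  # placeholder
            "model.meta_transformer.model_version": "base",
        },
    )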
autogluon/multimodal/configs/{optimization → optim}/default.yaml
CHANGED
@@ -1,6 +1,6 @@
-optimization:
+optim:
   optim_type: "adamw"
-  learning_rate: 1.0e-4
+  lr: 1.0e-4
   weight_decay: 0.001
   lr_choice: "layerwise_decay"
   lr_decay: 0.9
@@ -18,11 +18,12 @@ optimization:
   gradient_clip_algorithm: "norm"
   track_grad_norm: -1 # Whether to check gradient norm. We can set it to 2 to check for gradient norm.
   log_every_n_steps: 10
+  label_smoothing: 0
   top_k: 3
   top_k_average_method:
     "greedy_soup" # We support averaging method described in https://arxiv.org/pdf/2203.05482.pdf.
     # Currently support "uniform_soup", "greedy_soup", and "best".
-
+  peft: null # Can be 'bit_fit' (only finetune bias), 'norm_fit' (finetune the normalization terms + bias terms), lora (LoRA Adaptations only), lora_bias (LoRA Adaptation + bit_fit), lora_norm (LoRA Adaptation + norm_fit), or null
   lora:
     module_filter: null # Specify which module (if any) to adapt(e.g. ".*EncDecAttention|.*DenseReluDense"). Default consider all modules in a model (i.e. empty filter).
     filter: # Specify which layer in a module to adapt. Default fine-tune only query and value attention weights, recommended in https://arxiv.org/abs/2106.09685
@@ -35,7 +36,7 @@ optimization:
     r: 8
     alpha: 8
     conv_lora_expert_num: 8 # default setting for Conv-LoRA
-
+  loss_func:
     "auto" # The replaced loss for regression. Can only support loss function in torch.nn.
     # example
     # "BCEWithLogitsLoss" or "nn.BCEWithLogitsloss"
@@ -48,3 +49,19 @@ optimization:
   loss_mask_weight: 5.0
   loss_dice_weight: 5.0
   extra_trainable_params: []
+  cross_modal_align: null
+  cross_modal_align_weight: 0
+  automatic_optimization: True
+  lemda:
+    turn_on: False
+    arch_type: "mlp_vae"
+    z_dim: 8
+    num_layers: 6
+    kld_weight: 0.1
+    mse_weight: 0.1
+    adv_weight: 1.0e-4
+    consist_weight: 0.01
+    consist_threshold: 0.5
+    lr: 1.0e-4
+    optim_type: "adamw"
+    weight_decay: 1.0e-5
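Besides the optimization: → optim: rename and the learning_rate → lr shorthand, the new lemda: block configures the VAE-based feature augmentation implemented by the new optim/losses/lemda_loss.py and models/augmenter.py files above. A sketch of the new-style dotted overrides (key names are taken from this file; whether the older optimization.* keys remain aliased is not shown in this diff):

    predictor.fit(
        train_df,  # placeholder DataFrame
        hyperparameters={
            "optim.lr": 1.0e-4,            # formerly optimization.learning_rate
            "optim.weight_decay": 0.001,
            "optim.lemda.turn_on": True,   # enable the LeMDA-style augmenter
            "optim.lemda.kld_weight": 0.1,
        },
    )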
autogluon/multimodal/constants.py
CHANGED
@@ -59,12 +59,17 @@ SEMANTIC_SEGMENTATION_GT = "semantic_segmentation_gt"
 
 # Output keys
 LOGITS = "logits"
+ORI_LOGITS = "ori_logits"
+AUG_LOGITS = "aug_logits"
 TEMPLATE_LOGITS = "template_logits"
 LM_TARGET = "lm_target"
 LOSS = "loss"
 OUTPUT = "output"
 WEIGHT = "weight"
 FEATURES = "features"
+MULTIMODAL_FEATURES = "multimodal_features"  # used for the adapted multimodal features before the fusion module
+MULTIMODAL_FEATURES_PRE_AUG = "multimodal_features_pre_aug"
+MULTIMODAL_FEATURES_POST_AUG = "multimodal_features_post_aug"
 RAW_FEATURES = "raw_features"
 MASKS = "masks"
 PROBABILITY = "probability"
@@ -73,6 +78,8 @@ BBOX = "bbox"
 ROIS = "rois"
 SCORE = "score"
 LOGIT_SCALE = "logit_scale"
+VAE_MEAN = "vae_mean"
+VAE_VAR = "vae_var"
 
 # Loss
 MOE_LOSS = "moe_loss"
@@ -142,6 +149,7 @@ FM = "fm"
 MAE = "mae"
 BER = "ber"
 IOU = "iou"
+COVERAGE = "coverage"
 RETRIEVAL_METRICS = [NDCG, PRECISION, RECALL, MRR]
 METRIC_MODE_MAP = {
     ACC: MAX,
@@ -168,6 +176,7 @@ METRIC_MODE_MAP = {
     SM: MAX,
     IOU: MAX,
     BER: MIN,
+    COVERAGE: MAX,
 }
 
 MATCHING_METRICS = {
@@ -179,7 +188,7 @@ MATCHING_METRICS_WITHOUT_PROBLEM_TYPE = [RECALL, NDCG]
 
 EVALUATION_METRICS = {
     # Use evaluation metrics from METRICS for these types
-    BINARY: METRICS[BINARY].keys(),
+    BINARY: list(METRICS[BINARY].keys()) + [COVERAGE],
     MULTICLASS: METRICS[MULTICLASS].keys(),
     REGRESSION: METRICS[REGRESSION].keys(),
     OBJECT_DETECTION: DETECTION_METRICS,
@@ -197,6 +206,7 @@ VALIDATION_METRICS = {
 # Training status
 TRAIN = "train"
 VALIDATE = "validate"
+VAL = "val"
 TEST = "test"
 PREDICT = "predict"
 
@@ -217,11 +227,11 @@ Y_TRUE = "y_true"
 # Configuration keys
 MODEL = "model"
 DATA = "data"
-OPTIMIZATION = "optimization"
-ENVIRONMENT = "environment"
+OPTIM = "optim"
+ENV = "env"
 DISTILLER = "distiller"
 MATCHER = "matcher"
-VALID_CONFIG_KEYS = [MODEL, DATA, OPTIMIZATION, ENVIRONMENT, DISTILLER, MATCHER]
+VALID_CONFIG_KEYS = [MODEL, DATA, OPTIM, ENV, DISTILLER, MATCHER]
 
 # Image normalization mean and std. This is only to normalize images for the CLIP model.
 CLIP_IMAGE_MEAN = (0.48145466, 0.4578275, 0.40821073)
@@ -275,7 +285,7 @@ PEFT_STRATEGIES = list(set(PEFT_ADDITIVE_STRATEGIES) | set(PEFT_NON_ADDITIVE_STR
 # DeepSpeed constants
 DEEPSPEED_OFFLOADING = "deepspeed_stage_3_offload"
 DEEPSPEED_STRATEGY = "deepspeed"
-DEEPSPEED_MODULE = "autogluon.multimodal.optimization.deepspeed"
+DEEPSPEED_MODULE = "autogluon.multimodal.optim.deepspeed"
 DEEPSPEED_MIN_PL_VERSION = "1.7.1"
 
 # registered model keys. TODO: document how to add new models.
@@ -298,6 +308,7 @@ DOCUMENT_TRANSFORMER = "document_transformer"
 HF_MODELS = (HF_TEXT, T_FEW, CLIP, NER_TEXT, DOCUMENT_TRANSFORMER)
 MMLAB_MODELS = (MMDET_IMAGE, MMOCR_TEXT_DET, MMOCR_TEXT_RECOG)
 SAM = "sam"
+META_TRANSFORMER = "meta_transformer"
 
 # matcher loss type
 CONTRASTIVE_LOSS = "contrastive_loss"
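COVERAGE is registered as a maximized metric and appended to the binary-classification evaluation metrics, backed by the new optim/metrics/coverage_metrics.py above. If evaluate accepts it like other metric names, requesting it would look like this sketch:

    scores = predictor.evaluate(test_df, metrics=["roc_auc", "coverage"])  # test_df: placeholder DataFrame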
autogluon/multimodal/data/__init__.py
CHANGED
@@ -1,4 +1,3 @@
-from . import collator, infer_types, randaug, utils
 from .datamodule import BaseDataModule
 from .dataset import BaseDataset
 from .dataset_mmlab import MultiImageMixDataset
@@ -9,8 +8,9 @@ from .infer_types import (
     infer_rois_column_type,
     is_image_column,
 )
-from .label_encoder import CustomLabelEncoder, NerLabelEncoder
 from .mixup import MixupModule
+from .infer_types import infer_column_types, infer_output_shape, infer_problem_type, is_image_column, infer_ner_column_type
+from .label_encoder import CustomLabelEncoder, NerLabelEncoder
 from .preprocess_dataframe import MultiModalFeaturePreprocessor
 from .process_categorical import CategoricalProcessor
 from .process_document import DocumentProcessor
@@ -21,3 +21,15 @@ from .process_ner import NerProcessor
 from .process_numerical import NumericalProcessor
 from .process_semantic_seg_img import SemanticSegImageProcessor
 from .process_text import TextProcessor
+from .utils import (
+    create_data_processor,
+    create_fusion_data_processors,
+    data_to_df,
+    get_detected_data_types,
+    get_mixup,
+    infer_dtypes_by_model_names,
+    infer_scarcity_mode_by_data_size,
+    init_df_preprocessor,
+    split_train_tuning_data,
+    turn_on_off_feature_column_info,
+)
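With the new re-export block, the data-pipeline helpers become importable directly from the data subpackage:

    from autogluon.multimodal.data import (
        create_fusion_data_processors,
        init_df_preprocessor,
        split_train_tuning_data,
    )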
autogluon/multimodal/data/dataset.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Union
 import pandas as pd
 import torch
 
-from ..constants import
+from ..constants import GET_ITEM_ERROR_RETRY
 from .preprocess_dataframe import MultiModalFeaturePreprocessor
 from .utils import apply_data_processor, apply_df_preprocessor, get_per_sample_features
 
@@ -100,7 +100,7 @@ class BaseDataset(torch.utils.data.Dataset):
             per_ret = apply_data_processor(
                 per_sample_features=per_sample_features,
                 data_processors=per_processors_group,
-
+                data_types=getattr(self, f"modality_types_{group_id}"),
                 is_training=self.is_training,
             )
             ret.update(per_ret)
autogluon/multimodal/data/infer_types.py
CHANGED
@@ -19,7 +19,6 @@ from ..constants import (
     DOCUMENT_IMAGE,
     DOCUMENT_PDF,
     IDENTIFIER,
-    IMAGE,
     IMAGE_BASE64_STR,
     IMAGE_BYTEARRAY,
     IMAGE_PATH,
@@ -37,7 +36,6 @@ from ..constants import (
     TEXT,
     TEXT_NER,
 )
-from .utils import is_rois_input
 
 logger = logging.getLogger(__name__)
 
@@ -114,6 +112,22 @@ def is_categorical_column(
     return False
 
 
+def is_rois_input(sample):
+    """
+    check if a sample is rois for object detection
+
+    Parameters
+    ----------
+    sample
+        The sampled data.
+
+    Returns
+    -------
+    bool, whether a sample is rois for object detection
+    """
+    return isinstance(sample, list) and len(sample) and isinstance(sample[0], list) and len(sample[0]) == 5
+
+
 def is_rois_column(data: pd.Series) -> bool:
     """
     Identify if a column is one rois column.
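is_rois_input moves here from data/utils.py, and the old `from .utils import is_rois_input` is dropped, removing a utils dependency from type inference. The check is purely structural:

    is_rois_input([[10, 20, 110, 220, 3]])  # True: a list of 5-element lists
    is_rois_input([[10, 20, 110, 220]])     # False: inner list has 4 elements
    is_rois_input([])                       # falsy (returns 0): empty outer list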
autogluon/multimodal/data/label_encoder.py
CHANGED
@@ -9,7 +9,7 @@ import pandas as pd
 from omegaconf import DictConfig, OmegaConf
 from sklearn.preprocessing import LabelEncoder
 
-from ..constants import
+from ..constants import END_OFFSET, ENTITY_GROUP, PROBABILITY, START_OFFSET
 
 logger = logging.getLogger(__name__)
 
@@ -137,12 +137,12 @@ class NerLabelEncoder:
         transformed_y
             A list of word level annotations.
         """
-        from .
+        from .process_ner import NerProcessor
 
         all_annotations, _ = self.extract_ner_annotations(y)
         transformed_y = []
         for annotation, text_snippet in zip(all_annotations, x.items()):
-            word_label, _, _, _ = process_ner_annotations(
+            word_label, _, _, _ = NerProcessor.process_ner_annotations(
                 annotation, text_snippet[-1], self.entity_map, tokenizer, is_eval=True
             )
             word_label_invers = []
autogluon/multimodal/{utils → data}/nlpaug.py
CHANGED
@@ -78,14 +78,14 @@ class InsertPunctuation(Augmenter):
         new = " ".join(new)
         return new
 
-    @
-    def clean(
+    @staticmethod
+    def clean(data):
         if isinstance(data, list):
             return [d.strip() if d else d for d in data]
         return data.strip()
 
-    @
-    def is_duplicate(
+    @staticmethod
+    def is_duplicate(dataset, data):
         for d in dataset:
             if d == data:
                 return True
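Both helpers become plain @staticmethods with explicit arguments, so they can be called without an instance:

    InsertPunctuation.clean("  hello world  ")       # "hello world"
    InsertPunctuation.clean(["  a ", None, " b"])    # ["a", None, "b"]
    InsertPunctuation.is_duplicate(["x", "y"], "y")  # True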