PyPI - autogluon.multimodal - Versions diffs - 1.2.1b20250302__py3-none-any.whl → 1.2.1b20250304__py3-none-any.whl - Mend

autogluon.multimodal 1.2.1b20250302__py3-none-any.whl → 1.2.1b20250304__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (126) hide show

autogluon/multimodal/__init__.py +4 -2
autogluon/multimodal/configs/data/default.yaml +4 -2
autogluon/multimodal/configs/{environment → env}/default.yaml +2 -3
autogluon/multimodal/configs/model/default.yaml +58 -11
autogluon/multimodal/configs/{optimization → optim}/default.yaml +21 -4
autogluon/multimodal/constants.py +16 -5
autogluon/multimodal/data/__init__.py +14 -2
autogluon/multimodal/data/dataset.py +2 -2
autogluon/multimodal/data/infer_types.py +16 -2
autogluon/multimodal/data/label_encoder.py +3 -3
autogluon/multimodal/{utils → data}/nlpaug.py +4 -4
autogluon/multimodal/data/preprocess_dataframe.py +55 -38
autogluon/multimodal/data/process_categorical.py +35 -6
autogluon/multimodal/data/process_document.py +59 -33
autogluon/multimodal/data/process_image.py +198 -163
autogluon/multimodal/data/process_label.py +7 -3
autogluon/multimodal/data/process_mmlab/process_mmdet.py +1 -8
autogluon/multimodal/data/process_mmlab/process_mmlab_base.py +2 -9
autogluon/multimodal/data/process_mmlab/process_mmocr.py +1 -9
autogluon/multimodal/data/process_ner.py +192 -4
autogluon/multimodal/data/process_numerical.py +32 -5
autogluon/multimodal/data/process_semantic_seg_img.py +23 -28
autogluon/multimodal/data/process_text.py +95 -58
autogluon/multimodal/data/template_engine.py +7 -9
autogluon/multimodal/data/templates.py +0 -2
autogluon/multimodal/data/trivial_augmenter.py +2 -2
autogluon/multimodal/data/utils.py +564 -338
autogluon/multimodal/learners/__init__.py +2 -1
autogluon/multimodal/learners/base.py +189 -189
autogluon/multimodal/learners/ensemble.py +748 -0
autogluon/multimodal/learners/few_shot_svm.py +6 -15
autogluon/multimodal/learners/matching.py +59 -84
autogluon/multimodal/learners/ner.py +23 -22
autogluon/multimodal/learners/object_detection.py +26 -21
autogluon/multimodal/learners/semantic_segmentation.py +16 -18
autogluon/multimodal/models/__init__.py +12 -3
autogluon/multimodal/models/augmenter.py +175 -0
autogluon/multimodal/models/categorical_mlp.py +13 -8
autogluon/multimodal/models/clip.py +92 -18
autogluon/multimodal/models/custom_transformer.py +75 -75
autogluon/multimodal/models/document_transformer.py +23 -9
autogluon/multimodal/models/ft_transformer.py +40 -35
autogluon/multimodal/models/fusion/base.py +2 -4
autogluon/multimodal/models/fusion/fusion_mlp.py +82 -18
autogluon/multimodal/models/fusion/fusion_ner.py +1 -1
autogluon/multimodal/models/fusion/fusion_transformer.py +23 -23
autogluon/multimodal/models/{huggingface_text.py → hf_text.py} +21 -2
autogluon/multimodal/models/meta_transformer.py +336 -0
autogluon/multimodal/models/mlp.py +6 -6
autogluon/multimodal/models/mmocr_text_detection.py +1 -1
autogluon/multimodal/models/mmocr_text_recognition.py +0 -1
autogluon/multimodal/models/ner_text.py +1 -8
autogluon/multimodal/models/numerical_mlp.py +14 -8
autogluon/multimodal/models/sam.py +12 -2
autogluon/multimodal/models/t_few.py +21 -5
autogluon/multimodal/models/timm_image.py +74 -32
autogluon/multimodal/models/utils.py +877 -16
autogluon/multimodal/optim/__init__.py +17 -0
autogluon/multimodal/{optimization → optim}/lit_distiller.py +2 -1
autogluon/multimodal/{optimization → optim}/lit_matcher.py +4 -10
autogluon/multimodal/{optimization → optim}/lit_mmdet.py +2 -10
autogluon/multimodal/{optimization → optim}/lit_module.py +139 -14
autogluon/multimodal/{optimization → optim}/lit_ner.py +3 -3
autogluon/multimodal/{optimization → optim}/lit_semantic_seg.py +1 -1
autogluon/multimodal/optim/losses/__init__.py +14 -0
autogluon/multimodal/optim/losses/bce_loss.py +25 -0
autogluon/multimodal/optim/losses/focal_loss.py +81 -0
autogluon/multimodal/optim/losses/lemda_loss.py +39 -0
autogluon/multimodal/optim/losses/rkd_loss.py +103 -0
autogluon/multimodal/optim/losses/softmax_losses.py +177 -0
autogluon/multimodal/optim/losses/structure_loss.py +26 -0
autogluon/multimodal/optim/losses/utils.py +313 -0
autogluon/multimodal/optim/lr/__init__.py +1 -0
autogluon/multimodal/optim/lr/utils.py +332 -0
autogluon/multimodal/optim/metrics/__init__.py +4 -0
autogluon/multimodal/optim/metrics/coverage_metrics.py +42 -0
autogluon/multimodal/optim/metrics/hit_rate_metrics.py +78 -0
autogluon/multimodal/optim/metrics/ranking_metrics.py +231 -0
autogluon/multimodal/optim/metrics/utils.py +359 -0
autogluon/multimodal/optim/utils.py +284 -0
autogluon/multimodal/predictor.py +51 -12
autogluon/multimodal/utils/__init__.py +19 -45
autogluon/multimodal/utils/cache.py +23 -2
autogluon/multimodal/utils/checkpoint.py +58 -5
autogluon/multimodal/utils/config.py +127 -55
autogluon/multimodal/utils/device.py +120 -0
autogluon/multimodal/utils/distillation.py +8 -8
autogluon/multimodal/utils/download.py +1 -1
autogluon/multimodal/utils/env.py +22 -0
autogluon/multimodal/utils/export.py +3 -3
autogluon/multimodal/utils/hpo.py +5 -5
autogluon/multimodal/utils/inference.py +37 -4
autogluon/multimodal/utils/install.py +91 -0
autogluon/multimodal/utils/load.py +52 -47
autogluon/multimodal/utils/log.py +6 -41
autogluon/multimodal/utils/matcher.py +3 -2
autogluon/multimodal/utils/onnx.py +0 -4
autogluon/multimodal/utils/path.py +10 -0
autogluon/multimodal/utils/precision.py +130 -0
autogluon/multimodal/{presets.py → utils/presets.py} +259 -66
autogluon/multimodal/{problem_types.py → utils/problem_types.py} +30 -1
autogluon/multimodal/utils/save.py +47 -29
autogluon/multimodal/utils/strategy.py +24 -0
autogluon/multimodal/version.py +1 -1
{autogluon.multimodal-1.2.1b20250302.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/METADATA +5 -5
autogluon.multimodal-1.2.1b20250304.dist-info/RECORD +163 -0
autogluon/multimodal/optimization/__init__.py +0 -16
autogluon/multimodal/optimization/losses.py +0 -394
autogluon/multimodal/optimization/utils.py +0 -1054
autogluon/multimodal/utils/cloud_io.py +0 -80
autogluon/multimodal/utils/data.py +0 -701
autogluon/multimodal/utils/environment.py +0 -395
autogluon/multimodal/utils/metric.py +0 -500
autogluon/multimodal/utils/model.py +0 -558
autogluon.multimodal-1.2.1b20250302.dist-info/RECORD +0 -145
/autogluon/multimodal/{optimization → optim}/deepspeed.py +0 -0
/autogluon/multimodal/{optimization/lr_scheduler.py → optim/lr/lr_schedulers.py} +0 -0
/autogluon/multimodal/{optimization → optim/metrics}/semantic_seg_metrics.py +0 -0
/autogluon/multimodal/{registry.py → utils/registry.py} +0 -0
/autogluon.multimodal-1.2.1b20250302-py3.9-nspkg.pth → /autogluon.multimodal-1.2.1b20250304-py3.9-nspkg.pth +0 -0
{autogluon.multimodal-1.2.1b20250302.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/LICENSE +0 -0
{autogluon.multimodal-1.2.1b20250302.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/NOTICE +0 -0
{autogluon.multimodal-1.2.1b20250302.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/WHEEL +0 -0
{autogluon.multimodal-1.2.1b20250302.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/namespace_packages.txt +0 -0
{autogluon.multimodal-1.2.1b20250302.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/top_level.txt +0 -0
{autogluon.multimodal-1.2.1b20250302.dist-info → autogluon.multimodal-1.2.1b20250304.dist-info}/zip-safe +0 -0

autogluon/multimodal/__init__.py CHANGED Viewed

@@ -1,8 +1,10 @@
+from autogluon.common.utils.log_utils import _add_stream_handler
 try:
     from .version import __version__
 except ImportError:
     pass
-from . import constants, data, learners, models, optimization, predictor, problem_types, utils
 from .predictor import MultiModalPredictor
-from .utils import download
+_add_stream_handler()

autogluon/multimodal/configs/data/default.yaml CHANGED Viewed

@@ -7,6 +7,7 @@ data:
     minimum_cat_count: 100  # The minimum number of occurrences a category must have in the training data to avoid being considered a rare category.
     maximum_num_cat: 20  # The maximum amount of categories that can be considered non-rare.
     convert_to_text: False  # Whether to convert the feature to text.
+    convert_to_text_template: "latex"  # The template used to convert categorical to text. Choices are: "direct", "list", "text", "latex".
   numerical:
     convert_to_text: False  # Whether to convert the feature to text.
     scaler_with_mean: True  # Whether to normalize with mean.
@@ -14,7 +15,7 @@ data:
   document:
     missing_value_strategy: "zero"  # How to deal with missing documents. By default, we use a zero document image to replace a missing document. We also support "skip", i.e., skipping a sample with missing documents.
   label:
-    numerical_label_preprocessing: "standardscaler"  # The mode of label preprocessing for . Support "standardscaler" or "minmaxscaler" or "none" / None (means no transform).
+    numerical_preprocessing: "standardscaler"  # The mode of numerical label preprocessing for . Support "standardscaler" or "minmaxscaler" or None (means no transform).
   pos_label:  # The name of binary classification's positive class. It's used in computing some metrics, e.g., roc_auc. If not provided, then use label_encoder.classes_[1],
   column_features_pooling_mode: "concat"  # How to pool multi-column features into one feature vector. Currently only support "concat" or "mean" for few shot classification.
   mixup:
@@ -22,11 +23,12 @@ data:
     mixup_alpha: 0.8  # Mixup alpha.
     cutmix_alpha: 1.0  # Cutmix alpha.
     cutmix_minmax:  # Cutmix min/max ratio, it will override cutmix alpha if set, a list/tuple with size two.
-    prob: 1.0  # The probability of conducting mixup/cutmix if enable.
+    prob: 1.0  # The probability of conducting mixup/cutmix if enabled.
     switch_prob: 0.5  # The probability of switching mixup to cutmix if both enable.
     mode: "batch"  # Perform mixup/cutmix on "batch" or "pair" or "elem".
     turn_off_epoch: 5  # The epoch when the mixup will be turned off.
     label_smoothing: 0.1  # Label smoothing.
+  modality_dropout: 0
   templates:
     turn_on: False
     num_templates: 30 # The number of templates to sample from uniformly.

autogluon/multimodal/configs/{environment → env}/default.yaml RENAMED Viewed

@@ -3,11 +3,10 @@ env:
   num_nodes: 1
   batch_size: 128  # this is a desired batch size; pl trainer will accumulate gradients when per step batch is smaller.
   per_gpu_batch_size: 8  # training per gpu batch size
-  eval_batch_size_ratio: 4  # per_gpu_batch_size_evaluation = per_gpu_batch_size * eval_batch_size_ratio
-  per_gpu_batch_size_evaluation:  # This is deprecated. Use eval_batch_size_ratio instead.
+  inference_batch_size_ratio: 4  # per_gpu_batch_size_for_inference = per_gpu_batch_size * inference_batch_size_ratio
   precision: "16-mixed"  # training precision. Refer to https://lightning.ai/docs/pytorch/stable/common/trainer.html#precision
   num_workers: 2  # pytorch training dataloader workers.
-  num_workers_evaluation: 2  # pytorch prediction/test dataloader workers.
+  num_workers_inference: 2  # pytorch prediction/test dataloader workers.
   accelerator: "auto"  # "cpu", "gpu", or "auto"
   fast_dev_run: False
   deterministic: False

autogluon/multimodal/configs/model/default.yaml CHANGED Viewed

@@ -4,21 +4,23 @@ model:
     hidden_size: 64
     activation: "leaky_relu"
     num_layers: 1
-    drop_rate: 0.1
+    dropout: 0.1
     normalization: "layer_norm"
     data_types:
       - "categorical"
   numerical_mlp:
     hidden_size: 128
     activation: "leaky_relu"
     num_layers: 1
-    drop_rate: 0.1
+    dropout: 0.1
     normalization: "layer_norm"
-    d_token: 8
+    token_dim: 8
     embedding_arch:
     data_types:
       - "numerical"
     merge: "concat"
   hf_text:
     checkpoint_name: "google/electra-base-discriminator"
     gradient_checkpointing: False
@@ -39,6 +41,7 @@ model:
       # - "random_delete(0.05)"        # less than 0.1 based on eda paper
       # - "syn_replacement(0.05)"  # less than 0.1 based on eda paper
       # - "insert_punc(0.05)"
   ner_text:
     checkpoint_name: "bert-base-cased"
     max_text_len: 512
@@ -53,6 +56,7 @@ model:
     special_tags:
       - X # CLS, SEP, and non-first tokens of a word will be labelled as X
       - O # Outside of a named entity
   document_transformer:
     checkpoint_name: "microsoft/layoutlmv3-base" # document foundation models
     gradient_checkpointing: False
@@ -75,6 +79,7 @@ model:
     stochastic_chunk: False
     text_aug_detect_length: 10                # We perform text augmentation only if a text has more than text_detection_length words. It is used to differentiate text columns versus tabular columns that are treated as text.
     text_trivial_aug_maxscale: 0.0            # augmentation magnitude randomly drawn from [0, text_trivial_aug_maxscale]
   t_few:
     checkpoint_name: "t5-small" #"bigscience/T0_3B"
     gradient_checkpointing: False
@@ -91,6 +96,8 @@ model:
     stochastic_chunk: False
     text_aug_detect_length: 10                # We perform text augmentation only if a text has more than text_detection_length words. It is used to differentiate text columns versus tabular columns that are treated as text.
     text_trivial_aug_maxscale: 0.0            # augmentation magnititude randomly drawn from [0, text_trivial_aug_maxscale]
+    text_train_augment_types:
   timm_image:
     checkpoint_name: "swin_base_patch4_window7_224"
     mix_choice: "all_logits"
@@ -105,7 +112,9 @@ model:
       - "center_crop"
     image_norm: "imagenet"
     image_size: null
-    max_img_num_per_col: 2
+    image_chan_num: 3
+    use_learnable_image: False
+    max_image_num_per_column: 1
   mmdet_image:
     checkpoint_name: "yolov3_mobilenetv2_8xb24-320-300e_coco"
@@ -161,7 +170,9 @@ model:
       - "center_crop"
     image_norm: "clip"
     image_size: 224
-    max_img_num_per_col: 2
+    image_chan_num: 3
+    use_learnable_image: False
+    max_image_num_per_column: 1
     tokenizer_name: "clip"
     max_text_len: 77  # The maximum possible length.
     insert_sep: False
@@ -174,15 +185,17 @@ model:
       # - "random_delete(0.05)"        # less than 0.1 based on eda paper
       # - "syn_replacement(0.05)"  # less than 0.1 based on eda paper
       # - "insert_punc(0.05)"
   fusion_mlp:
-    weight: 0.1
+    aux_loss_weight:
     adapt_in_features: "max"
     hidden_sizes:
       - 128
     activation: "leaky_relu"
-    drop_rate: 0.1
+    dropout: 0.1
     normalization: "layer_norm"
     data_types:
   fusion_ner:
     weight:
     adapt_in_features: "max"
@@ -192,21 +205,24 @@ model:
     drop_rate: 0.1
     normalization: "layer_norm"
     data_types:
   fusion_transformer:
+    aux_loss_weight:
     hidden_size: 192
-    n_blocks: 3
-    attention_n_heads: 8
+    num_blocks: 3
+    attention_num_heads: 8
     adapt_in_features: "max"
     attention_dropout: 0.2
     residual_dropout: 0.0
     ffn_dropout: 0.1
-    ffn_d_hidden: 192
+    ffn_hidden_size: 192
     normalization: "layer_norm"
     ffn_activation: "geglu"
     head_activation: "relu"
     data_types:
     additive_attention: False # Whether to use lightweight additive attention, can be True, False or "auto"
     share_qv_weights: False # Whether to share weight for query and value, can be True, False or "auto"
   ft_transformer:
     data_types:
       - "categorical"
@@ -216,7 +232,7 @@ model:
     token_dim: 192
     hidden_size: 192
     num_blocks: 3
-    attention_n_heads: 8
+    attention_num_heads: 8
     attention_dropout: 0.2
     residual_dropout: 0.0
     ffn_dropout: 0.1
@@ -247,3 +263,34 @@ model:
     frozen_layers: ["mask_decoder.iou_prediction_head", "prompt_encoder"]
     num_mask_tokens: 1
     ignore_label: 255
+  meta_transformer:
+    data_types:
+      - "image"
+      - "text"
+      - "categorical"
+      - "numerical"
+    checkpoint_path: null
+    model_version: "base"
+    requires_all_dtypes: False
+    train_transforms:
+      - "resize_shorter_side"
+      - "center_crop"
+      - "trivial_augment"
+    val_transforms:
+      - "resize_shorter_side"
+      - "center_crop"
+    image_norm: "imagenet"
+    image_size: 224
+    image_chan_num: 3
+    use_learnable_image: False
+    max_image_num_per_column: 1
+    tokenizer_name: "hf_auto"
+    max_text_len: 512  # If None or <=0, then use the max length of pretrained models.
+    insert_sep: True
+    text_segment_num: 2
+    stochastic_chunk: False
+    text_aug_detect_length: 10     # We perform text augmentation only if a text has more than text_detection_length words. It is used to differentiate text columns versus tabular columns that are treated as text.
+    text_trivial_aug_maxscale: 0.1  # augmentation magnitude randomly drawn from [0, text_trivial_aug_maxscale]
+    text_train_augment_types:
+    merge: "concat"

autogluon/multimodal/configs/{optimization → optim}/default.yaml RENAMED Viewed

@@ -1,6 +1,6 @@
-optimization:
+optim:
   optim_type: "adamw"
-  learning_rate: 1.0e-4
+  lr: 1.0e-4
   weight_decay: 0.001
   lr_choice: "layerwise_decay"
   lr_decay: 0.9
@@ -18,11 +18,12 @@ optimization:
   gradient_clip_algorithm: "norm"
   track_grad_norm: -1 # Whether to check gradient norm. We can set it to 2 to check for gradient norm.
   log_every_n_steps: 10
+  label_smoothing: 0
   top_k: 3
   top_k_average_method:
     "greedy_soup" # We support averaging method described in https://arxiv.org/pdf/2203.05482.pdf.
     # Currently support "uniform_soup", "greedy_soup", and "best".
-  efficient_finetune: null # Can be 'bit_fit' (only finetune bias), 'norm_fit' (finetune the normalization terms + bias terms), lora (LoRA Adaptations only), lora_bias (LoRA Adaptation + bit_fit), lora_norm (LoRA Adaptation + norm_fit), or null
+  peft: null # Can be 'bit_fit' (only finetune bias), 'norm_fit' (finetune the normalization terms + bias terms), lora (LoRA Adaptations only), lora_bias (LoRA Adaptation + bit_fit), lora_norm (LoRA Adaptation + norm_fit), or null
   lora:
     module_filter: null # Specify which module (if any) to adapt(e.g. ".*EncDecAttention|.*DenseReluDense"). Default consider all modules in a model (i.e. empty filter).
     filter: # Specify which layer in a module to adapt. Default fine-tune only query and value attention weights, recommended in https://arxiv.org/abs/2106.09685
@@ -35,7 +36,7 @@ optimization:
     r: 8
     alpha: 8
     conv_lora_expert_num: 8 # default setting for Conv-LoRA
-  loss_function:
+  loss_func:
     "auto" # The replaced loss for regression. Can only support loss function in torch.nn.
     # example
     # "BCEWithLogitsLoss" or "nn.BCEWithLogitsloss"
@@ -48,3 +49,19 @@ optimization:
     loss_mask_weight: 5.0
     loss_dice_weight: 5.0
   extra_trainable_params: []
+  cross_modal_align: null
+  cross_modal_align_weight: 0
+  automatic_optimization: True
+  lemda:
+    turn_on: False
+    arch_type: "mlp_vae"
+    z_dim: 8
+    num_layers: 6
+    kld_weight: 0.1
+    mse_weight: 0.1
+    adv_weight: 1.0e-4
+    consist_weight: 0.01
+    consist_threshold: 0.5
+    lr: 1.0e-4
+    optim_type: "adamw"
+    weight_decay: 1.0e-5

autogluon/multimodal/constants.py CHANGED Viewed

@@ -59,12 +59,17 @@ SEMANTIC_SEGMENTATION_GT = "semantic_segmentation_gt"
 # Output keys
 LOGITS = "logits"
+ORI_LOGITS = "ori_logits"
+AUG_LOGITS = "aug_logits"
 TEMPLATE_LOGITS = "template_logits"
 LM_TARGET = "lm_target"
 LOSS = "loss"
 OUTPUT = "output"
 WEIGHT = "weight"
 FEATURES = "features"
+MULTIMODAL_FEATURES = "multimodal_features"  # used for the adapted multimodal features before the fusion module
+MULTIMODAL_FEATURES_PRE_AUG = "multimodal_features_pre_aug"
+MULTIMODAL_FEATURES_POST_AUG = "multimodal_features_post_aug"
 RAW_FEATURES = "raw_features"
 MASKS = "masks"
 PROBABILITY = "probability"
@@ -73,6 +78,8 @@ BBOX = "bbox"
 ROIS = "rois"
 SCORE = "score"
 LOGIT_SCALE = "logit_scale"
+VAE_MEAN = "vae_mean"
+VAE_VAR = "vae_var"
 # Loss
 MOE_LOSS = "moe_loss"
@@ -142,6 +149,7 @@ FM = "fm"
 MAE = "mae"
 BER = "ber"
 IOU = "iou"
+COVERAGE = "coverage"
 RETRIEVAL_METRICS = [NDCG, PRECISION, RECALL, MRR]
 METRIC_MODE_MAP = {
     ACC: MAX,
@@ -168,6 +176,7 @@ METRIC_MODE_MAP = {
     SM: MAX,
     IOU: MAX,
     BER: MIN,
+    COVERAGE: MAX,
 }
 MATCHING_METRICS = {
@@ -179,7 +188,7 @@ MATCHING_METRICS_WITHOUT_PROBLEM_TYPE = [RECALL, NDCG]
 EVALUATION_METRICS = {
     # Use evaluation metrics from METRICS for these types
-    BINARY: METRICS[BINARY].keys(),
+    BINARY: list(METRICS[BINARY].keys()) + [COVERAGE],
     MULTICLASS: METRICS[MULTICLASS].keys(),
     REGRESSION: METRICS[REGRESSION].keys(),
     OBJECT_DETECTION: DETECTION_METRICS,
@@ -197,6 +206,7 @@ VALIDATION_METRICS = {
 # Training status
 TRAIN = "train"
 VALIDATE = "validate"
+VAL = "val"
 TEST = "test"
 PREDICT = "predict"
@@ -217,11 +227,11 @@ Y_TRUE = "y_true"
 # Configuration keys
 MODEL = "model"
 DATA = "data"
-OPTIMIZATION = "optimization"
-ENVIRONMENT = "environment"
+OPTIM = "optim"
+ENV = "env"
 DISTILLER = "distiller"
 MATCHER = "matcher"
-VALID_CONFIG_KEYS = [MODEL, DATA, OPTIMIZATION, ENVIRONMENT, DISTILLER, MATCHER]
+VALID_CONFIG_KEYS = [MODEL, DATA, OPTIM, ENV, DISTILLER, MATCHER]
 # Image normalization mean and std. This is only to normalize images for the CLIP model.
 CLIP_IMAGE_MEAN = (0.48145466, 0.4578275, 0.40821073)
@@ -275,7 +285,7 @@ PEFT_STRATEGIES = list(set(PEFT_ADDITIVE_STRATEGIES) | set(PEFT_NON_ADDITIVE_STR
 # DeepSpeed constants
 DEEPSPEED_OFFLOADING = "deepspeed_stage_3_offload"
 DEEPSPEED_STRATEGY = "deepspeed"
-DEEPSPEED_MODULE = "autogluon.multimodal.optimization.deepspeed"
+DEEPSPEED_MODULE = "autogluon.multimodal.optim.deepspeed"
 DEEPSPEED_MIN_PL_VERSION = "1.7.1"
 # registered model keys. TODO: document how to add new models.
@@ -298,6 +308,7 @@ DOCUMENT_TRANSFORMER = "document_transformer"
 HF_MODELS = (HF_TEXT, T_FEW, CLIP, NER_TEXT, DOCUMENT_TRANSFORMER)
 MMLAB_MODELS = (MMDET_IMAGE, MMOCR_TEXT_DET, MMOCR_TEXT_RECOG)
 SAM = "sam"
+META_TRANSFORMER = "meta_transformer"
 # matcher loss type
 CONTRASTIVE_LOSS = "contrastive_loss"

autogluon/multimodal/data/__init__.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from . import collator, infer_types, randaug, utils
 from .datamodule import BaseDataModule
 from .dataset import BaseDataset
 from .dataset_mmlab import MultiImageMixDataset
@@ -9,8 +8,9 @@ from .infer_types import (
     infer_rois_column_type,
     is_image_column,
 )
-from .label_encoder import CustomLabelEncoder, NerLabelEncoder
 from .mixup import MixupModule
+from .infer_types import infer_column_types, infer_output_shape, infer_problem_type, is_image_column, infer_ner_column_type
+from .label_encoder import CustomLabelEncoder, NerLabelEncoder
 from .preprocess_dataframe import MultiModalFeaturePreprocessor
 from .process_categorical import CategoricalProcessor
 from .process_document import DocumentProcessor
@@ -21,3 +21,15 @@ from .process_ner import NerProcessor
 from .process_numerical import NumericalProcessor
 from .process_semantic_seg_img import SemanticSegImageProcessor
 from .process_text import TextProcessor
+from .utils import (
+    create_data_processor,
+    create_fusion_data_processors,
+    data_to_df,
+    get_detected_data_types,
+    get_mixup,
+    infer_dtypes_by_model_names,
+    infer_scarcity_mode_by_data_size,
+    init_df_preprocessor,
+    split_train_tuning_data,
+    turn_on_off_feature_column_info,
+)

autogluon/multimodal/data/dataset.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Union
 import pandas as pd
 import torch
-from ..constants import AUTOMM, GET_ITEM_ERROR_RETRY
+from ..constants import GET_ITEM_ERROR_RETRY
 from .preprocess_dataframe import MultiModalFeaturePreprocessor
 from .utils import apply_data_processor, apply_df_preprocessor, get_per_sample_features
@@ -100,7 +100,7 @@ class BaseDataset(torch.utils.data.Dataset):
                 per_ret = apply_data_processor(
                     per_sample_features=per_sample_features,
                     data_processors=per_processors_group,
-                    feature_modalities=getattr(self, f"modality_types_{group_id}"),
+                    data_types=getattr(self, f"modality_types_{group_id}"),
                     is_training=self.is_training,
                 )
                 ret.update(per_ret)

autogluon/multimodal/data/infer_types.py CHANGED Viewed

@@ -19,7 +19,6 @@ from ..constants import (
     DOCUMENT_IMAGE,
     DOCUMENT_PDF,
     IDENTIFIER,
-    IMAGE,
     IMAGE_BASE64_STR,
     IMAGE_BYTEARRAY,
     IMAGE_PATH,
@@ -37,7 +36,6 @@ from ..constants import (
     TEXT,
     TEXT_NER,
 )
-from .utils import is_rois_input
 logger = logging.getLogger(__name__)
@@ -114,6 +112,22 @@ def is_categorical_column(
             return False
+def is_rois_input(sample):
+    """
+    check if a sample is rois for object detection
+    Parameters
+    ----------
+    sample
+        The sampled data.
+    Returns
+    -------
+    bool, whether a sample is rois for object detection
+    """
+    return isinstance(sample, list) and len(sample) and isinstance(sample[0], list) and len(sample[0]) == 5
 def is_rois_column(data: pd.Series) -> bool:
     """
     Identify if a column is one rois column.

autogluon/multimodal/data/label_encoder.py CHANGED Viewed

@@ -9,7 +9,7 @@ import pandas as pd
 from omegaconf import DictConfig, OmegaConf
 from sklearn.preprocessing import LabelEncoder
-from ..constants import AUTOMM, END_OFFSET, ENTITY_GROUP, NER_ANNOTATION, PROBABILITY, START_OFFSET
+from ..constants import END_OFFSET, ENTITY_GROUP, PROBABILITY, START_OFFSET
 logger = logging.getLogger(__name__)
@@ -137,12 +137,12 @@ class NerLabelEncoder:
         transformed_y
             A list of word level annotations.
         """
-        from .utils import process_ner_annotations
+        from .process_ner import NerProcessor
         all_annotations, _ = self.extract_ner_annotations(y)
         transformed_y = []
         for annotation, text_snippet in zip(all_annotations, x.items()):
-            word_label, _, _, _ = process_ner_annotations(
+            word_label, _, _, _ = NerProcessor.process_ner_annotations(
                 annotation, text_snippet[-1], self.entity_map, tokenizer, is_eval=True
             )
             word_label_invers = []

autogluon/multimodal/{utils → data}/nlpaug.py RENAMED Viewed

@@ -78,14 +78,14 @@ class InsertPunctuation(Augmenter):
         new = " ".join(new)
         return new
-    @classmethod
-    def clean(cls, data):
+    @staticmethod
+    def clean(data):
         if isinstance(data, list):
             return [d.strip() if d else d for d in data]
         return data.strip()
-    @classmethod
-    def is_duplicate(cls, dataset, data):
+    @staticmethod
+    def is_duplicate(dataset, data):
         for d in dataset:
             if d == data:
                 return True

autogluon.multimodal 1.2.1b20250302__py3-none-any.whl → 1.2.1b20250304__py3-none-any.whl

autogluon.multimodal 1.2.1b20250302__py3-none-any.whl → 1.2.1b20250304__py3-none-any.whl