evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Note: this release of evalscope has been flagged as potentially problematic.

Files changed (214)
  1. evalscope/arguments.py +2 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +4 -4
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/__init__.py +2 -2
  12. evalscope/benchmarks/aigc/__init__.py +0 -0
  13. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  14. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  15. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  16. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  17. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  18. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  19. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  20. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  21. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  22. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  23. evalscope/benchmarks/arc/arc_adapter.py +2 -2
  24. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  25. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  26. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  27. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  28. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  29. evalscope/benchmarks/data_adapter.py +21 -10
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  35. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
  36. evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
  37. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  38. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  39. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
  41. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  42. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  43. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  44. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  45. evalscope/benchmarks/utils.py +7 -16
  46. evalscope/cli/start_app.py +1 -1
  47. evalscope/collections/evaluator.py +20 -6
  48. evalscope/config.py +8 -4
  49. evalscope/constants.py +11 -0
  50. evalscope/evaluator/evaluator.py +2 -2
  51. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  52. evalscope/metrics/__init__.py +49 -4
  53. evalscope/metrics/llm_judge.py +1 -1
  54. evalscope/metrics/named_metrics.py +13 -0
  55. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  56. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  57. evalscope/metrics/t2v_metrics/constants.py +12 -0
  58. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  59. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  60. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  61. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  62. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  63. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  64. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  65. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  66. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  67. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  68. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  69. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  70. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  71. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  72. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  73. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  74. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  75. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  76. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  77. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  139. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  140. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  141. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  142. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  143. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  144. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  145. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  146. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  147. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  148. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  149. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  154. evalscope/metrics/t2v_metrics/score.py +78 -0
  155. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  156. evalscope/models/__init__.py +50 -14
  157. evalscope/models/adapters/__init__.py +17 -0
  158. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  159. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  160. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  161. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  162. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  163. evalscope/models/adapters/t2i_adapter.py +76 -0
  164. evalscope/models/custom/__init__.py +2 -1
  165. evalscope/models/custom/dummy_model.py +11 -13
  166. evalscope/models/local_model.py +82 -33
  167. evalscope/models/model.py +2 -42
  168. evalscope/models/register.py +26 -0
  169. evalscope/perf/arguments.py +24 -5
  170. evalscope/perf/benchmark.py +28 -42
  171. evalscope/perf/http_client.py +2 -3
  172. evalscope/perf/plugin/api/custom_api.py +1 -1
  173. evalscope/perf/plugin/api/openai_api.py +2 -2
  174. evalscope/perf/plugin/datasets/custom.py +4 -1
  175. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  176. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  177. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  178. evalscope/perf/plugin/datasets/openqa.py +4 -1
  179. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  180. evalscope/perf/utils/benchmark_util.py +14 -8
  181. evalscope/perf/utils/db_util.py +9 -3
  182. evalscope/perf/utils/log_utils.py +41 -0
  183. evalscope/report/__init__.py +1 -0
  184. evalscope/report/app.py +128 -78
  185. evalscope/report/app_arguments.py +11 -0
  186. evalscope/report/generator.py +1 -1
  187. evalscope/run.py +10 -3
  188. evalscope/summarizer.py +2 -1
  189. evalscope/third_party/thinkbench/eval.py +19 -7
  190. evalscope/utils/chat_service.py +2 -2
  191. evalscope/utils/import_utils.py +66 -0
  192. evalscope/utils/utils.py +48 -29
  193. evalscope/version.py +2 -2
  194. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
  195. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
  196. tests/aigc/__init__.py +1 -0
  197. tests/aigc/test_t2i.py +87 -0
  198. tests/cli/test_all.py +4 -4
  199. tests/cli/test_collection.py +2 -1
  200. tests/cli/test_run.py +19 -12
  201. tests/perf/test_perf.py +3 -3
  202. tests/rag/test_clip_benchmark.py +0 -1
  203. tests/rag/test_mteb.py +37 -8
  204. tests/rag/test_ragas.py +29 -26
  205. tests/vlm/test_vlmeval.py +37 -1
  206. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  207. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  208. evalscope/metrics/code_metric.py +0 -98
  209. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  210. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  211. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
  212. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
  213. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
  214. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py
@@ -0,0 +1,202 @@
+"""
+Copyright (c) 2022, salesforce.com, inc.
+All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import torch
+import torch.nn.functional as F
+import warnings
+from torch import nn
+
+from ...common.registry import registry
+from ..med import XBertEncoder
+from ..vit import VisionTransformerEncoder
+from .blip import BlipBase
+from .blip_outputs import BlipOutputFeatures
+
+
+@registry.register_model('blip_feature_extractor')
+class BlipFeatureExtractor(BlipBase):
+    """
+    Class for BLIP feature extractor.
+
+    Supported model types:
+        - base: BLIP base model with pre-trained weights from capfilt by BLIP large model.
+
+    Usage:
+        >>> from lavis.models import load_model
+        >>> model = load_model("blip_feature_extractor", "base")
+    """
+
+    PRETRAINED_MODEL_CONFIG_DICT = {
+        'base': 'configs/models/blip_feature_extractor_base.yaml',
+        # "large": "configs/models/blip_feature_extractor_large.yaml",
+    }
+
+    def __init__(self, image_encoder, text_encoder, embed_dim, max_txt_len=40):
+        super().__init__()
+
+        self.tokenizer = self.init_tokenizer()
+
+        self.visual_encoder = image_encoder
+        self.text_encoder = text_encoder
+
+        # creating projection layers for ITC
+        text_width = text_encoder.config.hidden_size
+        vision_width = image_encoder.vision_width
+
+        self.vision_proj = nn.Linear(vision_width, embed_dim)
+        self.text_proj = nn.Linear(text_width, embed_dim)
+
+        self.max_txt_len = max_txt_len
+
+        self.temp = nn.Parameter(0.07 * torch.ones([]))
+
+    @torch.no_grad()
+    def extract_features(self, samples, mode='multimodal'):
+        """
+        Extract features for multimodal or unimodal samples.
+
+        Args:
+            samples (dict): A dictionary of samples, containing the following keys:
+                - image (torch.Tensor): A tensor of shape (B, C, H, W) containing the image.
+                  Raw images should be preprocessed before being passed to feature extractor.
+                - text_input (list): A list of strings containing the text, length B.
+            mode (str): The mode of feature extraction. Can be either "multimodal", "text" or "image".
+                If "multimodal", return image features and multimodal features;
+                if "text", return text features;
+                if "image", return image features.
+                Default: "multimodal".
+
+        Returns:
+            BlipOutputFeatures: A BlipOutputFeatures object containing the features.
+            See lavis/models/blip_models/blip_outputs.py for more details.
+
+        Examples:
+        ```python
+            >>> from PIL import Image
+            >>> from lavis.models import load_model_and_preprocess
+            >>> raw_image = Image.open("docs/data/merlion.png").convert("RGB")
+            >>> caption = "a large fountain spewing water into the air"
+            >>> model, vis_processors, txt_processors = load_model_and_preprocess("blip_feature_extractor", is_eval=True)
+            >>> image = vis_processors["eval"](raw_image).unsqueeze(0)
+            >>> text_input = txt_processors["eval"](caption)
+
+            >>> sample = {"image": image, "text_input": [text_input]}
+
+            >>> features_multimodal = model.extract_features(sample)
+            >>> features_multimodal.keys()
+            odict_keys(['image_embeds', 'multimodal_embeds'])
+            >>> features_multimodal.image_embeds.shape
+            torch.Size([1, 197, 768])
+            >>> features_multimodal.multimodal_embeds.shape
+            torch.Size([1, 12, 768])
+
+            >>> features_text = model.extract_features(sample, mode="text")
+            >>> features_text.keys()
+            odict_keys(['text_embeds', 'text_features'])
+            >>> features_text.text_embeds.shape
+            torch.Size([1, 12, 768])
+            >>> features_text.text_features.shape
+            torch.Size([1, 12, 256])
+
+            >>> features_image = model.extract_features(sample, mode="image")
+            >>> features_image.keys()
+            odict_keys(['image_embeds', 'image_features'])
+            >>> features_image.image_embeds.shape
+            torch.Size([1, 197, 768])
+            >>> features_image.image_features.shape
+            torch.Size([1, 197, 256])
+        ```
+        """
+        image = samples.get('image')
+        caption = samples.get('text_input')
+
+        # assert mode is one of "image", "text", "multimodal"
+        assert mode in [
+            'image',
+            'text',
+            'multimodal',
+        ], "mode must be one of 'image', 'text', 'multimodal'"
+
+        # initalize output
+        image_embeds, text_embeds, multimodal_embeds = None, None, None
+        image_features, text_features = None, None
+
+        if mode == 'image':
+            assert (image is not None), "Image is not provided for mode 'image' or 'multimodal'"
+            # return image features
+            image_embeds = self.visual_encoder.forward_features(image)
+
+            image_features = self.vision_proj(image_embeds)
+            image_features = F.normalize(image_features, dim=-1)
+
+        elif mode == 'text':
+            assert (caption is not None), "text input is None for mode 'text' or 'multimodal'"
+
+            text = self.tokenizer(caption, return_tensors='pt', padding=True).to(self.device)
+
+            # return text features
+            text_output = self.text_encoder(
+                text.input_ids,
+                attention_mask=text.attention_mask,
+                return_dict=True,
+                mode='text',
+            )
+            text_embeds = text_output.last_hidden_state
+
+            text_features = self.text_proj(text_embeds)
+            text_features = F.normalize(text_features, dim=-1)
+
+        elif mode == 'multimodal':
+            # return multimodel features
+            image_embeds = self.visual_encoder.forward_features(image)
+            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(self.device)
+
+            text = self.tokenizer(caption, return_tensors='pt', padding=True).to(self.device)
+            text.input_ids[:, 0] = self.tokenizer.enc_token_id
+
+            output = self.text_encoder(
+                text.input_ids,
+                attention_mask=text.attention_mask,
+                encoder_hidden_states=image_embeds,
+                encoder_attention_mask=image_atts,
+                return_dict=True,
+            )
+
+            multimodal_embeds = output.last_hidden_state
+
+        return BlipOutputFeatures(
+            image_embeds=image_embeds,
+            image_embeds_proj=image_features,
+            text_embeds=text_embeds,
+            text_embeds_proj=text_features,
+            multimodal_embeds=multimodal_embeds,
+        )
+
+    @classmethod
+    def from_config(cls, cfg=None):
+        # set from_pretrained=True to load weights for 'bert-base-uncased'
+        image_encoder = VisionTransformerEncoder.from_config(cfg)
+        text_encoder = XBertEncoder.from_config(cfg)
+
+        embed_dim = cfg.get('embed_dim', 256)
+        max_txt_len = cfg.get('max_txt_len', 30)
+
+        model = cls(
+            image_encoder=image_encoder,
+            text_encoder=text_encoder,
+            embed_dim=embed_dim,
+            max_txt_len=max_txt_len,
+        )
+
+        # load pre-trained weights
+        pretrain_path = cfg.get('pretrained', None)
+        if pretrain_path is not None:
+            msg = model.load_from_pretrained(url_or_filename=pretrain_path)
+        else:
+            warnings.warn('No pretrained weights are loaded.')
+
+        return model
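
The diff itself does not show how evalscope drives this class, but the API added above is self-describing: `extract_features` returns a `BlipOutputFeatures` whose `*_embeds_proj` fields are already L2-normalized, with the [CLS] vector at index 0. Below is a minimal sketch of ITC-style image-text similarity built only from those methods; the `model` and `sample` objects are assumed to be prepared as in the docstring example and are not part of this release.

```python
# Sketch only: image-text similarity from BlipFeatureExtractor's unimodal features.
# `model` is a loaded BlipFeatureExtractor; `sample` = {"image": ..., "text_input": [...]}
# prepared as in the docstring example above (both are assumptions, not code from the diff).
def itc_similarity(model, sample):
    img_feats = model.extract_features(sample, mode="image")
    txt_feats = model.extract_features(sample, mode="text")
    # Compare the [CLS] positions of the normalized projections.
    img = img_feats.image_embeds_proj[:, 0, :]   # (B, embed_dim), already L2-normalized
    txt = txt_feats.text_embeds_proj[:, 0, :]    # (B, embed_dim), already L2-normalized
    return img @ txt.t()                         # (B, B) cosine-similarity matrix
```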
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py
@@ -0,0 +1,185 @@
+"""
+Copyright (c) 2022, salesforce.com, inc.
+All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...common.registry import registry
+from ..med import XBertEncoder
+from ..vit import VisionTransformerEncoder
+from .blip import BlipBase
+
+
+@registry.register_model('blip_image_text_matching')
+class BlipITM(BlipBase):
+    """
+    BLIP Image-Text Matching (ITM) model.
+
+    Supported model types:
+        - base: fine-tuned BLIP retrieval weights on COCO dataset (Karpathy split).
+        - large: fine-tuned BLIP retrieval weights on COCO dataset (Karpathy split).
+
+    Usage:
+        >>> from lavis.models import load_model
+        >>> model = load_model("blip_image_text_matching", "base")
+        >>> model = load_model("blip_image_text_matching", "large")
+    """
+
+    PRETRAINED_MODEL_CONFIG_DICT = {
+        'base': 'configs/models/blip_itm_base.yaml',
+        'large': 'configs/models/blip_itm_large.yaml',
+    }
+
+    def __init__(self, image_encoder, text_encoder, embed_dim=256, max_txt_len=35):
+        super().__init__()
+
+        self.tokenizer = self.init_tokenizer()
+
+        self.text_encoder = text_encoder
+
+        self.visual_encoder = image_encoder
+
+        self.max_txt_len = max_txt_len
+
+        # creating projection layers for ITC
+        text_width = text_encoder.config.hidden_size
+        vision_width = image_encoder.vision_width
+
+        self.vision_proj = nn.Linear(vision_width, embed_dim)
+        self.text_proj = nn.Linear(text_width, embed_dim)
+
+        self.itm_head = nn.Linear(text_width, 2)
+
+    def forward(self, samples, match_head='itm'):
+        image = samples['image']
+        caption = samples['text_input']
+
+        image_embeds = self.visual_encoder.forward_features(image)
+        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device)
+
+        text = self.tokenizer(
+            caption,
+            padding='longest',
+            truncation=True,
+            max_length=self.max_txt_len,
+            return_tensors='pt',
+        ).to(image.device)
+        if match_head == 'itm':
+            encoder_input_ids = text.input_ids.clone()
+            encoder_input_ids[:, 0] = self.tokenizer.enc_token_id # extra code
+            output = self.text_encoder(
+                encoder_input_ids,
+                attention_mask=text.attention_mask,
+                encoder_hidden_states=image_embeds,
+                encoder_attention_mask=image_atts,
+                return_dict=True,
+            )
+            itm_output = self.itm_head(output.last_hidden_state[:, 0, :])
+            return itm_output
+
+        elif match_head == 'itc':
+            text_output = self.text_encoder(
+                text.input_ids,
+                attention_mask=text.attention_mask,
+                return_dict=True,
+                mode='text',
+            )
+            image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
+            text_feat = F.normalize(self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1)
+
+            sim = image_feat @ text_feat.t()
+            return sim
+
+    def itm_rank(self, image_embeds, image_atts, encoder_input_ids, match_head='itm'):
+        # breakpoint()
+        encoder_input_ids = encoder_input_ids.clone()
+        encoder_input_ids = encoder_input_ids[:, 3:]
+        text_attention_mask = (encoder_input_ids != self.tokenizer.pad_token_id).long()
+
+        if match_head == 'itm':
+            # encoder_input_ids = encoder_input_ids.clone()
+            encoder_input_ids[:, 0] = self.tokenizer.enc_token_id
+            output = self.text_encoder(
+                encoder_input_ids,
+                attention_mask=text_attention_mask,
+                encoder_hidden_states=image_embeds,
+                encoder_attention_mask=image_atts,
+                return_dict=True,
+            )
+            # print(output.last_hidden_state.shape)
+            itm_output = self.itm_head(output.last_hidden_state[:, 0, :])
+            itm_output = F.softmax(itm_output, dim=1)[:, 1]
+            return itm_output #, mask, token_length
+
+        elif match_head == 'itc':
+            encoder_input_ids[:, 0] = self.tokenizer.cls_token_id
+            text_output = self.text_encoder(
+                encoder_input_ids, attention_mask=text_attention_mask, return_dict=True, mode='text')
+            image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
+            text_feat = F.normalize(self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1)
+
+            sim = image_feat @ text_feat.t()
+            return sim
+
+    @classmethod
+    def from_config(cls, cfg=None):
+        image_encoder = VisionTransformerEncoder.from_config(cfg)
+        text_encoder = XBertEncoder.from_config(cfg)
+
+        embed_dim = cfg.get('embed_dim', 256)
+        max_txt_len = cfg.get('max_txt_len', 35)
+
+        model = cls(
+            image_encoder=image_encoder,
+            text_encoder=text_encoder,
+            embed_dim=embed_dim,
+            max_txt_len=max_txt_len,
+        )
+
+        model.load_checkpoint_from_config(cfg)
+
+        return model
+
+
+def compute_gradcam(model, visual_input, text_input, tokenized_text, block_num=6):
+    model.text_encoder.base_model.base_model.encoder.layer[block_num].crossattention.self.save_attention = True
+
+    output = model({'image': visual_input, 'text_input': text_input}, match_head='itm')
+    loss = output[:, 1].sum()
+
+    model.zero_grad()
+    loss.backward()
+    with torch.no_grad():
+        mask = tokenized_text.attention_mask.view(tokenized_text.attention_mask.size(0), 1, -1, 1,
+                                                  1)  # (bsz,1,token_len, 1,1)
+        token_length = tokenized_text.attention_mask.sum(dim=-1) - 2
+        token_length = token_length.cpu()
+        # grads and cams [bsz, num_head, seq_len, image_patch]
+        grads = model.text_encoder.base_model.base_model.encoder.layer[
+            block_num].crossattention.self.get_attn_gradients()
+        cams = model.text_encoder.base_model.base_model.encoder.layer[block_num].crossattention.self.get_attention_map()

+
+        # assume using vit with 576 num image patch
+        cams = cams[:, :, :, 1:].reshape(visual_input.size(0), 12, -1, 24, 24) * mask
+        grads = (grads[:, :, :, 1:].clamp(0).reshape(visual_input.size(0), 12, -1, 24, 24) * mask)
+
+        gradcams = cams * grads
+        gradcam_list = []
+
+        for ind in range(visual_input.size(0)):
+            token_length_ = token_length[ind]
+            gradcam = gradcams[ind].mean(0).cpu().detach()
+            # [enc token gradcam, average gradcam across token, gradcam for individual token]
+            gradcam = torch.cat((
+                gradcam[0:1, :],
+                gradcam[1:token_length_ + 1, :].sum(dim=0, keepdim=True) / token_length_,
+                gradcam[1:, :],
+            ))
+            gradcam_list.append(gradcam)
+
+    return gradcam_list, output
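
The release ships no usage example for `BlipITM`; the hedged sketch below only exercises the `forward` signature visible in this hunk, and mirrors the softmax-over-logits step that `itm_rank` itself applies. Model and preprocessor loading are assumed, not taken from the diff.

```python
import torch.nn.functional as F

# Sketch only: scoring image-caption pairs with an already loaded BlipITM model.
def blip_itm_scores(model, image, captions):
    sample = {"image": image, "text_input": captions}
    itm_logits = model(sample, match_head="itm")    # (B, 2) logits from the binary ITM head
    itm_prob = F.softmax(itm_logits, dim=1)[:, 1]   # probability that each pair matches
    itc_sim = model(sample, match_head="itc")       # (B, B) image-text cosine similarities
    return itm_prob, itc_sim
```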
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py
@@ -0,0 +1,178 @@
+"""
+Copyright (c) 2022, salesforce.com, inc.
+All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import BertConfig
+
+from ...common.dist_utils import download_cached_file
+from ...common.registry import registry
+from ...common.utils import get_abs_path, is_url
+from ..base_model import MomentumDistilationMixin
+from ..vit import VisionTransformerEncoder, interpolate_pos_embed
+from .blip import BlipBase
+from .blip_outputs import BlipIntermediateOutput, BlipOutput
+from .nlvr_encoder import BertModel
+
+
+@registry.register_model('blip_nlvr')
+class BlipNLVR(BlipBase, MomentumDistilationMixin):
+    """
+    Class for BLIP NLVR model.
+
+    Supported model types:
+        - base: model with pre-trained BLIP weights, used as initialization for fine-tuning.
+        - nlvr: finetuned model on NLVR2 dataset.
+
+    Usage:
+        >>> from lavis.models import load_model
+        >>> model = load_model("blip_nlvr", "nlvr")
+    """
+
+    PRETRAINED_MODEL_CONFIG_DICT = {
+        'nlvr': 'configs/models/blip_nlvr.yaml',
+    }
+
+    def __init__(self, image_encoder, text_encoder, num_classes):
+        super().__init__()
+
+        self.tokenizer = self.init_tokenizer()
+        self.visual_encoder = image_encoder
+        self.text_encoder = text_encoder
+
+        hidden_size = text_encoder.config.hidden_size
+        self.cls_head = nn.Sequential(
+            nn.Linear(hidden_size, hidden_size),
+            nn.ReLU(),
+            nn.Linear(hidden_size, num_classes),
+        )
+
+    def forward(self, samples, is_train=True):
+        """
+        Forward function for training and evaluation.
+
+        Args:
+            samples (dict): a dict of input samples, which contains the following keys:
+                - image0 (torch.Tensor): input image 0, shape (batch_size, 3, H, W), default H=384, W=384.
+                - image1 (torch.Tensor): input image 1, shape (batch_size, 3, H, W), default H=384, W=384.
+                - text_input (list): list of strings, each string is a natural language sentence.
+                - label (torch.LongTensor): ground truth label with shape (batch_size,).
+            is_train (bool): whether the model is in training mode.
+                If True, the model will return the loss;
+                If False, the model will return the prediction.
+
+        Examples:
+            >>> import torch
+            >>> from lavis.models import load_model
+            >>> model = load_model("blip_nlvr", "nlvr")
+            >>> samples = {
+            ...     "image0": torch.randn(2, 3, 384, 384),
+            ...     "image1": torch.randn(2, 3, 384, 384),
+            ...     "text_input": ["there is a ferret in tall grass", "there are lips in one of the images"],
+            ...     "label": torch.tensor([0, 1]),
+            ... }
+            >>> output = model(samples)
+            >>> output.keys()
+            odict_keys(['intermediate_output', 'loss'])
+        """
+        text = samples['text_input']
+        text = self.tokenizer(text, padding='longest', return_tensors='pt').to(self.device)
+        text.input_ids[:, 0] = self.tokenizer.enc_token_id
+
+        targets = samples['label']
+
+        image0 = samples['image0']
+        image1 = samples['image1']
+        images = torch.cat([image0, image1], dim=0)
+
+        image_embeds = self.visual_encoder.forward_features(images)
+        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(self.device)
+        image0_embeds, image1_embeds = torch.split(image_embeds, targets.size(0))
+
+        encoder_output = self.text_encoder(
+            text.input_ids,
+            attention_mask=text.attention_mask,
+            encoder_hidden_states=[image0_embeds, image1_embeds],
+            encoder_attention_mask=[
+                image_atts[:image0_embeds.size(0)],
+                image_atts[image0_embeds.size(0):],
+            ],
+            return_dict=True,
+        )
+
+        prediction = self.cls_head(encoder_output.last_hidden_state[:, 0, :])
+
+        if is_train:
+            loss = F.cross_entropy(prediction, targets)
+            # return {"loss": loss}
+            return BlipOutput(
+                loss=loss,
+                intermediate_output=BlipIntermediateOutput(
+                    image_embeds=torch.stack([image0_embeds, image1_embeds], dim=0),
+                    encoder_output=encoder_output,
+                ),
+            )
+        else:
+            return {'predictions': prediction, 'targets': targets}
+
+    def predict(self, samples):
+        output = self.forward(samples, is_train=False)
+        return output
+
+    @classmethod
+    def from_config(cls, cfg=None):
+        image_encoder = VisionTransformerEncoder.from_config(cfg)
+
+        # text encoder + multimodal encoder
+        bert_config = BertConfig.from_json_file(get_abs_path(cfg['med_config_path']))
+        text_encoder = BertModel(config=bert_config, add_pooling_layer=False)
+
+        num_classes = cfg.get('num_classes', 3)
+
+        assert num_classes > 1, 'Invalid number of classes provided, found {}'.format(num_classes)
+
+        model = cls(
+            image_encoder=image_encoder,
+            text_encoder=text_encoder,
+            num_classes=num_classes,
+        )
+
+        model.load_checkpoint_from_config(cfg)
+
+        return model
+
+    def load_from_pretrained(self, url_or_filename):
+        if is_url(url_or_filename):
+            cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
+            checkpoint = torch.load(cached_file, map_location='cpu')
+        elif os.path.isfile(url_or_filename):
+            checkpoint = torch.load(url_or_filename, map_location='cpu')
+        else:
+            raise RuntimeError('checkpoint url or path is invalid')
+        state_dict = checkpoint['model']
+
+        state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],
+                                                                       self.visual_encoder)
+
+        for key in list(state_dict.keys()):
+            if 'crossattention.self.' in key:
+                new_key0 = key.replace('self', 'self0')
+                new_key1 = key.replace('self', 'self1')
+                state_dict[new_key0] = state_dict[key]
+                state_dict[new_key1] = state_dict[key]
+            elif 'crossattention.output.dense.' in key:
+                new_key0 = key.replace('dense', 'dense0')
+                new_key1 = key.replace('dense', 'dense1')
+                state_dict[new_key0] = state_dict[key]
+                state_dict[new_key1] = state_dict[key]
+
+        msg = self.load_state_dict(state_dict, strict=False)
+        print('load checkpoint from %s' % url_or_filename)
+        print(f'missing keys {msg.missing_keys}')
+        return msg
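
For completeness, a small hedged sketch of evaluation-mode use of the `predict` method defined above. The layout of `samples` follows the `forward` docstring; how batches are actually built is not shown in this diff and is assumed here.

```python
# Sketch only: NLVR2-style accuracy from BlipNLVR.predict output.
def nlvr_accuracy(model, samples):
    out = model.predict(samples)              # {'predictions': logits, 'targets': labels}
    pred = out["predictions"].argmax(dim=1)   # predicted class per image pair
    return (pred == out["targets"]).float().mean().item()
```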
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py
@@ -0,0 +1,112 @@
+"""
+Copyright (c) 2022, salesforce.com, inc.
+All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import torch
+from dataclasses import dataclass
+from transformers.modeling_outputs import (BaseModelOutputWithPoolingAndCrossAttentions,
+                                           CausalLMOutputWithCrossAttentions, ModelOutput)
+from typing import Optional
+
+
+@dataclass
+class BlipSimilarity(ModelOutput):
+    sim_i2t: torch.FloatTensor = None
+    sim_t2i: torch.FloatTensor = None
+
+    sim_i2t_m: Optional[torch.FloatTensor] = None
+    sim_t2i_m: Optional[torch.FloatTensor] = None
+
+    sim_i2t_targets: Optional[torch.FloatTensor] = None
+    sim_t2i_targets: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class BlipIntermediateOutput(ModelOutput):
+    """
+    Data class for intermediate outputs of BLIP models.
+
+    image_embeds (torch.FloatTensor): Image embeddings, shape (batch_size, num_patches, embed_dim).
+    text_embeds (torch.FloatTensor): Text embeddings, shape (batch_size, seq_len, embed_dim).
+
+    image_embeds_m (torch.FloatTensor): Image embeddings from momentum visual encoder, shape (batch_size, num_patches, embed_dim).
+    text_embeds_m (torch.FloatTensor): Text embeddings from momentum text encoder, shape (batch_size, seq_len, embed_dim).
+
+    encoder_output (BaseModelOutputWithPoolingAndCrossAttentions): output from the image-grounded text encoder.
+    encoder_output_neg (BaseModelOutputWithPoolingAndCrossAttentions): output from the image-grounded text encoder for negative pairs.
+
+    decoder_output (CausalLMOutputWithCrossAttentions): output from the image-grounded text decoder.
+    decoder_labels (torch.LongTensor): labels for the captioning loss.
+
+    itm_logits (torch.FloatTensor): logits for the image-text matching loss, shape (batch_size * 3, 2).
+    itm_labels (torch.LongTensor): labels for the image-text matching loss, shape (batch_size * 3,)
+
+    """
+
+    # uni-modal features
+    image_embeds: torch.FloatTensor = None
+    text_embeds: Optional[torch.FloatTensor] = None
+
+    image_embeds_m: Optional[torch.FloatTensor] = None
+    text_embeds_m: Optional[torch.FloatTensor] = None
+
+    # intermediate outputs of multimodal encoder
+    encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
+    encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
+
+    itm_logits: Optional[torch.FloatTensor] = None
+    itm_labels: Optional[torch.LongTensor] = None
+
+    # intermediate outputs of multimodal decoder
+    decoder_output: Optional[CausalLMOutputWithCrossAttentions] = None
+    decoder_labels: Optional[torch.LongTensor] = None
+
+
+@dataclass
+class BlipOutput(ModelOutput):
+    # some finetuned models (e.g. BlipVQA) do not compute similarity, thus optional.
+    sims: Optional[BlipSimilarity] = None
+
+    intermediate_output: BlipIntermediateOutput = None
+
+    loss: Optional[torch.FloatTensor] = None
+
+    loss_itc: Optional[torch.FloatTensor] = None
+
+    loss_itm: Optional[torch.FloatTensor] = None
+
+    loss_lm: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class BlipOutputWithLogits(BlipOutput):
+    logits: torch.FloatTensor = None
+    logits_m: torch.FloatTensor = None
+
+
+@dataclass
+class BlipOutputFeatures(ModelOutput):
+    """
+    Data class of features from BlipFeatureExtractor.
+
+    Args:
+        image_embeds: (torch.FloatTensor) of shape (batch_size, num_patches+1, embed_dim), optional
+        image_features: (torch.FloatTensor) of shape (batch_size, num_patches+1, feature_dim), optional
+        text_embeds: (torch.FloatTensor) of shape (batch_size, sequence_length+1, embed_dim), optional
+        text_features: (torch.FloatTensor) of shape (batch_size, sequence_length+1, feature_dim), optional
+
+    The first embedding or feature is for the [CLS] token.
+
+    Features are obtained by projecting the corresponding embedding into a normalized low-dimensional space.
+    """
+
+    image_embeds: Optional[torch.FloatTensor] = None
+    image_embeds_proj: Optional[torch.FloatTensor] = None
+
+    text_embeds: Optional[torch.FloatTensor] = None
+    text_embeds_proj: Optional[torch.FloatTensor] = None
+
+    multimodal_embeds: Optional[torch.FloatTensor] = None
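
These dataclasses all derive from `transformers.ModelOutput`, which is what produces the `odict_keys(...)` behaviour in the docstring examples earlier in this diff: fields left as `None` are dropped from `.keys()`, while populated fields are readable both as attributes and by key. A minimal sketch follows; the import path is inferred from the file listing above and the tensor shapes are illustrative assumptions.

```python
import torch
from evalscope.metrics.t2v_metrics.models.vqascore_models.lavis.models.blip_models.blip_outputs import \
    BlipOutputFeatures

feats = BlipOutputFeatures(
    image_embeds=torch.randn(1, 197, 768),       # ViT patch embeddings, [CLS] first
    image_embeds_proj=torch.randn(1, 197, 256),  # normalized projection of the above
)
print(feats.keys())                   # odict_keys(['image_embeds', 'image_embeds_proj'])
print(feats["image_embeds"].shape)    # torch.Size([1, 197, 768])
print(feats.image_embeds_proj.shape)  # torch.Size([1, 197, 256])
```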