evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (214)
  1. evalscope/arguments.py +2 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +4 -4
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/__init__.py +2 -2
  12. evalscope/benchmarks/aigc/__init__.py +0 -0
  13. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  14. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  15. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  16. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  17. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  18. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  19. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  20. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  21. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  22. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  23. evalscope/benchmarks/arc/arc_adapter.py +2 -2
  24. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  25. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  26. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  27. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  28. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  29. evalscope/benchmarks/data_adapter.py +21 -10
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  35. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
  36. evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
  37. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  38. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  39. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
  41. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  42. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  43. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  44. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  45. evalscope/benchmarks/utils.py +7 -16
  46. evalscope/cli/start_app.py +1 -1
  47. evalscope/collections/evaluator.py +20 -6
  48. evalscope/config.py +8 -4
  49. evalscope/constants.py +11 -0
  50. evalscope/evaluator/evaluator.py +2 -2
  51. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  52. evalscope/metrics/__init__.py +49 -4
  53. evalscope/metrics/llm_judge.py +1 -1
  54. evalscope/metrics/named_metrics.py +13 -0
  55. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  56. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  57. evalscope/metrics/t2v_metrics/constants.py +12 -0
  58. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  59. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  60. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  61. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  62. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  63. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  64. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  65. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  66. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  67. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  68. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  69. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  70. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  71. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  72. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  73. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  74. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  75. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  76. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  77. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  139. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  140. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  141. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  142. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  143. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  144. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  145. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  146. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  147. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  148. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  149. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  154. evalscope/metrics/t2v_metrics/score.py +78 -0
  155. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  156. evalscope/models/__init__.py +50 -14
  157. evalscope/models/adapters/__init__.py +17 -0
  158. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  159. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  160. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  161. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  162. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  163. evalscope/models/adapters/t2i_adapter.py +76 -0
  164. evalscope/models/custom/__init__.py +2 -1
  165. evalscope/models/custom/dummy_model.py +11 -13
  166. evalscope/models/local_model.py +82 -33
  167. evalscope/models/model.py +2 -42
  168. evalscope/models/register.py +26 -0
  169. evalscope/perf/arguments.py +24 -5
  170. evalscope/perf/benchmark.py +28 -42
  171. evalscope/perf/http_client.py +2 -3
  172. evalscope/perf/plugin/api/custom_api.py +1 -1
  173. evalscope/perf/plugin/api/openai_api.py +2 -2
  174. evalscope/perf/plugin/datasets/custom.py +4 -1
  175. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  176. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  177. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  178. evalscope/perf/plugin/datasets/openqa.py +4 -1
  179. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  180. evalscope/perf/utils/benchmark_util.py +14 -8
  181. evalscope/perf/utils/db_util.py +9 -3
  182. evalscope/perf/utils/log_utils.py +41 -0
  183. evalscope/report/__init__.py +1 -0
  184. evalscope/report/app.py +128 -78
  185. evalscope/report/app_arguments.py +11 -0
  186. evalscope/report/generator.py +1 -1
  187. evalscope/run.py +10 -3
  188. evalscope/summarizer.py +2 -1
  189. evalscope/third_party/thinkbench/eval.py +19 -7
  190. evalscope/utils/chat_service.py +2 -2
  191. evalscope/utils/import_utils.py +66 -0
  192. evalscope/utils/utils.py +48 -29
  193. evalscope/version.py +2 -2
  194. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
  195. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
  196. tests/aigc/__init__.py +1 -0
  197. tests/aigc/test_t2i.py +87 -0
  198. tests/cli/test_all.py +4 -4
  199. tests/cli/test_collection.py +2 -1
  200. tests/cli/test_run.py +19 -12
  201. tests/perf/test_perf.py +3 -3
  202. tests/rag/test_clip_benchmark.py +0 -1
  203. tests/rag/test_mteb.py +37 -8
  204. tests/rag/test_ragas.py +29 -26
  205. tests/vlm/test_vlmeval.py +37 -1
  206. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  207. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  208. evalscope/metrics/code_metric.py +0 -98
  209. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  210. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  211. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
  212. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
  213. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
  214. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py
@@ -0,0 +1,211 @@
+ """
+ Copyright (c) 2023, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ """
+ import contextlib
+ import datetime
+ import logging
+ import os
+ import time
+ import torch
+ import torch.distributed as dist
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from modelscope import snapshot_download
+ from transformers import BertTokenizer
+
+ from ...common import dist_utils
+ from ...common.dist_utils import download_cached_file
+ from ...common.logger import MetricLogger
+ from ...common.utils import is_url
+ from ..base_model import BaseModel
+ from ..clip_vit import create_clip_vit_L
+ from ..eva_vit import create_eva_vit_g
+ from .Qformer import BertConfig, BertLMHeadModel
+
+
+ class Blip2Base(BaseModel):
+
+     @classmethod
+     def init_tokenizer(cls, truncation_side='right'):
+         bert_path = snapshot_download('AI-ModelScope/bert-base-uncased')
+         tokenizer = BertTokenizer.from_pretrained(bert_path, truncation_side=truncation_side)
+         tokenizer.add_special_tokens({'bos_token': '[DEC]'})
+         return tokenizer
+
+     def maybe_autocast(self, dtype=torch.float16):
+         # if on cpu, don't use autocast
+         # if on gpu, use autocast with dtype if provided, otherwise use torch.float16
+         enable_autocast = self.device != torch.device('cpu')
+
+         if enable_autocast:
+             return torch.amp.autocast(device_type=self.device.type, dtype=dtype)
+         else:
+             return contextlib.nullcontext()
+
+     @classmethod
+     def init_Qformer(cls, num_query_token, vision_width, cross_attention_freq=2):
+         bert_path = snapshot_download('AI-ModelScope/bert-base-uncased')
+         encoder_config = BertConfig.from_pretrained(bert_path)
+         encoder_config.encoder_width = vision_width
+         encoder_config.vocab_size += 1  # add one for [DEC]
+         # insert cross-attention layer every other block
+         encoder_config.add_cross_attention = True
+         encoder_config.cross_attention_freq = cross_attention_freq
+         encoder_config.query_length = num_query_token
+         Qformer = BertLMHeadModel._from_config(encoder_config)
+         query_tokens = nn.Parameter(torch.zeros(1, num_query_token, encoder_config.hidden_size))
+         query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
+         return Qformer, query_tokens
+
+     @classmethod
+     def init_vision_encoder(cls, model_name, img_size, drop_path_rate, use_grad_checkpoint, precision):
+         assert model_name in [
+             'eva_clip_g',
+             'clip_L',
+         ], 'vit model must be eva_clip_g or clip_L'
+         if model_name == 'eva_clip_g':
+             visual_encoder = create_eva_vit_g(img_size, drop_path_rate, use_grad_checkpoint, precision)
+         elif model_name == 'clip_L':
+             visual_encoder = create_clip_vit_L(img_size, use_grad_checkpoint, precision)
+         ln_vision = LayerNorm(visual_encoder.num_features)
+         return visual_encoder, ln_vision
+
+     def load_from_pretrained(self, url_or_filename):
+         if is_url(url_or_filename):
+             cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
+             checkpoint = torch.load(cached_file, map_location='cpu')
+         elif os.path.isfile(url_or_filename):
+             checkpoint = torch.load(url_or_filename, map_location='cpu')
+         else:
+             raise RuntimeError('checkpoint url or path is invalid')
+
+         state_dict = checkpoint['model']
+
+         msg = self.load_state_dict(state_dict, strict=False)
+
+         # logging.info("Missing keys {}".format(msg.missing_keys))
+         logging.info('load checkpoint from %s' % url_or_filename)
+
+         return msg
+
+
+ def disabled_train(self, mode=True):
+     """Overwrite model.train with this function to make sure train/eval mode
+     does not change anymore."""
+     return self
+
+
+ class LayerNorm(nn.LayerNorm):
+     """Subclass torch's LayerNorm to handle fp16."""
+
+     def forward(self, x: torch.Tensor):
+         orig_type = x.dtype
+         ret = super().forward(x.type(torch.float32))
+         return ret.type(orig_type)
+
+
+ def compute_sim_matrix(model, data_loader, **kwargs):
+     k_test = kwargs.pop('k_test')
+
+     metric_logger = MetricLogger(delimiter=' ')
+     header = 'Evaluation:'
+
+     logging.info('Computing features for evaluation...')
+     start_time = time.time()
+
+     texts = data_loader.dataset.text
+     num_text = len(texts)
+     text_bs = 256
+     text_ids = []
+     text_embeds = []
+     text_atts = []
+     for i in range(0, num_text, text_bs):
+         text = texts[i:min(num_text, i + text_bs)]
+         text_input = model.tokenizer(
+             text,
+             padding='max_length',
+             truncation=True,
+             max_length=35,
+             return_tensors='pt',
+         ).to(model.device)
+         text_feat = model.forward_text(text_input)
+         text_embed = F.normalize(model.text_proj(text_feat))
+         text_embeds.append(text_embed)
+         text_ids.append(text_input.input_ids)
+         text_atts.append(text_input.attention_mask)
+
+     text_embeds = torch.cat(text_embeds, dim=0)
+     text_ids = torch.cat(text_ids, dim=0)
+     text_atts = torch.cat(text_atts, dim=0)
+
+     vit_feats = []
+     image_embeds = []
+     for samples in data_loader:
+         image = samples['image']
+
+         image = image.to(model.device)
+         image_feat, vit_feat = model.forward_image(image)
+         image_embed = model.vision_proj(image_feat)
+         image_embed = F.normalize(image_embed, dim=-1)
+
+         vit_feats.append(vit_feat.cpu())
+         image_embeds.append(image_embed)
+
+     vit_feats = torch.cat(vit_feats, dim=0)
+     image_embeds = torch.cat(image_embeds, dim=0)
+
+     sims_matrix = []
+     for image_embed in image_embeds:
+         sim_q2t = image_embed @ text_embeds.t()
+         sim_i2t, _ = sim_q2t.max(0)
+         sims_matrix.append(sim_i2t)
+     sims_matrix = torch.stack(sims_matrix, dim=0)
+
+     score_matrix_i2t = torch.full((len(data_loader.dataset.image), len(texts)), -100.0).to(model.device)
+
+     num_tasks = dist_utils.get_world_size()
+     rank = dist_utils.get_rank()
+     step = sims_matrix.size(0) // num_tasks + 1
+     start = rank * step
+     end = min(sims_matrix.size(0), start + step)
+
+     for i, sims in enumerate(metric_logger.log_every(sims_matrix[start:end], 50, header)):
+         topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
+         image_inputs = vit_feats[start + i].repeat(k_test, 1, 1).to(model.device)
+         score = model.compute_itm(
+             image_inputs=image_inputs,
+             text_ids=text_ids[topk_idx],
+             text_atts=text_atts[topk_idx],
+         ).float()
+         score_matrix_i2t[start + i, topk_idx] = score + topk_sim
+
+     sims_matrix = sims_matrix.t()
+     score_matrix_t2i = torch.full((len(texts), len(data_loader.dataset.image)), -100.0).to(model.device)
+
+     step = sims_matrix.size(0) // num_tasks + 1
+     start = rank * step
+     end = min(sims_matrix.size(0), start + step)
+
+     for i, sims in enumerate(metric_logger.log_every(sims_matrix[start:end], 50, header)):
+         topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
+         image_inputs = vit_feats[topk_idx.cpu()].to(model.device)
+         score = model.compute_itm(
+             image_inputs=image_inputs,
+             text_ids=text_ids[start + i].repeat(k_test, 1),
+             text_atts=text_atts[start + i].repeat(k_test, 1),
+         ).float()
+         score_matrix_t2i[start + i, topk_idx] = score + topk_sim
+
+     if dist_utils.is_dist_avail_and_initialized():
+         dist.barrier()
+         torch.distributed.all_reduce(score_matrix_i2t, op=torch.distributed.ReduceOp.SUM)
+         torch.distributed.all_reduce(score_matrix_t2i, op=torch.distributed.ReduceOp.SUM)
+
+     total_time = time.time() - start_time
+     total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+     logging.info('Evaluation time {}'.format(total_time_str))
+
+     return score_matrix_i2t.cpu().numpy(), score_matrix_t2i.cpu().numpy()
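A note on the two-stage retrieval in `compute_sim_matrix` above: cheap ITC similarities select `k_test` candidates per row, the heavier ITM head rescores only those candidates, and the rows of `sims_matrix` are sharded across ranks before an all_reduce merges the partial score matrices. A minimal standalone sketch of the row-sharding arithmetic (illustrative values; not part of evalscope):

    # Row sharding as in compute_sim_matrix: step = n // world_size + 1
    # gives each rank a contiguous slice; the last rank takes the remainder.
    n_rows, world_size = 10, 3
    step = n_rows // world_size + 1  # = 4
    for rank in range(world_size):
        start = rank * step
        end = min(n_rows, start + step)
        print(f'rank {rank}: rows [{start}, {end})')
    # rank 0: rows [0, 4)
    # rank 1: rows [4, 8)
    # rank 2: rows [8, 10)

Since each row is written by exactly one rank while the others keep the -100.0 fill, the SUM all_reduce shifts every entry of a row by the same constant, so the merged matrices rank candidates exactly as the owning rank scored them.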
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py
@@ -0,0 +1,109 @@
+ """
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ """
+
+ import torch
+ import torch.nn.functional as F
+
+ from ...common.registry import registry
+ from .blip2_qformer import Blip2Qformer
+
+
+ @registry.register_model('blip2_image_text_matching')
+ class Blip2ITM(Blip2Qformer):
+     """
+     BLIP Image-Text Matching (ITM) model.
+     Supported model types:
+         - pretrained: pretrained model
+         - coco: finetuned model on coco
+     Usage:
+         >>> from lavis.models import load_model
+         >>> model = load_model("blip2_image_text_matching", "pretrained")
+         >>> model = load_model("blip2_image_text_matching", "coco")
+     """
+
+     def __init__(
+         self,
+         vit_model='eva_clip_g',
+         img_size=224,
+         drop_path_rate=0,
+         use_grad_checkpoint=False,
+         vit_precision='fp16',
+         freeze_vit=True,
+         num_query_token=32,
+         cross_attention_freq=2,
+         embed_dim=256,
+         max_txt_len=32,
+     ):
+         super().__init__(
+             vit_model=vit_model,
+             img_size=img_size,
+             drop_path_rate=drop_path_rate,
+             use_grad_checkpoint=use_grad_checkpoint,
+             vit_precision=vit_precision,
+             freeze_vit=freeze_vit,
+             num_query_token=num_query_token,
+             cross_attention_freq=cross_attention_freq,
+             embed_dim=embed_dim,
+             max_txt_len=max_txt_len,
+         )
+
+     def forward(self, samples, match_head='itm'):
+         image = samples['image']
+         caption = samples['text_input']
+
+         with self.maybe_autocast():
+             image_embeds = self.ln_vision(self.visual_encoder(image))
+         image_embeds = image_embeds.float()
+         image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device)
+
+         text = self.tokenizer(
+             caption,
+             truncation=True,
+             max_length=self.max_txt_len,
+             return_tensors='pt',
+         ).to(image.device)
+
+         if match_head == 'itm':
+             query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+             query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device)
+             attention_mask = torch.cat([query_atts, text.attention_mask], dim=1)
+             output_itm = self.Qformer.bert(
+                 text.input_ids,
+                 query_embeds=query_tokens,
+                 attention_mask=attention_mask,
+                 encoder_hidden_states=image_embeds,
+                 encoder_attention_mask=image_atts,
+                 return_dict=True,
+             )
+             itm_embeddings = output_itm.last_hidden_state[:, :query_tokens.size(1), :]
+             itm_logit = self.itm_head(itm_embeddings)
+             itm_logit = itm_logit.mean(dim=1)
+
+             return itm_logit
+
+         elif match_head == 'itc':
+             query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+
+             query_output = self.Qformer.bert(
+                 query_embeds=query_tokens,
+                 encoder_hidden_states=image_embeds,
+                 encoder_attention_mask=image_atts,
+                 return_dict=True,
+             )
+             image_feats = F.normalize(self.vision_proj(query_output.last_hidden_state), dim=-1)
+
+             text_output = self.Qformer.bert(
+                 text.input_ids,
+                 attention_mask=text.attention_mask,
+                 return_dict=True,
+             )
+             text_feat = F.normalize(self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1)
+
+             sims = torch.bmm(image_feats, text_feat.unsqueeze(-1))
+             sim, _ = torch.max(sims, dim=1)
+
+             return sim
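For intuition, the `itc` branch above scores the caption against each learned query token and keeps the single best match. A self-contained sketch of that reduction with dummy tensors (shapes mirror the defaults above: 32 query tokens, 256-dim embeddings; illustrative only, not evalscope API):

    import torch
    import torch.nn.functional as F

    batch, num_query, dim = 2, 32, 256
    # Stand-ins for image_feats (one normalized embedding per query token)
    # and text_feat (one normalized [CLS]-derived embedding per caption).
    image_feats = F.normalize(torch.randn(batch, num_query, dim), dim=-1)
    text_feat = F.normalize(torch.randn(batch, dim), dim=-1)

    # sims[b, q, 0] is the cosine similarity between query token q and caption b;
    # the max over dim=1 keeps the best-matching query token, as in forward().
    sims = torch.bmm(image_feats, text_feat.unsqueeze(-1))
    sim, _ = torch.max(sims, dim=1)
    print(sim.squeeze(-1).shape)  # torch.Size([2]): one ITC score per pair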