evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.

Files changed (181)
  1. evalscope/arguments.py +2 -1
  2. evalscope/benchmarks/__init__.py +2 -2
  3. evalscope/benchmarks/aigc/__init__.py +0 -0
  4. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  5. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  6. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  7. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  8. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  9. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  10. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  11. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  12. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  13. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  14. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  16. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  18. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  20. evalscope/benchmarks/data_adapter.py +16 -9
  21. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  22. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  23. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
  24. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  25. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  26. evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
  27. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
  29. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  30. evalscope/benchmarks/utils.py +7 -16
  31. evalscope/cli/start_app.py +1 -1
  32. evalscope/collections/evaluator.py +16 -4
  33. evalscope/config.py +7 -3
  34. evalscope/constants.py +11 -0
  35. evalscope/evaluator/evaluator.py +9 -3
  36. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  37. evalscope/metrics/__init__.py +49 -4
  38. evalscope/metrics/llm_judge.py +1 -1
  39. evalscope/metrics/named_metrics.py +13 -0
  40. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  41. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  42. evalscope/metrics/t2v_metrics/constants.py +12 -0
  43. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  44. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  45. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  46. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  47. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  48. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  49. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  50. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  51. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  52. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  53. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  54. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  55. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  56. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  57. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  58. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  59. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  60. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  61. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  62. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  63. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  64. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  65. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  66. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  67. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  68. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  69. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  70. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  71. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  72. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  73. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  74. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  75. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  76. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  77. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  139. evalscope/metrics/t2v_metrics/score.py +78 -0
  140. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  141. evalscope/models/__init__.py +50 -14
  142. evalscope/models/adapters/__init__.py +17 -0
  143. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  144. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  145. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  146. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  147. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  148. evalscope/models/adapters/t2i_adapter.py +76 -0
  149. evalscope/models/custom/__init__.py +2 -1
  150. evalscope/models/custom/dummy_model.py +11 -13
  151. evalscope/models/local_model.py +82 -33
  152. evalscope/models/model.py +2 -42
  153. evalscope/models/register.py +26 -0
  154. evalscope/perf/benchmark.py +4 -3
  155. evalscope/perf/main.py +4 -2
  156. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  157. evalscope/perf/utils/benchmark_util.py +2 -2
  158. evalscope/perf/utils/db_util.py +16 -8
  159. evalscope/report/__init__.py +1 -0
  160. evalscope/report/app.py +117 -67
  161. evalscope/report/app_arguments.py +11 -0
  162. evalscope/report/generator.py +1 -1
  163. evalscope/run.py +3 -3
  164. evalscope/third_party/thinkbench/eval.py +19 -7
  165. evalscope/utils/chat_service.py +2 -2
  166. evalscope/utils/import_utils.py +66 -0
  167. evalscope/utils/utils.py +12 -4
  168. evalscope/version.py +2 -2
  169. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
  170. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
  171. tests/aigc/__init__.py +1 -0
  172. tests/aigc/test_t2i.py +87 -0
  173. tests/cli/test_run.py +20 -7
  174. tests/perf/test_perf.py +6 -3
  175. evalscope/metrics/code_metric.py +0 -98
  176. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  177. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  178. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
  179. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
  180. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
  181. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,45 @@
+ import numpy as np
+ import os
+ import torch
+ from abc import ABC, abstractmethod
+ from PIL import Image
+ from typing import List
+
+ from ..constants import CACHE_DIR
+
+
+ def image_loader(image_path):
+     if image_path.split('.')[-1] == 'npy':
+         return Image.fromarray(np.load(image_path)[:, :, [2, 1, 0]], 'RGB')
+     else:
+         return Image.open(image_path).convert('RGB')
+
+
+ class ScoreModel(ABC):
+
+     def __init__(self, model_name='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR):
+         self.model_name = model_name
+         self.device = device
+         self.cache_dir = cache_dir
+         if not os.path.exists(self.cache_dir):
+             os.makedirs(self.cache_dir)
+         self.image_loader = image_loader
+         self.load_model()
+
+     @abstractmethod
+     def load_model(self):
+         """Load the model, tokenizer, and etc.
+         """
+         pass
+
+     @abstractmethod
+     def load_images(self, image: List[str]) -> torch.Tensor:
+         """Load the image(s), and return a tensor (after preprocessing) put on self.device
+         """
+         pass
+
+     @abstractmethod
+     def forward(self, images: List[str], texts: List[str], **kwargs) -> torch.Tensor:
+         """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
+         """
+         pass
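
Judging by the files-changed list, this hunk is the new abstract base for the text-to-image score models (likely evalscope/metrics/t2v_metrics/models/model.py, +45). Below is a minimal, hedged sketch of how a concrete subclass would satisfy the three abstract methods; DummyScore, its tensor shapes, and the import path are illustrative assumptions, not part of the package.

    import torch
    from typing import List

    # Import path inferred from the files-changed list above.
    from evalscope.metrics.t2v_metrics.models.model import ScoreModel


    class DummyScore(ScoreModel):
        """Hypothetical subclass showing how the ScoreModel contract is meant to be filled in."""

        def load_model(self):
            # A real adapter would load weights and a tokenizer/processor onto self.device here.
            self.model = None

        def load_images(self, image: List[str]) -> torch.Tensor:
            # Decode with the shared image_loader, then preprocess; zeros stand in for real pixels.
            pils = [self.image_loader(p) for p in image]
            return torch.zeros(len(pils), 3, 224, 224, device=self.device)

        def forward(self, images: List[str], texts: List[str], **kwargs) -> torch.Tensor:
            # One score per (image, text) pair, as required by the base class docstring.
            batch = self.load_images(images)
            return torch.zeros(batch.shape[0], device=self.device)


    scorer = DummyScore(model_name='dummy', device='cpu', cache_dir='./cache')
    # scorer.forward(['path/to/image.jpg'], ['a photo of a cat'])  -> tensor with one score
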
@@ -0,0 +1,25 @@
+ import os
+ from modelscope import snapshot_download
+
+
+ def download_open_clip_model(model_name, tag, cache_dir):
+     import open_clip
+
+     # get pretrained config
+     pretrained_cfg = open_clip.get_pretrained_cfg(model_name, tag)
+     model_hub = pretrained_cfg.get('hf_hub').strip('/')
+     # load model from modelscope
+     model_weight_name = 'open_clip_model.safetensors'
+     local_path = snapshot_download(model_id=model_hub, cache_dir=cache_dir, allow_patterns=model_weight_name)
+     model_file_path = os.path.join(local_path, model_weight_name)
+
+     return model_file_path
+
+
+ def download_file(model_id, file_name=None, cache_dir=None):
+     # download file from modelscope
+     local_path = snapshot_download(model_id=model_id, cache_dir=cache_dir, allow_patterns=file_name)
+     if file_name is None:
+         return local_path
+     else:
+         return os.path.join(local_path, file_name)
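
This hunk appears to match evalscope/metrics/t2v_metrics/models/utils.py (+25) in the file list: thin wrappers around modelscope's snapshot_download. A hedged usage sketch follows; the repository ID mirrors the 'openai' to 'openai-mirror' rewrite used by clip_encoder.py later in this diff and is not guaranteed to be the ID the package resolves at runtime.

    # Sketch only; requires the modelscope package and network access.
    from evalscope.metrics.t2v_metrics.models.utils import download_file  # path inferred from the file list

    # file_name=None returns the snapshot directory for the whole repo...
    local_dir = download_file('openai-mirror/clip-vit-large-patch14-336')
    # ...while a concrete file_name returns the path to that single file.
    config_path = download_file('openai-mirror/clip-vit-large-patch14-336', file_name='config.json')
    print(local_dir, config_path)
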
@@ -0,0 +1,22 @@
+ from ...constants import CACHE_DIR
+ from .clip_t5_model import CLIP_T5_MODELS, CLIPT5Model
+ from .gpt4v_model import GPT4V_MODELS, GPT4VModel
+
+ ALL_VQA_MODELS = [
+     CLIP_T5_MODELS,
+     GPT4V_MODELS,
+ ]
+
+
+ def list_all_vqascore_models():
+     return [model for models in ALL_VQA_MODELS for model in models]
+
+
+ def get_vqascore_model(model_name, device='cuda', cache_dir=CACHE_DIR, **kwargs):
+     assert model_name in list_all_vqascore_models()
+     if model_name in CLIP_T5_MODELS:
+         return CLIPT5Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
+     elif model_name in GPT4V_MODELS:
+         return GPT4VModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
+     else:
+         raise NotImplementedError()
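
This registry (apparently vqascore_models/__init__.py, +22 per the file list) dispatches a model name to either the CLIP-T5 or the GPT-4V backend. A hedged usage sketch; 'clip-flant5-xxl' is assumed to be one of the names in CLIP_T5_MODELS (it is the default model_name of ScoreModel earlier in this diff), and instantiating it downloads a large checkpoint.

    # Sketch only; loading a backend pulls weights and device='cuda' needs a GPU.
    from evalscope.metrics.t2v_metrics.models.vqascore_models import (  # path inferred from the file list
        get_vqascore_model,
        list_all_vqascore_models,
    )

    print(list_all_vqascore_models())                             # all registered VQAScore backends
    model = get_vqascore_model('clip-flant5-xxl', device='cuda')  # dispatches to CLIPT5Model
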
@@ -0,0 +1 @@
+ from .language_model.clip_t5 import CLIPT5Config, CLIPT5ForConditionalGeneration, ModelArguments
@@ -0,0 +1,300 @@
+ # Copyright 2023 Zhiqiu Lin
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import torch
+ from dataclasses import dataclass, field
+ from transformers import AutoConfig, AutoModelForSeq2SeqLM, T5Config, T5ForConditionalGeneration
+ from transformers.modeling_outputs import Seq2SeqLMOutput
+ from typing import List, Optional, Tuple, Union
+
+ from ..multimodal_encoder.builder import build_vision_tower
+ from ..multimodal_projector.builder import build_vision_projector
+
+ IMAGE_TOKEN_INDEX = -200
+
+
+ @dataclass
+ class ModelArguments:
+     tune_mm_mlp_adapter: bool = field(default=False)
+     vision_tower: Optional[str] = field(default='openai/clip-vit-large-patch14-336')
+     mm_vision_select_layer: Optional[int] = field(default=-2)  # default to the second last layer in llava1.5
+     pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
+     mm_projector_type: Optional[str] = field(default='mlp2x_gelu')
+     mm_vision_select_feature: Optional[str] = field(default='patch')
+
+
+ class CLIPT5Config(T5Config):
+     model_type = 'clip_t5'
+
+
+ class CLIPT5ForConditionalGeneration(T5ForConditionalGeneration):
+     # This class supports both T5 and FlanT5
+     config_class = CLIPT5Config
+
+     def __init__(self, config):
+         super(CLIPT5ForConditionalGeneration, self).__init__(config)
+         self.embed_tokens = self.encoder.embed_tokens
+         if hasattr(config, 'mm_vision_tower'):
+             self.vision_tower = build_vision_tower(config, delay_load=False)
+             self.mm_projector = build_vision_projector(config)
+
+     def get_vision_tower(self):
+         vision_tower = getattr(self, 'vision_tower', None)
+         if type(vision_tower) is list:
+             vision_tower = vision_tower[0]
+         return vision_tower
+
+     def get_model(self):
+         return self  # for compatibility with LlavaMetaForCausalLM
+
+     def prepare_inputs_labels_for_multimodal(self, input_ids, attention_mask, decoder_attention_mask, past_key_values,
+                                              labels, images):
+         # The labels are now separated from the input_ids.
+         vision_tower = self.get_vision_tower()
+         if vision_tower is None or images is None or input_ids.shape[1] == 1:
+             raise NotImplementedError()
+
+         if type(images) is list or images.ndim == 5:
+             concat_images = torch.cat([image for image in images], dim=0)
+             image_features = self.encode_images(concat_images)
+             split_sizes = [image.shape[0] for image in images]
+             image_features = torch.split(image_features, split_sizes, dim=0)
+             image_features = [x.flatten(0, 1) for x in image_features]
+         else:
+             image_features = self.encode_images(images)
+
+         new_input_embeds = []
+         cur_image_idx = 0
+         for _, cur_input_ids in enumerate(input_ids):
+             if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
+                 # multimodal LLM, but the current sample is not multimodal
+                 raise NotImplementedError()
+             image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
+             cur_new_input_embeds = []
+             while image_token_indices.numel() > 0:
+                 cur_image_features = image_features[cur_image_idx]
+                 image_token_start = image_token_indices[0]
+                 cur_new_input_embeds.append(self.embed_tokens(cur_input_ids[:image_token_start]))
+                 cur_new_input_embeds.append(cur_image_features)
+                 cur_image_idx += 1
+                 cur_input_ids = cur_input_ids[image_token_start + 1:]
+                 image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
+             if cur_input_ids.numel() > 0:
+                 cur_new_input_embeds.append(self.embed_tokens(cur_input_ids))
+             cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
+             cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
+             new_input_embeds.append(cur_new_input_embeds)
+
+         if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
+             max_len = max(x.shape[0] for x in new_input_embeds)
+
+             new_input_embeds_align = []
+             _input_embeds_lengths = []
+             for cur_new_embed in new_input_embeds:
+                 _input_embeds_lengths.append(cur_new_embed.shape[0])
+                 cur_new_embed = torch.cat((cur_new_embed,
+                                            torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]),
+                                                        dtype=cur_new_embed.dtype,
+                                                        device=cur_new_embed.device)),
+                                           dim=0)
+                 new_input_embeds_align.append(cur_new_embed)
+             new_input_embeds = torch.stack(new_input_embeds_align, dim=0)
+
+             if attention_mask is not None:
+                 new_attention_mask = []
+                 for cur_attention_mask, _input_embeds_length in zip(attention_mask, _input_embeds_lengths):
+                     new_attn_mask_pad_left = torch.full((_input_embeds_length - input_ids.shape[1], ),
+                                                         True,
+                                                         dtype=attention_mask.dtype,
+                                                         device=attention_mask.device)
+                     new_attn_mask_pad_right = torch.full((new_input_embeds.shape[1] - _input_embeds_length, ),
+                                                          False,
+                                                          dtype=attention_mask.dtype,
+                                                          device=attention_mask.device)
+                     cur_new_attention_mask = torch.cat(
+                         (new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
+                     new_attention_mask.append(cur_new_attention_mask)
+                 attention_mask = torch.stack(new_attention_mask, dim=0)
+                 assert attention_mask.shape == new_input_embeds.shape[:2]
+         else:
+             new_input_embeds = torch.stack(new_input_embeds, dim=0)
+
+             if attention_mask is not None:
+                 new_attn_mask_pad_left = torch.full(
+                     (attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]),
+                     True,
+                     dtype=attention_mask.dtype,
+                     device=attention_mask.device)
+                 attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
+                 assert attention_mask.shape == new_input_embeds.shape[:2]
+
+         return None, attention_mask, decoder_attention_mask, past_key_values, new_input_embeds, labels
+
+     def encode_images(self, images):
+         image_features = self.get_vision_tower()(images)
+         image_features = self.mm_projector(image_features)
+         return image_features
+
+     def initialize_vision_modules(self, model_args, fsdp=None):
+         vision_tower = model_args.vision_tower
+         mm_vision_select_layer = model_args.mm_vision_select_layer
+         mm_vision_select_feature = model_args.mm_vision_select_feature
+         pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+
+         self.config.mm_vision_tower = vision_tower
+         self.config.pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter
+
+         if self.get_vision_tower() is None:
+             vision_tower = build_vision_tower(model_args)
+
+             if fsdp is not None and len(fsdp) > 0:
+                 self.vision_tower = [vision_tower]
+             else:
+                 self.vision_tower = vision_tower
+         else:
+             if fsdp is not None and len(fsdp) > 0:
+                 vision_tower = self.vision_tower[0]
+             else:
+                 vision_tower = self.vision_tower
+             if not vision_tower.is_loaded:
+                 vision_tower.load_model()
+
+         self.config.use_mm_proj = True
+         self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'mlp2x_gelu')
+         self.config.mm_hidden_size = vision_tower.hidden_size
+         self.config.mm_vision_select_layer = mm_vision_select_layer
+         self.config.mm_vision_select_feature = mm_vision_select_feature
+
+         if getattr(self, 'mm_projector', None) is None:
+             self.mm_projector = build_vision_projector(self.config)
+
+         if pretrain_mm_mlp_adapter is not None:
+             mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+
+             def get_w(weights, keyword):
+                 return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
+
+             self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         decoder_attention_mask: Optional[torch.Tensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         images: Optional[torch.FloatTensor] = None,
+         return_dict: Optional[bool] = None,
+         **kwargs,
+     ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if inputs_embeds is None:
+             _, attention_mask, decoder_attention_mask, past_key_values, inputs_embeds, labels = \
+                 self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, decoder_attention_mask, past_key_values, labels, images)
+
+         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = super(CLIPT5ForConditionalGeneration, self).forward(
+             input_ids=None,  # will be None if inputs_embeds is not None
+             attention_mask=attention_mask,
+             decoder_attention_mask=decoder_attention_mask,
+             labels=labels,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             **kwargs,
+         )
+
+         return outputs
+
+     @torch.no_grad()
+     def generate(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         **kwargs,
+     ):
+         assert images is not None, 'images must be provided'
+         assert inputs is not None, 'inputs must be provided'
+         assert attention_mask is not None, 'attention_mask must be provided'
+         _, attention_mask, _, _, inputs_embeds, _ = \
+             self.prepare_inputs_labels_for_multimodal(inputs, attention_mask, None, None, None, images)
+         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = super(CLIPT5ForConditionalGeneration, self).generate(
+             input_ids=None,  # will be None if inputs_embeds is not None
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+         )
+         return outputs
+
+     def prepare_inputs_for_generation(
+         self,
+         input_ids,
+         past_key_values=None,
+         attention_mask=None,
+         head_mask=None,
+         decoder_head_mask=None,
+         decoder_attention_mask=None,
+         cross_attn_head_mask=None,
+         use_cache=None,
+         encoder_outputs=None,
+         inputs_embeds=None,
+         **kwargs,
+     ):
+         # cut decoder_input_ids if past_key_values is used
+         if past_key_values is not None:
+             past_length = past_key_values[0][0].shape[2]
+
+             # Some generation methods already pass only the last input ID
+             if input_ids.shape[1] > past_length:
+                 remove_prefix_length = past_length
+             else:
+                 # Default to old behavior: keep only final ID
+                 remove_prefix_length = input_ids.shape[1] - 1
+
+             input_ids = input_ids[:, remove_prefix_length:]
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None and past_key_values is None:
+             model_inputs = {'inputs_embeds': inputs_embeds}
+         else:
+             model_inputs = {'input_ids': input_ids}
+
+         model_inputs.update({
+             'decoder_input_ids': input_ids,
+             'past_key_values': past_key_values,
+             'encoder_outputs': encoder_outputs,
+             'attention_mask': attention_mask,
+             'head_mask': head_mask,
+             'decoder_head_mask': decoder_head_mask,
+             'decoder_attention_mask': decoder_attention_mask,
+             'cross_attn_head_mask': cross_attn_head_mask,
+             'use_cache': use_cache,
+         })
+         return model_inputs
+
+
+ AutoConfig.register('clip_t5', CLIPT5Config)
+ AutoModelForSeq2SeqLM.register(CLIPT5Config, CLIPT5ForConditionalGeneration)
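
The two register() calls at the end hook the custom classes into transformers' Auto factories, so any config whose model_type is 'clip_t5' resolves to CLIPT5Config and CLIPT5ForConditionalGeneration. A hedged sketch of what that enables; the FlanT5-XL config is only an example donor, and the evalscope import path is inferred from the file list.

    # Sketch only; importing the module is what triggers the AutoConfig/AutoModelForSeq2SeqLM registration.
    from transformers import AutoModelForSeq2SeqLM

    from evalscope.metrics.t2v_metrics.models.vqascore_models.clip_t5.model import CLIPT5Config

    # Reuse a FlanT5 architecture config; model_type is forced to 'clip_t5' by CLIPT5Config.
    cfg = CLIPT5Config.from_pretrained('google/flan-t5-xl')
    model = AutoModelForSeq2SeqLM.from_config(cfg)   # dispatches to CLIPT5ForConditionalGeneration
    print(type(model).__name__)                      # CLIPT5ForConditionalGeneration
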
@@ -0,0 +1,12 @@
+ import os
+
+ from .clip_encoder import CLIPVisionTower
+
+
+ def build_vision_tower(vision_tower_cfg, **kwargs):
+     vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
+     is_absolute_path_exists = os.path.exists(vision_tower)
+     if is_absolute_path_exists or vision_tower.startswith('openai') or vision_tower.startswith('laion'):
+         return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+
+     raise ValueError(f'Unknown vision tower: {vision_tower}')
@@ -0,0 +1,82 @@
+ import torch
+ import torch.nn as nn
+ from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel
+
+
+ class CLIPVisionTower(nn.Module):
+
+     def __init__(self, vision_tower, args, delay_load=False):
+         super().__init__()
+
+         self.is_loaded = False
+
+         self.vision_tower_name = vision_tower
+         self.select_layer = args.mm_vision_select_layer
+         self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+         if not delay_load:
+             self.load_model()
+         else:
+             self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+
+     def load_model(self):
+         from .....utils import download_file
+         model_path = download_file(self.vision_tower_name.replace('openai', 'openai-mirror'))
+         self.image_processor = CLIPImageProcessor.from_pretrained(model_path)
+         self.vision_tower = CLIPVisionModel.from_pretrained(model_path)
+         self.vision_tower.requires_grad_(False)
+
+         self.is_loaded = True
+
+     def feature_select(self, image_forward_outs):
+         image_features = image_forward_outs.hidden_states[self.select_layer]
+         if self.select_feature == 'patch':
+             image_features = image_features[:, 1:]
+         elif self.select_feature == 'cls_patch':
+             image_features = image_features
+         else:
+             raise ValueError(f'Unexpected select feature: {self.select_feature}')
+         return image_features
+
+     @torch.no_grad()
+     def forward(self, images):
+         if type(images) is list:
+             image_features = []
+             for image in images:
+                 image_forward_out = self.vision_tower(
+                     image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+                 image_feature = self.feature_select(image_forward_out).to(image.dtype)
+                 image_features.append(image_feature)
+         else:
+             image_forward_outs = self.vision_tower(
+                 images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+             image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+         return image_features
+
+     @property
+     def dummy_feature(self):
+         return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+     @property
+     def dtype(self):
+         return self.vision_tower.dtype
+
+     @property
+     def device(self):
+         return self.vision_tower.device
+
+     @property
+     def config(self):
+         if self.is_loaded:
+             return self.vision_tower.config
+         else:
+             return self.cfg_only
+
+     @property
+     def hidden_size(self):
+         return self.config.hidden_size
+
+     @property
+     def num_patches(self):
+         return (self.config.image_size // self.config.patch_size)**2
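
CLIPVisionTower wraps a HuggingFace CLIPVisionModel and, with select_feature='patch', drops the CLS token, so a 336x336 ViT-L/14 image yields 576 patch features of width 1024. A hedged sketch of lazy construction (delay_load=True only fetches the vision config, not the weights); the SimpleNamespace stands in for the model-arguments object, and the import path is inferred from the file list.

    # Sketch only; fetching the config still requires network access to the hub.
    from types import SimpleNamespace

    from evalscope.metrics.t2v_metrics.models.vqascore_models.clip_t5.model.multimodal_encoder.clip_encoder import \
        CLIPVisionTower  # path inferred from the file list

    args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature='patch')
    tower = CLIPVisionTower('openai/clip-vit-large-patch14-336', args=args, delay_load=True)
    print(tower.hidden_size, tower.num_patches)   # expected: 1024 and (336 // 14) ** 2 == 576
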
@@ -0,0 +1,50 @@
+ import re
+ import torch
+ import torch.nn as nn
+
+
+ class IdentityMap(nn.Module):
+
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, x, *args, **kwargs):
+         return x
+
+     @property
+     def config(self):
+         return {'mm_projector_type': 'identity'}
+
+
+ class SimpleResBlock(nn.Module):
+
+     def __init__(self, channels):
+         super().__init__()
+         self.pre_norm = nn.LayerNorm(channels)
+
+         self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels))
+
+     def forward(self, x):
+         x = self.pre_norm(x)
+         return x + self.proj(x)
+
+
+ def build_vision_projector(config, delay_load=False, **kwargs):
+     projector_type = getattr(config, 'mm_projector_type', 'linear')
+
+     if projector_type == 'linear':
+         return nn.Linear(config.mm_hidden_size, config.hidden_size)
+
+     mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
+     if mlp_gelu_match:
+         mlp_depth = int(mlp_gelu_match.group(1))
+         modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+         for _ in range(1, mlp_depth):
+             modules.append(nn.GELU())
+             modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+         return nn.Sequential(*modules)
+
+     if projector_type == 'identity':
+         return IdentityMap()
+
+     raise ValueError(f'Unknown projector type: {projector_type}')
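
build_vision_projector maps vision features into the language model's embedding space; with the 'mlp2x_gelu' default set in initialize_vision_modules above, it builds Linear -> GELU -> Linear. A runnable sketch follows; mm_hidden_size=1024 matches CLIP ViT-L/14, hidden_size=2048 uses FlanT5-XL's d_model as an assumed example, and the import path is inferred from the file list.

    # Sketch; the config object only needs the attributes the builder actually reads.
    import torch
    from types import SimpleNamespace

    from evalscope.metrics.t2v_metrics.models.vqascore_models.clip_t5.model.multimodal_projector.builder import \
        build_vision_projector  # path inferred from the file list

    cfg = SimpleNamespace(mm_projector_type='mlp2x_gelu', mm_hidden_size=1024, hidden_size=2048)
    projector = build_vision_projector(cfg)      # Linear(1024, 2048) -> GELU -> Linear(2048, 2048)

    patch_features = torch.randn(1, 576, 1024)   # e.g. one image's patch features from CLIPVisionTower
    print(projector(patch_features).shape)       # torch.Size([1, 576, 2048])
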