evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/arguments.py +2 -1
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +16 -9
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +16 -4
- evalscope/config.py +7 -3
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +9 -3
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/benchmark.py +4 -3
- evalscope/perf/main.py +4 -2
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/utils/benchmark_util.py +2 -2
- evalscope/perf/utils/db_util.py +16 -8
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +117 -67
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +3 -3
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +12 -4
- evalscope/version.py +2 -2
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_run.py +20 -7
- tests/perf/test_perf.py +6 -3
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
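The headline change in 0.15.1 is text-to-image (AIGC) evaluation: new benchmark adapters under evalscope/benchmarks/aigc/t2i/ (EvalMuse, GenAI-Bench, HPDv2, TIFA, and a general t2i adapter), a vendored t2v_metrics scoring package under evalscope/metrics/, and a new t2i_adapter in the reorganized evalscope/models/adapters/. As a rough sketch of how one of these benchmarks might be driven through the existing TaskConfig/run_task entry points — the dataset name 'evalmuse' and the model id below are assumptions inferred from the new adapter filenames, not checked against the 0.15.1 documentation:

from evalscope.config import TaskConfig
from evalscope.run import run_task

# Hedged sketch: 'evalmuse' and the model id are guesses based on the adapter
# filenames listed above; consult the 0.15.1 docs for the real dataset names and
# any extra t2i-specific fields TaskConfig may require.
task_cfg = TaskConfig(
    model='stabilityai/stable-diffusion-xl-base-1.0',  # hypothetical local text-to-image model
    datasets=['evalmuse'],
    limit=5,  # score only a handful of prompts while experimenting
)
run_task(task_cfg)

Only three of the newly added files are expanded in the hunks below; judging by their line counts and contents, they correspond to evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py (+218), .../vqascore_models/gpt4v_model.py (+150), and .../vqascore_models/lavis/__init__.py (+26).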
@@ -0,0 +1,218 @@
+import torch
+from typing import List
+
+from ...constants import CACHE_DIR, CONTEXT_LEN, DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, SYSTEM_MSG
+from .clip_t5.model import CLIPT5ForConditionalGeneration, ModelArguments
+from .mm_utils import expand2square, load_pretrained_model, t5_tokenizer_image_token
+from .vqa_model import VQAScoreModel
+
+default_question_template = 'Does this figure show "{}"? Please answer yes or no.'
+default_answer_template = 'Yes'
+
+
+def format_question(question, conversation_style='plain'):
+    if conversation_style == 't5_plain':  # for 1st stage t5 model
+        question = DEFAULT_IMAGE_TOKEN + question
+    elif conversation_style == 't5_chat':  # for 2nd stage t5 model
+        question = SYSTEM_MSG + ' USER: ' + DEFAULT_IMAGE_TOKEN + '\n' + question + ' ASSISTANT: '
+    elif conversation_style == 't5_chat_no_system':  # for 2nd stage t5 model
+        question = 'USER: ' + DEFAULT_IMAGE_TOKEN + '\n' + question + ' ASSISTANT: '
+    elif conversation_style == 't5_chat_no_system_no_user':  # for 2nd stage t5 model
+        question = '' + DEFAULT_IMAGE_TOKEN + '\n' + question + ' : '
+    # elif conversation_style == 't5_chat_ood_system':  # for 2nd stage t5 model
+    #     question = SYSTEM_MSG + " HUMAN: " + DEFAULT_IMAGE_TOKEN + "\n" + question + " GPT: "
+    else:
+        raise NotImplementedError()
+    return question
+
+
+def format_answer(answer, conversation_style='plain'):
+    return answer
+
+
+CLIP_T5_MODELS = {
+    # We recommend using 'clip-flant5-xxl' for maximal performance.
+    # If you want to use a smaller model, we recommend using 'clip-flant5-xl'.
+    'clip-flant5-xxl': {
+        'tokenizer': {
+            'path': 'AI-ModelScope/clip-flant5-xxl',  # zhiqiulin/clip-flant5-xxl
+            'model_max_length': CONTEXT_LEN,
+        },
+        'model': {
+            'path': 'AI-ModelScope/clip-flant5-xxl',  # zhiqiulin/clip-flant5-xxl
+            'conversation': 't5_chat',
+            'image_aspect_ratio': 'pad',
+        },
+    },
+    'clip-flant5-xl': {
+        'tokenizer': {
+            'path': 'zhiqiulin/clip-flant5-xl',
+            'model_max_length': CONTEXT_LEN,
+        },
+        'model': {
+            'path': 'zhiqiulin/clip-flant5-xl',
+            'conversation': 't5_chat',
+            'image_aspect_ratio': 'pad',
+        },
+    },
+}
+
+
+class CLIPT5Model(VQAScoreModel):
+    """A wrapper for the CLIP-FlanT5 or CLIP-T5 models"""
+
+    def __init__(self, model_name='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR):
+        assert model_name in CLIP_T5_MODELS
+        super().__init__(model_name=model_name, device=device, cache_dir=cache_dir)
+
+    def load_model(self):
+        """Load the model, tokenizer, image transform
+        """
+        model_args = ModelArguments()
+        model_max_length = CLIP_T5_MODELS[self.model_name]['tokenizer']['model_max_length'] \
+            if 'model_max_length' in CLIP_T5_MODELS[self.model_name]['tokenizer'] else None
+        padding_side = CLIP_T5_MODELS[self.model_name]['tokenizer']['padding_side'] \
+            if 'padding_side' in CLIP_T5_MODELS[self.model_name]['tokenizer'] else None
+        mmprojector_repo = CLIP_T5_MODELS[self.model_name]['model']['mmprojector_repo'] \
+            if 'mmprojector_repo' in CLIP_T5_MODELS[self.model_name]['model'] else None
+        mmprojector_name = CLIP_T5_MODELS[self.model_name]['model']['mmprojector_name'] \
+            if 'mmprojector_name' in CLIP_T5_MODELS[self.model_name]['model'] else None
+
+        # default is 'pad'
+        # stage-1 models use 'square'
+        self.image_aspect_ratio = CLIP_T5_MODELS[self.model_name]['model']['image_aspect_ratio'] \
+            if 'image_aspect_ratio' in CLIP_T5_MODELS[self.model_name]['model'] else 'pad'
+
+        self.conversational_style = CLIP_T5_MODELS[self.model_name]['model']['conversation']
+
+        self.context_len = CONTEXT_LEN
+
+        self.tokenizer, self.model, self.image_processor = load_pretrained_model(
+            CLIPT5ForConditionalGeneration,
+            model_args,
+            model_path=CLIP_T5_MODELS[self.model_name]['model']['path'],
+            tokenizer_path=CLIP_T5_MODELS[self.model_name]['tokenizer']['path'],
+            model_max_length=model_max_length,
+            padding_side=padding_side,
+            image_aspect_ratio=self.image_aspect_ratio,
+            mmprojector_repo=mmprojector_repo,
+            mmprojector_name=mmprojector_name,
+            device=self.device,
+            cache_dir=self.cache_dir)
+
+    def load_images(self, image: List[str]) -> torch.Tensor:
+        """Load the image(s), and return a tensor (after preprocessing) put on self.device
+        """
+        image = [self.image_loader(x) for x in image]
+        if self.image_aspect_ratio == 'pad':
+            image = [
+                expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean)) for image in image
+            ]
+        image = [self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] for image in image]
+        assert all(x.shape == image[0].shape for x in image)
+        image = torch.stack(image, dim=0).to(self.device)
+        return image
+
+    @torch.no_grad()
+    @torch.autocast(device_type='cuda', dtype=torch.bfloat16)
+    def forward(self,
+                images: List[str],
+                texts: List[str],
+                question_template: str = default_question_template,
+                answer_template: str = default_answer_template) -> torch.Tensor:
+        """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
+        """
+        assert len(images) == len(texts), 'Number of images and texts must match'
+        # Turn "a photo of a dog" into
+        # Q: "Does this figure show "a photo of a dog"? Please answer yes or no."
+        # A: "Yes"
+        questions = [question_template.format(text) for text in texts]
+        answers = [answer_template.format(text) for text in texts]
+
+        # Formatting for CLIP-FlanT5 desired input including system message and image tokens
+        questions = [format_question(question, conversation_style=self.conversational_style) for question in questions]
+        answers = [format_answer(answer, conversation_style=self.conversational_style) for answer in answers]
+
+        images = self.load_images(images)
+
+        input_ids = [t5_tokenizer_image_token(qs, self.tokenizer, return_tensors='pt') for qs in questions]
+        labels = [t5_tokenizer_image_token(ans, self.tokenizer, return_tensors='pt') for ans in answers]
+
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
+        input_ids = input_ids[:, :self.tokenizer.model_max_length]
+        labels = labels[:, :self.tokenizer.model_max_length]
+
+        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
+        decoder_attention_mask = labels.ne(IGNORE_INDEX)
+
+        input_ids, attention_mask, decoder_attention_mask, labels = input_ids.to(self.device), \
+            attention_mask.to(self.device), decoder_attention_mask.to(self.device), labels.to(self.device)
+        model_input_kwargs = {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'decoder_attention_mask': decoder_attention_mask,
+            'labels': labels,
+            'images': images,
+            'past_key_values': None,
+            'inputs_embeds': None,
+            'use_cache': None,
+            'output_attentions': None,
+            'output_hidden_states': None,
+            'return_dict': True,
+        }
+
+        outputs = self.model(**model_input_kwargs)
+
+        logits = outputs.logits
+        lm_prob = torch.zeros(logits.shape[0])
+        loss_fct = torch.nn.CrossEntropyLoss(reduction='mean')
+        for k in range(lm_prob.shape[0]):
+            lm_prob[k] = (
+                -loss_fct(logits[k], labels[k])).exp()  # exp to cancel the log and get raw prob between 0 and 1
+        return lm_prob
+
+    @torch.no_grad()
+    @torch.autocast(device_type='cuda', dtype=torch.bfloat16)
+    def generate(
+        self,
+        images: List[str],
+        prompts: List[str],
+        temperature: float = 0.2,
+    ):
+        """Forward pass of the model to return n strings for n (image, prompt) pairs
+        """
+        assert len(images) == len(prompts), 'Number of images and texts must match'
+
+        # Formatting for CLIP-FlanT5 desired input including system message and image tokens
+        questions = [format_question(prompt, conversation_style=self.conversational_style) for prompt in prompts]
+        images = self.load_images(images)
+
+        input_ids = [t5_tokenizer_image_token(qs, self.tokenizer, return_tensors='pt') for qs in questions]
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+        input_ids = input_ids[:, :self.tokenizer.model_max_length]
+
+        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
+
+        input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
+        model_input_kwargs = {
+            'inputs': input_ids,
+            'images': images,
+            'attention_mask': attention_mask,
+            'do_sample': True if temperature > 0 else False,
+            'temperature': temperature,
+            'top_p': None,
+            'num_beams': 1,
+            'max_new_token': 1024,
+            'use_cache': True,
+        }
+
+        outputs = self.model.generate(**model_input_kwargs)
+        outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        for i in range(len(outputs)):
+            if outputs[i].endswith(' '):
+                outputs[i] = outputs[i][:-1]
+            outputs[i] = outputs[i].strip()
+        return outputs
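The scoring rule in CLIPT5Model.forward is worth spelling out: each (image, text) pair is rendered as a yes/no question, padded label positions are filled with IGNORE_INDEX (which CrossEntropyLoss skips, assuming it matches the default ignore_index of -100), and the score is exp(-cross_entropy) of the decoder over the reference answer tokens — i.e. the probability (geometric mean over answer tokens) that CLIP-FlanT5 answers "Yes". A minimal, self-contained illustration of that identity with toy tensors, not the actual model:

import torch

# Toy decoder logits for a single sample: answer_len tokens over a small vocab.
vocab_size, answer_len = 8, 3
logits = torch.randn(answer_len, vocab_size)
labels = torch.randint(0, vocab_size, (answer_len,))  # tokenized reference answer

# Score as computed in forward(): exp of the negative mean cross-entropy.
loss_fct = torch.nn.CrossEntropyLoss(reduction='mean')
score = (-loss_fct(logits, labels)).exp()

# Equivalent view: geometric mean of the per-token probabilities of the answer.
probs = torch.softmax(logits, dim=-1)[torch.arange(answer_len), labels]
assert torch.allclose(score, probs.log().mean().exp(), atol=1e-5)
print(float(score))  # a value in (0, 1)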
@@ -0,0 +1,150 @@
+import base64
+import os
+import tiktoken
+import torch
+from openai import OpenAI
+from typing import List
+
+from .vqa_model import VQAScoreModel
+
+default_question_template = 'Does this figure show "{}"? Please answer yes or no.'
+default_answer_template = 'Yes'
+
+GPT4V_MODELS = {
+    # We recommend using 'gpt-4-turbo' for optimal performance.
+    'gpt-4-turbo': {},
+    'gpt-4o': {},
+}
+
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, 'rb') as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+def get_image_type(image_path):
+    image_type = image_path.split('.')[-1]
+    assert image_type in ['png', 'jpeg', 'jpg', 'gif', 'bmp', 'webp']
+    return image_type
+
+
+class GPT4VModel(VQAScoreModel):
+    """A wrapper for the GPT4V models"""
+
+    def __init__(self, model_name='gpt-4-turbo', device='cuda', cache_dir=None, openai_key=None, top_logprobs=2):
+        assert model_name in GPT4V_MODELS
+        assert openai_key is not None, 'Please provide an OpenAI API key'
+        self.openai_key = openai_key
+        self.top_logprobs = top_logprobs
+        super().__init__(model_name=model_name, device=device, cache_dir=cache_dir)
+
+    def load_model(self):
+        """Load the model, tokenizer, image transform
+        """
+        self.tokenizer = tiktoken.encoding_for_model(self.model_name)
+        self.client = OpenAI(api_key=self.openai_key)
+        # self.candidate_answers = GPT4V_MODELS[self.model_name]['candidate_answers']
+        # assert GPT4V_MODELS[self.model_name]['answer'] in self.candidate_answers
+        # self.candidate_tokens = []
+        # for ans in self.candidate_answers:
+        #     token = self.tokenizer.encode(ans)
+        #     assert len(token) == 1, "Currently only support single token answers"
+        #     self.candidate_tokens.append(token[0])
+
+    def load_images(self, image: List[str]) -> torch.Tensor:
+        """Load the image(s), and return the string
+        """
+        image = [{'path': img, 'type': get_image_type(img), 'base64': encode_image(img)} for img in image]
+        return image
+
+    def forward_single(self, image, question, answer):
+        try:
+            completion = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[{
+                    'role':
+                    'user',
+                    'content': [{
+                        'type': 'text',
+                        'text': question
+                    }, {
+                        'type': 'image_url',
+                        'image_url': {
+                            'url': f"data:image/{image['type']};base64,{image['base64']}"
+                        }
+                    }]
+                }],
+                logprobs=True,
+                top_logprobs=self.top_logprobs,
+                # logit_bias={yes_token:50, no_token:50}
+            )
+        except:
+            print(
+                f"Warning: completion not generated for image: {image['path']} and question: {question} and answer: {answer}"
+            )
+            print(f'Trying again with the same image')
+            try:
+                completion = self.client.chat.completions.create(
+                    model=self.model_name,
+                    messages=[{
+                        'role':
+                        'user',
+                        'content': [{
+                            'type': 'text',
+                            'text': question
+                        }, {
+                            'type': 'image_url',
+                            'image_url': {
+                                'url': f"data:image/{image['type']};base64,{image['base64']}"
+                            }
+                        }]
+                    }],
+                    logprobs=True,
+                    top_logprobs=self.top_logprobs,
+                )
+            except:
+                print(f"Failed image: {image['path']} and question: {question} and answer: {answer}")
+                return torch.Tensor([0.0])
+
+        # print(completion.choices[0].message)
+        # print(completion.choices[0].logprobs)
+        # print(completion.choices[0].logprobs.content[0])
+        is_generated = False
+        for top_logprob in completion.choices[0].logprobs.content[0].top_logprobs:
+            if top_logprob.token == answer:
+                is_generated = True
+                return torch.Tensor([top_logprob.logprob]).exp()
+        if not is_generated:
+            print(
+                f"Warning: answer not generated for image: {image['path']} and question: {question} and answer: {answer}"
+            )
+            print(completion.choices[0].logprobs.content[0].top_logprobs)
+            return torch.Tensor([0.0])
+
+    def forward(self,
+                images: List[str],
+                texts: List[str],
+                question_template: str = default_question_template,
+                answer_template: str = default_answer_template) -> torch.Tensor:
+        """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
+        """
+        assert len(images) == len(texts), 'Number of images and texts must match'
+        # Turn "a photo of a dog" into
+        # Q: "Does this figure show "a photo of a dog"? Please answer yes or no."
+        # A: "Yes"
+        questions = [question_template.format(text) for text in texts]
+        answers = [answer_template.format(text) for text in texts]
+
+        for ans in answers:
+            ans_tokens = self.tokenizer.encode(ans)
+            assert len(ans_tokens) == 1, 'Currently only support single token answers'
+
+        images = self.load_images(images)
+
+        lm_prob = torch.zeros(len(images))
+
+        for idx, (image, question, answer) in enumerate(zip(images, questions, answers)):
+            lm_prob[idx] = self.forward_single(image, question, answer)
+
+        return lm_prob
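GPT4VModel derives the same kind of score from the API instead of local logits: it sends the yes/no question alongside the base64-encoded image, requests logprobs for the first generated token, and returns exp(logprob) if the single-token answer ("Yes") appears among the top_logprobs, else 0.0. A hedged usage sketch — the import path mirrors the vendored layout in this diff, the base class is assumed to call load_model() in its constructor (the wrapper relies on self.client and self.tokenizer being set there), and the key and image path are placeholders:

from evalscope.metrics.t2v_metrics.models.vqascore_models.gpt4v_model import GPT4VModel

# Assumptions: the module path above matches this diff's layout, and the
# VQAScoreModel base class invokes load_model() during __init__.
model = GPT4VModel(model_name='gpt-4-turbo', openai_key='sk-...')  # placeholder key
scores = model.forward(
    images=['outputs/dog.png'],   # hypothetical local image produced by a t2i model
    texts=['a photo of a dog'],
)
print(scores)  # tensor of P(first token == 'Yes'); 0.0 when 'Yes' is not among the top logprobs

Note the bare except clauses in forward_single: any API failure (including rate limiting) silently maps to a score of 0.0, which can drag down benchmark averages without an obvious error.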
@@ -0,0 +1,26 @@
+"""
+Copyright (c) 2022, salesforce.com, inc.
+All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+import sys
+from omegaconf import OmegaConf
+
+from .common.registry import registry
+from .models import *
+from .processors import *
+
+root_dir = os.path.dirname(os.path.abspath(__file__))
+default_cfg = OmegaConf.load(os.path.join(root_dir, 'configs/default.yaml'))
+
+registry.register_path('library_root', root_dir)
+repo_root = os.path.join(root_dir, '..')
+registry.register_path('repo_root', repo_root)
+cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
+registry.register_path('cache_root', cache_root)
+
+registry.register('MAX_INT', sys.maxsize)
+registry.register('SPLIT_NAMES', ['train', 'val', 'test'])
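This vendored lavis/__init__.py reproduces LAVIS's registry bootstrap: importing the package pulls in the model and processor modules (which, following LAVIS's pattern, register their classes via decorators) and records filesystem roots resolved from configs/default.yaml. A hedged sketch of how that registry is typically consumed downstream; the method names and the 'blip2_image_text_matching' registration name follow the upstream LAVIS API, which this diff vendors in common/registry.py, and are assumptions here:

from evalscope.metrics.t2v_metrics.models.vqascore_models.lavis.common.registry import registry

# Importing the registry triggers the package __init__ above, so the paths and
# model classes are registered by the time these lookups run.
cache_root = registry.get_path('cache_root')
model_cls = registry.get_model_class('blip2_image_text_matching')  # assumed registered name
print(cache_root, model_cls)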