evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/arguments.py +2 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +4 -4
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +2 -2
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +21 -10
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
- evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +20 -6
- evalscope/config.py +8 -4
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +2 -2
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/arguments.py +24 -5
- evalscope/perf/benchmark.py +28 -42
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/utils/benchmark_util.py +14 -8
- evalscope/perf/utils/db_util.py +9 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +128 -78
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +10 -3
- evalscope/summarizer.py +2 -1
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +48 -29
- evalscope/version.py +2 -2
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_all.py +4 -4
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +19 -12
- tests/perf/test_perf.py +3 -3
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +29 -26
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/maritime_bench/__init__.py
File without changes
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
ADDED
@@ -0,0 +1,79 @@
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.utils import ResponseParser
+
+SUBSET_LIST = ['default']
+
+
+@Benchmark.register(
+    name='maritime_bench',
+    pretty_name='MaritimeBench',
+    dataset_id='HiDolphin/MaritimeBench',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    eval_split='test',
+    prompt_template=
+    '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}',  # noqa: E501
+)
+class MaritimeBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D']
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        prefix = ''
+        query = prefix + input_d['question'] + '\n'
+        available_choices = []
+        for option in self.choices:
+            if option in input_d and input_d[option]:
+                query += option + ':' + input_d[option] + '\n'
+                available_choices.append(option)
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt, choices=available_choices)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the raw model prediction (pred).
+
+        Args:
+            pred: model prediction. Depending on the model.
+
+        Returns:
+            The parsed prediction. e.g. model answer... Depending on the model.
+        """
+
+        return ResponseParser.parse_bracketed_answer(result, options=self.choices)
+
+    def match(self, gold: Any, pred: Any) -> Any:
+        """
+        Match the gold answer with the predicted answer.
+
+        Args:
+            gold: The gold answer.
+            pred: The predicted answer.
+
+        Returns:
+            The result of the match.
+        """
+        return exact_match(gold=gold, pred=pred)
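Since the adapter registers itself under the name 'maritime_bench', it should be selectable like any other benchmark once this version is installed. Below is a minimal, hedged usage sketch; the model id, endpoint URL, and the 'service' eval type value are placeholders and assumptions, not part of this diff.

# Hedged sketch: running the newly registered MaritimeBench benchmark via evalscope.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task = TaskConfig(
    model='qwen2.5-7b-instruct',                           # placeholder model id
    api_url='http://127.0.0.1:8000/v1/chat/completions',   # placeholder endpoint
    eval_type='service',                                   # assumed value for API-based evaluation
    datasets=['maritime_bench'],                           # name used in @Benchmark.register above
    limit=10,                                              # small smoke test
)
run_task(task)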
evalscope/benchmarks/math_500/math_500_adapter.py
CHANGED
@@ -1,5 +1,5 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger

 # flake8: noqa
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED
@@ -137,7 +137,7 @@ SUBJECT_MAPPING = {
     name='mmlu',
     pretty_name='MMLU',
     dataset_id='modelscope/mmlu',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
@@ -145,7 +145,7 @@ SUBJECT_MAPPING = {
     train_split='train',
     eval_split='test',
     prompt_template=
-
+    """Answer the following multiple choice question about {subset_name}. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{query}""",  # noqa: E501
 )
 class MMLUAdapter(DataAdapter):

@@ -224,9 +224,8 @@ class MMLUAdapter(DataAdapter):

         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

         return self.gen_prompt_data(full_prompt)

@@ -249,7 +248,7 @@ class MMLUAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
@@ -260,11 +259,12 @@ class MMLUAdapter(DataAdapter):

         example: str = input_d['input']
         for j in range(len(self.choices)):
-            example += '\n{
+            example += f'\n{self.choices[j]}) {input_choices[j]}'

-        example += '\nAnswer:'
         if include_answer:
-            example +=
+            example += f"\nAnswer: {input_d['target']}\n\n"
+        else:
+            example += '\nAnswer: \n\n'

         return example

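The net effect of these MMLU changes is that the few-shot context itself becomes the {query} of the new prompt template, choices are rendered as 'A) ...' lines, and answers are parsed with the adapter's own choice list. A rough illustration of what the _generate_prompt logic shown above produces for one record; the field layout of input_d and the sample values are invented for the example:

# Illustration only: mirrors the choice/answer formatting shown in the diff.
choices = ['A', 'B', 'C', 'D']
input_d = {'input': 'What is the capital of France?',
           'A': 'Berlin', 'B': 'Paris', 'C': 'Rome', 'D': 'Madrid',
           'target': 'B'}                      # invented record layout
input_choices = [input_d[c] for c in choices]

example = input_d['input']
for j in range(len(choices)):
    example += f'\n{choices[j]}) {input_choices[j]}'
example += f"\nAnswer: {input_d['target']}\n\n"   # include_answer=True branch
print(example)
# What is the capital of France?
# A) Berlin
# B) Paris
# C) Rome
# D) Madrid
# Answer: B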
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -92,7 +92,7 @@ class MMLUProAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
CHANGED
@@ -164,7 +164,7 @@ class MMLUReduxAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/musr/musr_adapter.py
CHANGED
@@ -62,7 +62,7 @@ class MuSRAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/simple_qa/simple_qa_adapter.py
CHANGED
@@ -3,8 +3,7 @@ from collections import defaultdict
 from typing import Any, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger

 # flake8: noqa
evalscope/benchmarks/utils.py
CHANGED
@@ -1,6 +1,6 @@
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from functools import wraps
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

 from evalscope.constants import EvalType
 from evalscope.utils.filters import Filter
@@ -9,30 +9,21 @@ from evalscope.utils.filters import Filter
 @dataclass
 class PromptData:
     data: List[str]
-    index: Optional[int] = 0
+    index: Optional[Union[int, str]] = 0
     system_prompt: Optional[str] = None
     multi_choices: Optional[List[str]] = None
+    id: Optional[str] = None

     def to_dict(self) -> Dict:
-
-            return {
-                'data': self.data,
-                'index': self.index,
-                'system_prompt': self.system_prompt,
-            }
-        else:
-            return {
-                'data': self.data,
-                'index': self.index,
-                'system_prompt': self.system_prompt,
-                'multi_choices': self.multi_choices,
-            }
+        return {k: v for k, v in asdict(self).items() if v is not None}


 def preprocess_decorator(func):

     @wraps(func)
     def wrapper(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT):
+        if result is None:
+            result = ''
         filters = self.config_kwargs.get('filters', None)
         if filters:
             # Apply filters to the result
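The rewritten to_dict simply serializes the dataclass via asdict and drops None-valued fields, so the new optional id and multi_choices keys only appear when they are actually set. A quick sketch of the resulting behavior, using invented example values:

# Sketch of the new asdict-based to_dict behavior (example values invented).
from dataclasses import asdict, dataclass
from typing import Dict, List, Optional, Union

@dataclass
class PromptData:
    data: List[str]
    index: Optional[Union[int, str]] = 0
    system_prompt: Optional[str] = None
    multi_choices: Optional[List[str]] = None
    id: Optional[str] = None

    def to_dict(self) -> Dict:
        return {k: v for k, v in asdict(self).items() if v is not None}

print(PromptData(data=['Q1'], index=3).to_dict())
# {'data': ['Q1'], 'index': 3}  -- system_prompt/multi_choices/id omitted because they are None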
evalscope/cli/start_app.py
CHANGED
@@ -21,7 +21,7 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
-        from evalscope.report
+        from evalscope.report import add_argument

         parser = parsers.add_parser(StartAppCMD.name)
         add_argument(parser)
evalscope/collections/evaluator.py
CHANGED
@@ -1,8 +1,10 @@
 import json
 import os
 import pandas as pd
+import random
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from copy import deepcopy
 from tabulate import tabulate
 from tqdm import tqdm
 from typing import List
@@ -10,7 +12,7 @@ from typing import List
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalType
+from evalscope.constants import AnswerKeys, DataCollection, DumpMode, EvalType
 from evalscope.evaluator import Evaluator
 from evalscope.models import initialize_model_adapter
 from evalscope.report import ReportGenerator
@@ -65,11 +67,12 @@ class EvaluatorCollection:
         self.evaluators = self._initialize_evaluators()

     def load(self) -> tuple[list[DatasetEntry], str]:
-        dataset_name = os.path.basename(self.data_adapter.dataset_id)
+        dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
-        # limit the dataset
+        # random limit the dataset
         if self.task_cfg.limit:
-            raw_dataset = raw_dataset
+            raw_dataset = random.sample(raw_dataset,
+                                        self.task_cfg.limit) if len(raw_dataset) > self.task_cfg.limit else raw_dataset
         # index dataset
         datasets = []
         for sample in raw_dataset:
@@ -95,10 +98,17 @@ class EvaluatorCollection:

     def _initialize_evaluators(self):
         evaluators = {}
+        # load dataset args
+        dataset_args = deepcopy(self.task_cfg.dataset_args)
+        common_args = dataset_args.get(DataCollection.NAME, {})
         for dataset_name in self.dataset_name_map.keys():
             benchmark = Benchmark.get(dataset_name)
             model_adapter = initialize_model_adapter(self.task_cfg, benchmark, self.model)
-
+            # update dataset args
+            cur_dataset_args = dataset_args.get(dataset_name, {})
+            cur_dataset_args.update(common_args)
+            # get data adapter
+            data_adapter = benchmark.get_data_adapter(cur_dataset_args)
             evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
                                                        self.outputs)
         return evaluators
@@ -174,6 +184,7 @@ class EvaluatorCollection:
         os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
         with open(report_file_path, 'w', encoding='utf-8') as f:
             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+        return report

     def _filter_answer(self, pred_file_path):
         answer_dict = defaultdict(dict)
@@ -184,12 +195,14 @@ class EvaluatorCollection:
                 index = answer.get(AnswerKeys.INDEX)
                 answer_dict[index] = answer
                 indices.add(index)
+
             data = []
             for sample in self.dataset:
                 if sample.index not in indices:
                     data.append(sample)
             data_map = self._init_name_map(data)

+            logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
             return answer_dict, data, data_map
         return answer_dict, self.dataset, self.dataset_name_map

@@ -274,4 +287,5 @@ class EvaluatorCollection:
         answers = self.get_answers()
         reviews = self.get_reviews(answers)
         scores = self.get_scores(reviews)
-        self.get_report(scores)
+        report = self.get_report(scores)
+        return report
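Two behavioral notes fall out of this file: limit now takes a random sample of the collection instead of a fixed slice, and per-dataset args are merged with collection-level args stored under the DataCollection.NAME key, with the collection-level values taking precedence. A standalone sketch of that merge order; the 'data_collection' key string and the argument values are assumptions for illustration:

# Sketch of the dataset-args merge shown in _initialize_evaluators (key/value names invented).
from copy import deepcopy

dataset_args = {
    'data_collection': {'local_path': './outputs'},   # assumed value of DataCollection.NAME
    'mmlu': {'few_shot_num': 0},
}
common_args = dataset_args.get('data_collection', {})

cur_dataset_args = deepcopy(dataset_args.get('mmlu', {}))
cur_dataset_args.update(common_args)   # collection-level args win on key conflicts
print(cur_dataset_args)                # {'few_shot_num': 0, 'local_path': './outputs'}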
evalscope/config.py
CHANGED
@@ -4,13 +4,12 @@ import copy
 import json
 import os
 from argparse import Namespace
-from collections import OrderedDict
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union

 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
-                                 JudgeStrategy, OutputType)
-from evalscope.models
+                                 JudgeStrategy, ModelTask, OutputType)
+from evalscope.models import CustomModel, DummyCustomModel
 from evalscope.utils import gen_hash
 from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
@@ -36,6 +35,7 @@ class TaskConfig:
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
+    model_task: Optional[str] = ModelTask.TEXT_GENERATION

     # Template-related arguments
     template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
@@ -79,6 +79,10 @@ class TaskConfig:
     judge_model_args: Optional[Dict] = field(default_factory=lambda: {})

     def __post_init__(self):
+        if self.model is None:
+            self.model = DummyCustomModel()
+            self.eval_type = EvalType.CUSTOM
+
         if (not self.model_id) and self.model:
             if isinstance(self.model, CustomModel):
                 self.model_id = self.model.config.get('model_id', 'custom_model')
@@ -212,7 +216,7 @@ def parse_task_config(task_cfg) -> TaskConfig:
         logger.info('Args: Task config is provided with CommandLine type.')
         task_cfg = TaskConfig.from_args(task_cfg)
     elif isinstance(task_cfg, str):
-        extension =
+        extension = os.path.splitext(task_cfg)[-1]
         logger.info(f'Args: Task config is provided with {extension} file type.')
         if extension in ['yaml', 'yml']:
             task_cfg = TaskConfig.from_yaml(task_cfg)
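With the new __post_init__ branch, constructing a TaskConfig without a model no longer has to be special-cased by callers: it falls back to a DummyCustomModel and forces eval_type to EvalType.CUSTOM. A minimal sketch of that behavior (dataset name is a placeholder; the expected outputs follow from the hunk above):

# Sketch of the new model-less fallback in TaskConfig.__post_init__.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType

cfg = TaskConfig(datasets=['general_qa'])   # note: no model given
print(type(cfg.model).__name__)             # expected: DummyCustomModel
print(cfg.eval_type == EvalType.CUSTOM)     # expected: True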
evalscope/constants.py
CHANGED
@@ -1,4 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa
+import os
+
+os.environ['MODELSCOPE_LOG_LEVEL'] = '40'  # Set default log level to ERROR
+
 from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
 
@@ -145,6 +150,7 @@ class OutputType:
     GENERATION = 'generation'  # for text generation tasks and general tasks
     MULTIPLE_CHOICE = 'multiple_choice_logits'  # for multiple choice tasks
     CONTINUOUS = 'continuous_logits'  # for continuous tasks
+    IMAGE_GENERATION = 'image_generation'  # for image generation tasks


 class EvalBackend:
@@ -164,3 +170,8 @@ class JudgeStrategy:
     RULE = 'rule'
     LLM = 'llm'
     LLM_RECALL = 'llm_recall'
+
+
+class ModelTask:
+    TEXT_GENERATION = 'text_generation'
+    IMAGE_GENERATION = 'image_generation'
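The new ModelTask constants pair with the model_task field added to TaskConfig in this release, presumably so text-to-image evaluation (the new aigc/t2i adapters) can be routed separately from text generation. A small sketch; the dataset name is a placeholder and the routing itself is not shown in this diff:

# Sketch: selecting the new image-generation task type.
from evalscope.config import TaskConfig
from evalscope.constants import ModelTask

t2i_cfg = TaskConfig(
    model_task=ModelTask.IMAGE_GENERATION,   # new field/constant from this diff
    datasets=['tifa160'],                    # placeholder benchmark name
)
print(t2i_cfg.model_task)                    # 'image_generation'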
evalscope/evaluator/evaluator.py
CHANGED
@@ -66,7 +66,7 @@ class Evaluator(object):
         if self.task_cfg.judge_strategy == JudgeStrategy.RULE:
             self.judge = None
         else:
-            from evalscope.metrics
+            from evalscope.metrics import LLMJudge
             self.judge = LLMJudge(**self.task_cfg.judge_model_args)

     def load_dataset(self):
@@ -281,7 +281,7 @@ class Evaluator(object):
         os.makedirs(os.path.dirname(review_file_path), exist_ok=True)

         if self.use_cache and os.path.exists(review_file_path):
-            logger.
+            logger.info(f'Updating the review file: {review_file_path} ...')
             os.remove(review_file_path)

         def process_single_review(answer_d):
evalscope/evaluator/reviewer/auto_reviewer.py
CHANGED
@@ -11,7 +11,7 @@ from functools import partial
 from typing import Any, List, Tuple

 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
-from evalscope.models
+from evalscope.models import OpenAIModel
 from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
evalscope/metrics/__init__.py
CHANGED
@@ -1,5 +1,50 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from
-
-from evalscope.
-
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .llm_judge import LLMJudge
+    from .math_parser import extract_answer, math_equal, strip_answer_string
+    from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
+                          weighted_mean)
+    from .named_metrics import Metric, metric_registry
+    from .rouge_metric import compute_rouge_score_one_sample_zh
+
+else:
+    _import_structure = {
+        'metrics': [
+            'bleu_ngram_one_sample',
+            'exact_match',
+            'macro_mean',
+            'mean',
+            'micro_mean',
+            'simple_f1_score',
+            'weighted_mean',
+        ],
+        'named_metrics': [
+            'Metric',
+            'metric_registry',
+        ],
+        'rouge_metric': [
+            'compute_rouge_score_one_sample_zh',
+        ],
+        'llm_judge': [
+            'LLMJudge',
+        ],
+        'math_parser': [
+            'extract_answer',
+            'math_equal',
+            'strip_answer_string',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
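evalscope.metrics now defers its submodule imports through a _LazyModule helper (added in evalscope/utils/import_utils.py), so importing the package no longer pulls in heavy metric dependencies up front and attributes resolve on first access. The helper's implementation is not part of this diff; the following is a generic sketch of the same lazy-module pattern, not the project's code:

# Generic illustration of the lazy-module pattern (not evalscope's _LazyModule).
import importlib
import types

class LazyModule(types.ModuleType):
    """Resolve attributes to submodule members on first access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported attribute to the submodule that defines it.
        self._attr_to_module = {attr: mod for mod, attrs in import_structure.items() for attr in attrs}
        self.__all__ = list(self._attr_to_module)

    def __getattr__(self, attr):
        if attr not in self._attr_to_module:
            raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
        submodule = importlib.import_module('.' + self._attr_to_module[attr], self.__name__)
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache so later lookups bypass __getattr__
        return value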
evalscope/metrics/llm_judge.py
CHANGED
@@ -54,7 +54,7 @@ class LLMJudge:
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config

-        from evalscope.models
+        from evalscope.models import ServerModelAdapter

         # Initialize ServerModelAdapter
         self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
evalscope/metrics/named_metrics.py
CHANGED
@@ -3,6 +3,8 @@ from functools import partial
 from typing import Callable, Dict

 from evalscope.metrics.metrics import mean, pass_at_k, weighted_mean
+from evalscope.metrics.t2v_metrics import (blip2_score, clip_flant5_score, clip_score, fga_blip2_score, hpsv2_1_score,
+                                           hpsv2_score, image_reward_score, mps_score, pick_score)


 @dataclass
@@ -40,3 +42,14 @@ metric_registry.register(Metric(name='WeightedAverageBLEU', object=weighted_mean))
 metric_registry.register(Metric(name='AveragePass@1', object=mean))
 for k in range(1, 17):
     metric_registry.register(Metric(name=f'Pass@{k}', object=partial(pass_at_k, k=k)))
+
+# t2v_metrics
+metric_registry.register(Metric(name='VQAScore', object=clip_flant5_score))
+metric_registry.register(Metric(name='PickScore', object=pick_score))
+metric_registry.register(Metric(name='CLIPScore', object=clip_score))
+metric_registry.register(Metric(name='BLIPv2Score', object=blip2_score))
+metric_registry.register(Metric(name='HPSv2Score', object=hpsv2_score))
+metric_registry.register(Metric(name='HPSv2.1Score', object=hpsv2_1_score))
+metric_registry.register(Metric(name='ImageRewardScore', object=image_reward_score))
+metric_registry.register(Metric(name='FGA_BLIP2Score', object=fga_blip2_score))
+metric_registry.register(Metric(name='MPS', object=mps_score))
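These registrations show the whole extension surface: a Metric wraps a name plus a callable, and metric_registry.register makes the name addressable from a benchmark's metric_list. Registering an additional metric would follow the same shape; the metric name and aggregation function below are invented for illustration:

# Sketch: registering an extra named metric, following the pattern above.
from evalscope.metrics import Metric, metric_registry

def median(values):
    values = sorted(values)
    mid = len(values) // 2
    return values[mid] if len(values) % 2 else (values[mid - 1] + values[mid]) / 2

metric_registry.register(Metric(name='MedianAccuracy', object=median))  # invented metric name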
evalscope/metrics/t2v_metrics/__init__.py
ADDED
@@ -0,0 +1,66 @@
+from __future__ import absolute_import, division, print_function
+
+from .clipscore import CLIPScore, list_all_clipscore_models
+from .constants import CACHE_DIR
+from .itmscore import ITMScore, list_all_itmscore_models
+from .vqascore import VQAScore, list_all_vqascore_models
+
+
+def list_all_models():
+    return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models()
+
+
+def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR, **kwargs):
+    if model in list_all_vqascore_models():
+        return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs)
+    elif model in list_all_clipscore_models():
+        return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs)
+    elif model in list_all_itmscore_models():
+        return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs)
+    else:
+        raise NotImplementedError()
+
+
+def clip_flant5_score():
+    clip_flant5_score = VQAScore(model='clip-flant5-xxl')
+    return clip_flant5_score
+
+
+def pick_score():
+    pick_score = CLIPScore(model='pickscore-v1')
+    return pick_score
+
+
+def clip_score():
+    clip_score = CLIPScore(model='openai:ViT-L-14-336')
+    return clip_score
+
+
+def blip2_score():
+    blip_itm_score = ITMScore(model='blip2-itm')
+    return blip_itm_score
+
+
+def hpsv2_score():
+    hpsv2_score = CLIPScore(model='hpsv2')
+    return hpsv2_score
+
+
+def hpsv2_1_score():
+    hpsv2_1_score = CLIPScore(model='hpsv2.1')
+    return hpsv2_1_score
+
+
+def image_reward_score():
+    image_reward_score = ITMScore(model='image-reward-v1')
+    return image_reward_score
+
+
+def fga_blip2_score():
+    fga_blip2_score = ITMScore(model='fga_blip2')
+    return fga_blip2_score
+
+
+def mps_score():
+    mps_score = CLIPScore(model='mps')
+    return mps_score
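These factory functions return scorer objects from the vendored t2v_metrics stack, and the named_metrics registrations above suggest the same objects back the new T2I metrics. A hedged usage sketch follows; the (images=..., texts=...) call convention mirrors the upstream t2v_metrics package and is an assumption here, as are the file paths and prompts:

# Hedged sketch: scoring generated images against their prompts.
from evalscope.metrics.t2v_metrics import get_score_model

scorer = get_score_model(model='pickscore-v1', device='cuda')   # model name taken from pick_score() above
scores = scorer(
    images=['outputs/img_0.png', 'outputs/img_1.png'],          # placeholder paths
    texts=['a red bicycle leaning on a wall', 'a bowl of ramen'],
)
print(scores)   # expected: one score per image/text pair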
evalscope/metrics/t2v_metrics/clipscore.py
ADDED
@@ -0,0 +1,14 @@
+from typing import List
+
+from .constants import CACHE_DIR
+from .models.clipscore_models import get_clipscore_model, list_all_clipscore_models
+from .score import Score
+
+
+class CLIPScore(Score):
+
+    def prepare_scoremodel(self, model='openai:ViT-L/14', device='cuda', cache_dir=CACHE_DIR):
+        return get_clipscore_model(model, device=device, cache_dir=cache_dir)
+
+    def list_all_models(self) -> List[str]:
+        return list_all_clipscore_models()
evalscope/metrics/t2v_metrics/constants.py
ADDED
@@ -0,0 +1,12 @@
+import os
+from modelscope.utils.file_utils import get_model_cache_root
+
+CACHE_DIR = get_model_cache_root()
+os.environ['TORCH_HOME'] = CACHE_DIR  # set timm cache dir
+
+# For CLIP-FlanT5
+CONTEXT_LEN = 2048
+SYSTEM_MSG = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = '<image>'
evalscope/metrics/t2v_metrics/itmscore.py
ADDED
@@ -0,0 +1,14 @@
+from typing import List
+
+from .constants import CACHE_DIR
+from .models.itmscore_models import get_itmscore_model, list_all_itmscore_models
+from .score import Score
+
+
+class ITMScore(Score):
+
+    def prepare_scoremodel(self, model='blip2-itm', device='cuda', cache_dir=CACHE_DIR):
+        return get_itmscore_model(model, device=device, cache_dir=cache_dir)
+
+    def list_all_models(self) -> List[str]:
+        return list_all_itmscore_models()
evalscope/metrics/t2v_metrics/models/__init__.py
File without changes
evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py
ADDED
@@ -0,0 +1,30 @@
+from ...constants import CACHE_DIR
+from .clip_model import CLIP_MODELS, CLIPScoreModel
+from .hpsv2_model import HPSV2_MODELS, HPSV2ScoreModel
+from .mps_model import MPS_MODELS, MPSModel
+from .pickscore_model import PICKSCORE_MODELS, PickScoreModel
+
+ALL_CLIP_MODELS = [
+    CLIP_MODELS,
+    HPSV2_MODELS,
+    PICKSCORE_MODELS,
+    MPS_MODELS,
+]
+
+
+def list_all_clipscore_models():
+    return [model for models in ALL_CLIP_MODELS for model in models]
+
+
+def get_clipscore_model(model_name, device='cuda', cache_dir=CACHE_DIR):
+    assert model_name in list_all_clipscore_models()
+    if model_name in CLIP_MODELS:
+        return CLIPScoreModel(model_name, device=device, cache_dir=cache_dir)
+    elif model_name in HPSV2_MODELS:
+        return HPSV2ScoreModel(model_name, device=device, cache_dir=cache_dir)
+    elif model_name in PICKSCORE_MODELS:
+        return PickScoreModel(model_name, device=device, cache_dir=cache_dir)
+    elif model_name in MPS_MODELS:
+        return MPSModel(model_name, device=device, cache_dir=cache_dir)
+    else:
+        raise NotImplementedError()
evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py
File without changes