evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +35 -0
- evalscope/api/benchmark/meta.py +6 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/state.py +12 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +47 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +0 -1
- evalscope/api/model/generate_config.py +1 -3
- evalscope/api/model/model.py +4 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +2 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
- evalscope/benchmarks/bfcl/generation.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +72 -13
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +6 -4
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +20 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +7 -4
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/utils/benchmark_util.py +8 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/generator.py +8 -87
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +42 -1
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
- tests/benchmark/test_eval.py +30 -31
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/benchmarks/mmmu/mmmu_adapter.py
@@ -0,0 +1,159 @@
+import ast
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+# flake8: noqa
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Accounting',
+    'Agriculture',
+    'Architecture_and_Engineering',
+    'Art',
+    'Art_Theory',
+    'Basic_Medical_Science',
+    'Biology',
+    'Chemistry',
+    'Clinical_Medicine',
+    'Computer_Science',
+    'Design',
+    'Diagnostics_and_Laboratory_Medicine',
+    'Economics',
+    'Electronics',
+    'Energy_and_Power',
+    'Finance',
+    'Geography',
+    'History',
+    'Literature',
+    'Manage',
+    'Marketing',
+    'Materials',
+    'Math',
+    'Mechanical_Engineering',
+    'Music',
+    'Pharmacy',
+    'Physics',
+    'Psychology',
+    'Public_Health',
+    'Sociology',
+]
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+OPEN_PROMPT = """
+Solve the following problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+{question}
+
+Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
+"""
+
+MULTI_CHOICE_TYPE = 'multiple-choice'
+OPEN_TYPE = 'open'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmmu',
+        pretty_name='MMMU',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'MMMU (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI) benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering. These questions span 30 subjects and 183 subfields, comprising 30 highly heterogeneous image types, such as charts, diagrams, maps, tables, music sheets, and chemical structures.', # noqa: E501
+        dataset_id='AI-ModelScope/MMMU',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='validation',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class MMMUAdapter(VisionLanguageAdapter):
+    MAX_IMAGES: int = 7
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question_type = record['question_type']
+        content_list, answers_list = MMMUAdapter.create_content_and_answers_list(record)
+
+        metadata = {
+            'id': record['id'],
+            'question_type': record['question_type'],
+            'subfield': record['subfield'],
+            'explanation': record['explanation'],
+            'img_type': record['img_type'],
+            'topic_difficulty': record['topic_difficulty'],
+        }
+
+        if question_type == MULTI_CHOICE_TYPE:
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                choices=answers_list,
+                target=record['answer'],
+                metadata=metadata,
+            )
+        elif question_type == OPEN_TYPE:
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                target=record['answer'],
+                metadata=metadata,
+            )
+        else:
+            raise ValueError(f'Unsupported question type: {question_type}')
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        question_type = task_state.metadata['question_type']
+        if question_type == MULTI_CHOICE_TYPE:
+            answers = parse_answers(task_state)
+            return ''.join(sorted(list(answers)))
+        elif question_type == OPEN_TYPE:
+            pattern = r'ANSWER:\s*(.*)'
+            match = re.search(pattern, prediction)
+            if match:
+                return match.group(1).strip()
+            return ''
+        else:
+            raise ValueError(f'Unsupported question type: {question_type}')
+
+    @staticmethod
+    def create_content_and_answers_list(record: Dict[str, Any]) -> tuple[List[Content], List[str]]:
+        """
+        Create a list of content elements and a list of answers from a record.
+
+        Args:
+            record (dict): The record containing question, images, and options.
+
+
+        Returns:
+            tuple: A tuple containing:
+                - content_list (list): A list of content elements (text and images).
+                - answers_list (list): A list of possible answers (for multiple-choice questions).
+        """
+        question_type = record['question_type']
+
+        if question_type == MULTI_CHOICE_TYPE:
+            answers_list: List[str] = ast.literal_eval(record['options'])
+            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: List[Content] = [ContentText(text=input_text)]
+        else:
+            answers_list: List[str] = []
+            content_list: List[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+
+        for i in range(MMMUAdapter.MAX_IMAGES):
+            image = record[f'image_{i+1}']
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        return content_list, answers_list
File without changes
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py
@@ -0,0 +1,129 @@
+import ast
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, parse_answers, prompt
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Accounting',
+    'Agriculture',
+    'Architecture_and_Engineering',
+    'Art',
+    'Art_Theory',
+    'Basic_Medical_Science',
+    'Biology',
+    'Chemistry',
+    'Clinical_Medicine',
+    'Computer_Science',
+    'Design',
+    'Diagnostics_and_Laboratory_Medicine',
+    'Economics',
+    'Electronics',
+    'Energy_and_Power',
+    'Finance',
+    'Geography',
+    'History',
+    'Literature',
+    'Manage',
+    'Marketing',
+    'Materials',
+    'Math',
+    'Mechanical_Engineering',
+    'Music',
+    'Pharmacy',
+    'Physics',
+    'Psychology',
+    'Public_Health',
+    'Sociology',
+]
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+VISION_PROMPT = r"""
+Answer the following multiple choice question in image. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
+
+""".strip() # noqa: E501
+
+DATASET_FORMATS = ['standard (4 options)', 'standard (10 options)', 'vision']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmmu_pro',
+        pretty_name='MMMU-PRO',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'MMMU-Pro is an enhanced multimodal benchmark designed to rigorously assess the true understanding capabilities of advanced AI models across multiple modalities. It builds upon the original MMMU benchmark by introducing several key improvements that make it more challenging and realistic, ensuring that models are evaluated on their genuine ability to integrate and comprehend both visual and textual information.', # noqa: E501
+        dataset_id='AI-ModelScope/MMMU_Pro',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+        extra_params={
+            'dataset_format': f"# choose from {DATASET_FORMATS}, default 'standard (4 options)'",
+        }
+    )
+)
+class MMMUPROAdapter(VisionLanguageAdapter):
+    MAX_IMAGES: int = 7
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.reformat_subset = True
+        self.dataset_format = self.extra_params.get('dataset_format', 'standard (4 options)')
+        if self.dataset_format not in DATASET_FORMATS:
+            logger.warning(f"Invalid dataset_format '{self.dataset_format}', fallback to 'standard (4 options)'")
+            self.dataset_format = 'standard (4 options)'
+        self.default_subset = self.dataset_format
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        metadata = {
+            'id': record['id'],
+            'explanation': record.get('explanation'),
+            'img_type': record.get('img_type'),
+            'topic_difficulty': record.get('topic_difficulty'),
+            'subject': record.get('subject')
+        }
+
+        answers_list: List[str] = ast.literal_eval(record['options'])
+
+        if self.dataset_format == 'vision':
+            letters = ','.join(answer_character(i) for i in range(len(answers_list)))
+            input_text = VISION_PROMPT.format(letters=letters)
+            content_list: List[Content] = [ContentText(text=input_text)]
+
+            image = record.get('image')
+            if image:
+                content_list.append(ContentImage(image=bytes_to_base64(image['bytes'], format='png', add_header=True)))
+        else:
+            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: List[Content] = [ContentText(text=input_text)]
+
+            for i in range(MMMUPROAdapter.MAX_IMAGES):
+                image = record.get(f'image_{i+1}')
+                if image:
+                    image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+                    content_list.append(ContentImage(image=image_base64))
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            choices=answers_list,
+            target=record['answer'],
+            subset_key=record['subject'],
+            metadata=metadata,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
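For context, here is a minimal usage sketch (not part of the diff) of how the new MMMU and MMMU-Pro adapters might be driven from Python. The model name, endpoint, API key, and the TaskConfig/run_task/dataset_args plumbing are assumptions carried over from earlier evalscope releases, not values taken from this package.

# Hedged sketch: evaluate the new 'mmmu' and 'mmmu_pro' benchmarks against an
# OpenAI-compatible vision-language endpoint. All names below are placeholders,
# and the config fields are assumed to match earlier evalscope releases.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen-vl-plus',                                 # placeholder VLM served behind an API
    api_url='http://localhost:8801/v1/chat/completions',  # placeholder endpoint
    api_key='EMPTY',
    eval_type='service',                                  # assumption: API-based evaluation mode
    datasets=['mmmu', 'mmmu_pro'],
    dataset_args={
        'mmmu_pro': {
            # mirrors the adapter's dataset_format extra_params option shown above
            'extra_params': {'dataset_format': 'standard (4 options)'},
        },
    },
    limit=5,  # small smoke-test run
)

run_task(task_cfg=task_cfg)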
@@ -164,7 +164,11 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
                 records.append(record)
 
             dataset = DictDataLoader(
-                dict_list=records,
+                dict_list=records,
+                limit=self.limit,
+                repeats=self.repeats,
+                sample_fields=self.record_to_sample,
+                shuffle=self.shuffle,
             ).load()
 
             datasets[subset_name] = dataset
@@ -45,7 +45,7 @@ def _patch_agent_solve(model: Model):
             input=[dict_to_chat_message(msg) for msg in messages],
             tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
         )
-        oai_res = openai_chat_choices(res.choices)
+        oai_res = openai_chat_choices(res.choices, include_reasoning=False)
 
         next_message = oai_res[0].message.model_dump(exclude_none=True)
 
@@ -13,6 +13,7 @@ from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils import get_logger
 from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
 
 logger = get_logger()
 
@@ -35,8 +36,8 @@ logger = get_logger()
         'api_key': 'EMPTY',
         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
         'generation_config': {
-            'temperature': 0.
-            '
+            'temperature': 0.0,
+            'max_tokens': 4096,
         }
     }
 )
@@ -46,22 +47,13 @@ class TauBenchAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-        if spec is None:
-            raise ImportError(
-                '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.' # noqa: E501
-            )
+        check_import('tau_bench', package='git+https://github.com/sierra-research/tau-bench', raise_error=True)
 
         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')
         self.api_key = self.extra_params.get('api_key', 'EMPTY')
         self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
-        self.generation_config = self.extra_params.get(
-            'generation_config', {
-                'temperature': 0.7,
-                'max_new_tokens': 1024
-            }
-        )
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
 
         self._patch_env_completion()
 
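For reference, a hedged sketch (not from the diff) of how the user-simulator settings that this __init__ reads from extra_params could be supplied from a task config. The dataset_args passthrough, model names, and endpoint are assumptions based on earlier evalscope usage rather than anything defined in this hunk.

# Hedged sketch: overriding tau_bench user-simulator settings via extra_params.
# 'my-agent-model' and the endpoint are placeholders; the dataset_args mechanism
# is assumed to behave as in earlier evalscope releases.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='my-agent-model',  # placeholder: the model under evaluation
    datasets=['tau_bench'],
    dataset_args={
        'tau_bench': {
            'extra_params': {
                'user_model': 'qwen-plus',  # user simulator, matching the adapter defaults
                'api_key': 'EMPTY',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
            },
        },
    },
)

run_task(task_cfg=task_cfg)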
@@ -84,10 +76,10 @@ class TauBenchAdapter(DefaultDataAdapter):
 
             res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])
 
-            message = res.
+            message = {'role': 'assistant', 'content': res.completion}
             self.messages.append(message)
             self.total_cost = 0
-            return
+            return res.completion
 
         # get the current instance of TauBenchAdapter
         adapter_instance = self
@@ -114,7 +106,11 @@ class TauBenchAdapter(DefaultDataAdapter):
                 })
             # load dataset
             dataset = DictDataLoader(
-                dict_list=tasks,
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
             ).load()
 
             data_dict[env_name] = dataset
@@ -145,15 +141,15 @@ class TauBenchAdapter(DefaultDataAdapter):
 
         try:
             # Parse the prediction to get the reward
-
-            reward =
+            task_result = task_state.metadata['task_result']
+            reward = task_result.get('reward', 0.0)
 
             score.value = {
                 'Pass^1': float(reward),
             }
             score.explanation = f'Task completed with reward: {reward}'
             score.metadata = {
-                'task_result':
+                'task_result': task_result,
                 'env_name': task_state.metadata.get('env_name', 'unknown'),
                 'task_index': task_state.metadata.get('task_index', -1)
             }
File without changes
@@ -16,8 +16,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='evalmuse',
+        pretty_name='EvalMuse',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='EvalMuse Text-to-Image Benchmark'
+        description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+        'and semantic alignment of finely generated images',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['EvalMuse'],
         metric_list=['FGA_BLIP2Score'],
@@ -4,7 +4,6 @@ import os
 from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.messages import ChatMessageUser
-from evalscope.api.metric.scorer import Score
 from evalscope.api.registry import get_metric, register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -15,8 +14,9 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='genai_bench',
+        pretty_name='GenAI-Bench',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='GenAI-Bench Text-to-Image Benchmark',
+        description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['GenAI-Bench-1600'],
         metric_list=['VQAScore'],
@@ -16,7 +16,7 @@ logger = get_logger()
         name='general_t2i',
         dataset_id='general_t2i',
         description='General Text-to-Image Benchmark',
-        tags=[Tags.TEXT_TO_IMAGE],
+        tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
         subset_list=['default'],
         metric_list=['PickScore'],
         few_shot_num=0,
@@ -14,8 +14,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='hpdv2',
+        pretty_name='HPD-v2',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='HPDv2 Text-to-Image Benchmark'
+        description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+        'trained on the Human Preference Dataset (HPD v2)',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['HPDv2'],
         metric_list=['HPSv2.1Score'],
@@ -41,7 +43,10 @@ class HPDv2Adapter(Text2ImageAdapter):
         return Sample(
             input=[ChatMessageUser(content=record['prompt'])],
             metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
                 'category': record.get('tags', {}).get('category', ''),
-                'tags': record.get('tags', {})
+                'tags': record.get('tags', {}),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
             }
         )
@@ -37,6 +37,7 @@ TRUTHFUL_QA_PROMPT = (
         dataset_id='evalscope/truthful_qa',
         metric_list=['multi_choice_acc'],
         subset_list=['multiple_choice'],
+        shuffle_choices=True,
         few_shot_num=0,
         train_split=None,
         eval_split='validation',
@@ -55,8 +56,6 @@ class TruthfulQaAdapter(MultiChoiceAdapter):
 
         super().__init__(**kwargs)
 
-        self.shuffle_choices = True
-
         self.multiple_correct = self.extra_params.get('multiple_correct', False)
         if self.multiple_correct:
             self.prompt_template = MultipleChoiceTemplate.MULTIPLE_ANSWER
evalscope/cli/start_app.py
CHANGED
@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
 
         create_app(self.args)
evalscope/cli/start_perf.py
CHANGED
@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
 
         run_perf_benchmark(self.args)