evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of evalscope might be problematic.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/benchmarks/mmmu/mmmu_adapter.py (new file)

@@ -0,0 +1,159 @@

```python
import ast
import re
from typing import Any, Dict, List

from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.io_utils import bytes_to_base64
from evalscope.utils.logger import get_logger
from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt

# flake8: noqa

logger = get_logger()

SUBSET_LIST = [
    'Accounting',
    'Agriculture',
    'Architecture_and_Engineering',
    'Art',
    'Art_Theory',
    'Basic_Medical_Science',
    'Biology',
    'Chemistry',
    'Clinical_Medicine',
    'Computer_Science',
    'Design',
    'Diagnostics_and_Laboratory_Medicine',
    'Economics',
    'Electronics',
    'Energy_and_Power',
    'Finance',
    'Geography',
    'History',
    'Literature',
    'Manage',
    'Marketing',
    'Materials',
    'Math',
    'Mechanical_Engineering',
    'Music',
    'Pharmacy',
    'Physics',
    'Psychology',
    'Public_Health',
    'Sociology',
]

MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT

OPEN_PROMPT = """
Solve the following problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.

{question}

Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
"""

MULTI_CHOICE_TYPE = 'multiple-choice'
OPEN_TYPE = 'open'


@register_benchmark(
    BenchmarkMeta(
        name='mmmu',
        pretty_name='MMMU',
        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
        description=
        'MMMU (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI) benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering. These questions span 30 subjects and 183 subfields, comprising 30 highly heterogeneous image types, such as charts, diagrams, maps, tables, music sheets, and chemical structures.',  # noqa: E501
        dataset_id='AI-ModelScope/MMMU',
        subset_list=SUBSET_LIST,
        metric_list=['acc'],
        eval_split='validation',
        prompt_template=OPEN_PROMPT,
    )
)
class MMMUAdapter(VisionLanguageAdapter):
    MAX_IMAGES: int = 7

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        question_type = record['question_type']
        content_list, answers_list = MMMUAdapter.create_content_and_answers_list(record)

        metadata = {
            'id': record['id'],
            'question_type': record['question_type'],
            'subfield': record['subfield'],
            'explanation': record['explanation'],
            'img_type': record['img_type'],
            'topic_difficulty': record['topic_difficulty'],
        }

        if question_type == MULTI_CHOICE_TYPE:
            return Sample(
                input=[ChatMessageUser(content=content_list)],
                choices=answers_list,
                target=record['answer'],
                metadata=metadata,
            )
        elif question_type == OPEN_TYPE:
            return Sample(
                input=[ChatMessageUser(content=content_list)],
                target=record['answer'],
                metadata=metadata,
            )
        else:
            raise ValueError(f'Unsupported question type: {question_type}')

    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
        question_type = task_state.metadata['question_type']
        if question_type == MULTI_CHOICE_TYPE:
            answers = parse_answers(task_state)
            return ''.join(sorted(list(answers)))
        elif question_type == OPEN_TYPE:
            pattern = r'ANSWER:\s*(.*)'
            match = re.search(pattern, prediction)
            if match:
                return match.group(1).strip()
            return ''
        else:
            raise ValueError(f'Unsupported question type: {question_type}')

    @staticmethod
    def create_content_and_answers_list(record: Dict[str, Any]) -> tuple[List[Content], List[str]]:
        """
        Create a list of content elements and a list of answers from a record.

        Args:
            record (dict): The record containing question, images, and options.

        Returns:
            tuple: A tuple containing:
                - content_list (list): A list of content elements (text and images).
                - answers_list (list): A list of possible answers (for multiple-choice questions).
        """
        question_type = record['question_type']

        if question_type == MULTI_CHOICE_TYPE:
            answers_list: List[str] = ast.literal_eval(record['options'])
            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
            content_list: List[Content] = [ContentText(text=input_text)]
        else:
            answers_list: List[str] = []
            content_list: List[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]

        for i in range(MMMUAdapter.MAX_IMAGES):
            image = record[f'image_{i+1}']
            if image:
                image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
                content_list.append(ContentImage(image=image_base64))

        return content_list, answers_list
```
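Since the adapter registers the benchmark under the name `mmmu`, it can be selected like any other evalscope dataset once the package is installed. The sketch below is illustrative only: the model name, endpoint, and key are placeholders, and the `TaskConfig`/`run_task` fields are assumed to match the API documented for earlier evalscope releases.

```python
# Illustrative sketch (not part of the diff): run the newly added MMMU benchmark
# against an OpenAI-compatible endpoint. All values below are placeholders.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-vl-7b-instruct',      # placeholder vision-language model
    api_url='http://localhost:8801/v1',  # placeholder OpenAI-compatible endpoint
    api_key='EMPTY',
    datasets=['mmmu'],                   # name registered by @register_benchmark above
    dataset_args={'mmmu': {'subset_list': ['Math', 'Physics']}},  # optional subset filter
    limit=5,                             # small smoke-test run
)

run_task(task_cfg=task_cfg)
```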
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py (new file)

@@ -0,0 +1,124 @@

```python
import ast
from typing import Any, Dict, List

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
from evalscope.api.dataset import Sample
from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.io_utils import bytes_to_base64
from evalscope.utils.logger import get_logger
from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, prompt

logger = get_logger()

SUBSET_LIST = [
    'Accounting',
    'Agriculture',
    'Architecture_and_Engineering',
    'Art',
    'Art_Theory',
    'Basic_Medical_Science',
    'Biology',
    'Chemistry',
    'Clinical_Medicine',
    'Computer_Science',
    'Design',
    'Diagnostics_and_Laboratory_Medicine',
    'Economics',
    'Electronics',
    'Energy_and_Power',
    'Finance',
    'Geography',
    'History',
    'Literature',
    'Manage',
    'Marketing',
    'Materials',
    'Math',
    'Mechanical_Engineering',
    'Music',
    'Pharmacy',
    'Physics',
    'Psychology',
    'Public_Health',
    'Sociology',
]

MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT

VISION_PROMPT = r"""
Answer the following multiple choice question in image. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
""".strip()  # noqa: E501

DATASET_FORMATS = ['standard (4 options)', 'standard (10 options)', 'vision']


@register_benchmark(
    BenchmarkMeta(
        name='mmmu_pro',
        pretty_name='MMMU-PRO',
        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
        description=
        'MMMU-Pro is an enhanced multimodal benchmark designed to rigorously assess the true understanding capabilities of advanced AI models across multiple modalities. It builds upon the original MMMU benchmark by introducing several key improvements that make it more challenging and realistic, ensuring that models are evaluated on their genuine ability to integrate and comprehend both visual and textual information.',  # noqa: E501
        dataset_id='AI-ModelScope/MMMU_Pro',
        subset_list=SUBSET_LIST,
        metric_list=['acc'],
        eval_split='test',
        prompt_template=MULT_CHOICE_PROMPT,
        extra_params={
            'dataset_format': f"# choose from {DATASET_FORMATS}, default 'standard (4 options)'",
        }
    )
)
class MMMUPROAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
    MAX_IMAGES: int = 7

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.reformat_subset = True
        self.dataset_format = self.extra_params.get('dataset_format', 'standard (4 options)')
        if self.dataset_format not in DATASET_FORMATS:
            logger.warning(f"Invalid dataset_format '{self.dataset_format}', fallback to 'standard (4 options)'")
            self.dataset_format = 'standard (4 options)'
        self.default_subset = self.dataset_format

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:

        metadata = {
            'id': record['id'],
            'explanation': record.get('explanation'),
            'img_type': record.get('img_type'),
            'topic_difficulty': record.get('topic_difficulty'),
            'subject': record.get('subject')
        }

        answers_list: List[str] = ast.literal_eval(record['options'])

        if self.dataset_format == 'vision':
            letters = ','.join(answer_character(i) for i in range(len(answers_list)))
            input_text = VISION_PROMPT.format(letters=letters)
            content_list: List[Content] = [ContentText(text=input_text)]

            image = record.get('image')
            if image:
                content_list.append(ContentImage(image=bytes_to_base64(image['bytes'], format='png', add_header=True)))
        else:
            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
            content_list: List[Content] = [ContentText(text=input_text)]

            for i in range(MMMUPROAdapter.MAX_IMAGES):
                image = record.get(f'image_{i+1}')
                if image:
                    image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
                    content_list.append(ContentImage(image=image_base64))

        return Sample(
            input=[ChatMessageUser(content=content_list)],
            choices=answers_list,
            target=record['answer'],
            subset_key=record['subject'],
            metadata=metadata,
        )
```
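The MMMU-Pro adapter reads `dataset_format` from `extra_params` and falls back to 'standard (4 options)' for unknown values, so the vision split can be chosen per run. The following is a hedged sketch: the way `extra_params` is threaded through `dataset_args` is an assumption based on how other evalscope benchmarks expose adapter options, and the model and endpoint are placeholders.

```python
# Illustrative sketch (not part of the diff): evaluate MMMU-Pro in its 'vision'
# format, where the question and options are rendered inside the image itself.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='gpt-4o-mini',                   # placeholder multimodal model
    api_url='https://api.example.com/v1',  # placeholder endpoint
    api_key='YOUR_API_KEY',
    datasets=['mmmu_pro'],
    dataset_args={
        'mmmu_pro': {
            # Assumed plumbing: forwarded to MMMUPROAdapter.extra_params, which
            # validates the value against DATASET_FORMATS in __init__.
            'extra_params': {'dataset_format': 'vision'},
        }
    },
    limit=10,
)

run_task(task_cfg=task_cfg)
```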