PyPI - evalscope - Versions diffs - 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl - Mend

evalscope 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic. Click here for more details.

Files changed (87) hide show

evalscope/api/benchmark/adapters/default_data_adapter.py +6 -4
evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
evalscope/api/benchmark/benchmark.py +27 -2
evalscope/api/benchmark/meta.py +3 -0
evalscope/api/evaluator/evaluator.py +5 -0
evalscope/api/evaluator/state.py +5 -0
evalscope/api/messages/chat_message.py +6 -1
evalscope/api/mixin/__init__.py +1 -0
evalscope/api/mixin/llm_judge_mixin.py +2 -0
evalscope/api/mixin/sandbox_mixin.py +204 -0
evalscope/api/model/generate_config.py +0 -3
evalscope/api/model/model.py +1 -1
evalscope/api/tool/tool_info.py +1 -1
evalscope/arguments.py +6 -0
evalscope/benchmarks/ai2d/__init__.py +0 -0
evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
evalscope/benchmarks/amc/__init__.py +0 -0
evalscope/benchmarks/amc/amc_adapter.py +46 -0
evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
evalscope/benchmarks/bfcl/bfcl_adapter.py +141 -2
evalscope/benchmarks/bfcl/generation.py +7 -7
evalscope/benchmarks/drop/drop_adapter.py +1 -1
evalscope/benchmarks/healthbench/__init__.py +0 -0
evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
evalscope/benchmarks/healthbench/utils.py +102 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
evalscope/benchmarks/humaneval/utils.py +235 -0
evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
evalscope/benchmarks/minerva_math/__init__.py +0 -0
evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
evalscope/benchmarks/mm_bench/__init__.py +0 -0
evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
evalscope/benchmarks/mm_star/__init__.py +0 -0
evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
evalscope/benchmarks/multi_if/__init__.py +0 -0
evalscope/benchmarks/multi_if/ifeval.py +3354 -0
evalscope/benchmarks/multi_if/metrics.py +120 -0
evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
evalscope/benchmarks/olympiad_bench/utils.py +565 -0
evalscope/benchmarks/omni_bench/__init__.py +0 -0
evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
evalscope/benchmarks/real_world_qa/__init__.py +0 -0
evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
evalscope/config.py +24 -1
evalscope/constants.py +3 -0
evalscope/evaluator/evaluator.py +25 -7
evalscope/metrics/metric.py +27 -2
evalscope/models/model_apis.py +10 -8
evalscope/models/utils/openai.py +1 -2
evalscope/perf/arguments.py +2 -0
evalscope/perf/plugin/api/base.py +2 -2
evalscope/perf/plugin/api/default_api.py +7 -7
evalscope/perf/plugin/api/openai_api.py +83 -19
evalscope/perf/plugin/datasets/flickr8k.py +2 -2
evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
evalscope/perf/utils/benchmark_util.py +1 -2
evalscope/report/combinator.py +0 -25
evalscope/report/report.py +8 -4
evalscope/run.py +1 -1
evalscope/utils/function_utils.py +41 -0
evalscope/utils/import_utils.py +63 -13
evalscope/utils/io_utils.py +19 -11
evalscope/utils/json_schema.py +23 -2
evalscope/utils/logger.py +19 -0
evalscope/utils/model_utils.py +1 -1
evalscope/version.py +2 -2
{evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/METADATA +6 -10
{evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/RECORD +87 -59
tests/benchmark/test_eval.py +51 -7
tests/benchmark/test_sandbox.py +81 -0
tests/benchmark/test_vlm.py +60 -3
tests/perf/test_perf.py +40 -12
{evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
{evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
{evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
{evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py ADDED Viewed

@@ -0,0 +1,220 @@
+import json
+from typing import TYPE_CHECKING, Dict, List, Tuple
+from evalscope.utils.logger import get_logger
+if TYPE_CHECKING:
+    from evalscope.api.mixin.sandbox_mixin import SandboxMixin
+logger = get_logger()
+def evaluate_in_sandbox(
+    adapter: 'SandboxMixin',
+    code: str,
+    evaluation_sample: str,
+    timeout: int = 6,
+    debug: bool = False
+) -> Tuple[bool, Dict]:
+    """
+    Run a Live Code Bench submission against its test cases inside the sandbox.
+    The ``fn_name`` field of the sample selects the strategy: when it is set the
+    submission is invoked as a function/method, otherwise it is driven through
+    standard input and output.
+    Args:
+        adapter: The adapter instance with sandbox capabilities
+        code: Source code of the submission
+        evaluation_sample: JSON string with ``inputs``, ``outputs`` and optionally ``fn_name``
+        timeout: Execution timeout handed on to the strategy helpers
+        debug: Emit diagnostic log messages when True
+    Returns:
+        Tuple[bool, Dict]: (overall_pass, detailed_results). Any exception raised
+        here (for example malformed JSON) is reported as a failure carrying an
+        ``error`` entry and zero test counts.
+    """
+    try:
+        sample = json.loads(evaluation_sample)
+        inputs = sample.get('inputs', [])
+        outputs = sample.get('outputs', [])
+        fn_name = sample.get('fn_name')
+        if debug:
+            logger.info(f'Evaluating code with {len(inputs)} test cases')
+            logger.info(f'Function name: {fn_name}')
+        # No function name means the problem is solved through stdin/stdout.
+        if not fn_name:
+            return _evaluate_stdio_in_sandbox(adapter, code, inputs, outputs, timeout, debug)
+        return _evaluate_call_based_in_sandbox(adapter, code, inputs, outputs, fn_name, timeout, debug)
+    except Exception as exc:
+        message = str(exc)
+        if debug:
+            logger.error(f'Sandbox evaluation error: {message}')
+        return False, {'error': message, 'total_tests': 0, 'passed_tests': 0}
+def _evaluate_call_based_in_sandbox(
+    adapter: 'SandboxMixin', code: str, inputs: list, outputs: list, fn_name: str, timeout: int, debug: bool
+) -> Tuple[bool, Dict]:
+    """Evaluate call-based problems in sandbox.
+    Every (input, expected output) pair is turned into its own script that
+    contains the submission followed by a small harness. The harness calls
+    ``fn_name`` (looked up on ``Solution()`` when the submission defines a
+    ``Solution`` class) and prints ``TEST_PASSED``, ``TEST_FAILED: ...`` or
+    ``EXECUTION_ERROR: ...``. Evaluation stops at the first case that does not
+    pass.
+    Args:
+        adapter: The adapter instance with sandbox capabilities
+        code: Source code of the submission
+        inputs: Test inputs; a JSON string is decoded, a list is unpacked as positional arguments
+        outputs: Expected results, paired with ``inputs`` by position
+        fn_name: Name of the function or ``Solution`` method to call
+        timeout: Timeout passed to each sandbox execution
+        debug: Emit diagnostic log messages when True
+    Returns:
+        Tuple[bool, Dict]: (all_passed, details) where details holds
+        ``total_tests``, ``passed_tests`` and ``failed_cases``, or ``error``
+        when the evaluation itself raised.
+    """
+    try:
+        all_passed = True
+        passed_count = 0
+        failed_cases = []
+        # Decide the calling convention here rather than inside the generated
+        # script: pasting the submission into a quoted literal there yields an
+        # unparsable script as soon as the code contains ''' or ends with a
+        # quote or backslash.
+        uses_solution_class = 'class Solution' in code
+        # NOTE(review): zip() stops at the shorter of inputs/outputs while
+        # total_tests reports len(inputs) - confirm the lengths always match.
+        for i, (test_input, expected_output) in enumerate(zip(inputs, outputs)):
+            # Prepare individual test code for each test case
+            test_code = f"""
+import json
+import sys
+# User's code
+{code}
+# Test execution for single test case
+try:
+    test_input = {repr(test_input)}
+    expected_output = {repr(expected_output)}
+    if {uses_solution_class!r}:
+        # LeetCode style
+        solution = Solution()
+        method = getattr(solution, {fn_name!r})
+    else:
+        # Function is directly available
+        method = {fn_name}
+    # Parse input if it's JSON string
+    if isinstance(test_input, str):
+        try:
+            test_input = json.loads(test_input)
+        except Exception:
+            pass  # Keep as string if not valid JSON
+    # Call the method
+    if isinstance(test_input, list):
+        result = method(*test_input)
+    else:
+        result = method(test_input)
+    # Parse expected output if it's JSON string
+    if isinstance(expected_output, str):
+        try:
+            expected_output = json.loads(expected_output)
+        except Exception:
+            pass  # Keep as string if not valid JSON
+    # Convert tuple to list for comparison
+    if isinstance(result, tuple):
+        result = list(result)
+    if result == expected_output:
+        print("TEST_PASSED")
+    else:
+        print(f"TEST_FAILED: expected {{expected_output}}, got {{result}}")
+except Exception as e:
+    print(f"EXECUTION_ERROR: {{str(e)}}")
+    import traceback
+    traceback.print_exc()
+"""
+            # Execute in sandbox
+            result = adapter.execute_code_in_sandbox(code=test_code, timeout=timeout, language='python')
+            if debug:
+                logger.info(f'Test case {i} execution result: {result}')
+            # Check if execution was successful and test passed
+            if result.get('status') == 'success':
+                output = result.get('output', '')
+                if 'TEST_PASSED' in output:
+                    passed_count += 1
+                elif 'TEST_FAILED:' in output:
+                    # Extract failure details from output
+                    for line in output.split('\n'):
+                        if line.startswith('TEST_FAILED:'):
+                            failed_cases.append(f"Test {i}: {line.replace('TEST_FAILED: ', '')}")
+                            break
+                    all_passed = False
+                    break
+                elif 'EXECUTION_ERROR:' in output:
+                    # Extract error details
+                    for line in output.split('\n'):
+                        if line.startswith('EXECUTION_ERROR:'):
+                            failed_cases.append(f'Test {i}: {line}')
+                            break
+                    all_passed = False
+                    break
+                else:
+                    failed_cases.append(f'Test {i}: Unknown error in output. Result: {result}')
+                    all_passed = False
+                    break
+            else:
+                failed_cases.append(f'Test {i}: Sandbox execution failed - Result: {result}')
+                all_passed = False
+                break
+        detailed_results = {'total_tests': len(inputs), 'passed_tests': passed_count, 'failed_cases': failed_cases}
+        return all_passed, detailed_results
+    except Exception as e:
+        if debug:
+            logger.error(f'Call-based evaluation error: {str(e)}')
+        return False, {'error': str(e), 'total_tests': len(inputs), 'passed_tests': 0}
+def _evaluate_stdio_in_sandbox(
+    adapter: 'SandboxMixin', code: str, inputs: list, outputs: list, timeout: int, debug: bool
+) -> Tuple[bool, Dict]:
+    """Evaluate stdio-based problems in sandbox.
+    For every test case a script is generated that replaces ``sys.stdin`` with
+    the test input and then runs the submission; the captured output is
+    compared with the expected output after stripping surrounding whitespace.
+    Evaluation stops at the first case that does not pass.
+    Args:
+        adapter: The adapter instance with sandbox capabilities
+        code: Source code of the submission
+        inputs: Text fed to the submission on stdin, one entry per test case
+        outputs: Expected stdout text, paired with ``inputs`` by position
+        timeout: Timeout passed to each sandbox execution
+        debug: Emit diagnostic log messages when True
+    Returns:
+        Tuple[bool, Dict]: (all_passed, details) where details holds
+        ``total_tests``, ``passed_tests`` and ``failed_cases``, or ``error``
+        when the evaluation itself raised.
+    """
+    try:
+        all_passed = True
+        passed_count = 0
+        failed_cases = []
+        for i, (test_input, expected_output) in enumerate(zip(inputs, outputs)):
+            # Embed the input as a repr() literal: wrapping it in ''' quotes
+            # breaks on inputs containing ''' and re-interprets backslashes.
+            test_code = f"""
+import sys
+from io import StringIO
+# Redirect stdin
+sys.stdin = StringIO({str(test_input)!r})
+# User's code
+{code}
+"""
+            # Execute in sandbox
+            result = adapter.execute_code_in_sandbox(code=test_code, timeout=timeout, language='python')
+            if result.get('status') != 'success':
+                if debug:
+                    logger.error(f'Test case {i} execution failed: {result}')
+                failed_cases.append(f'Test {i}: Execution error - Result: {result}')
+                all_passed = False
+                break
+            # Compare output; str() keeps a non-string expected value from raising on .strip()
+            actual_output = result.get('output', '').strip()
+            expected_text = str(expected_output).strip()
+            if actual_output == expected_text:
+                passed_count += 1
+            else:
+                if debug:
+                    logger.info(f"Test case {i} failed: expected '{expected_text}', got '{actual_output}'")
+                failed_cases.append(f"Test {i}: Expected '{expected_text}', got '{actual_output}'")
+                all_passed = False
+                break
+        detailed_results = {'total_tests': len(inputs), 'passed_tests': passed_count, 'failed_cases': failed_cases}
+        return all_passed, detailed_results
+    except Exception as e:
+        if debug:
+            logger.error(f'Stdio evaluation error: {str(e)}')
+        return False, {'error': str(e), 'total_tests': len(inputs), 'passed_tests': 0}

evalscope/benchmarks/math_500/math_500_adapter.py CHANGED Viewed

@@ -4,7 +4,6 @@ from typing import Any, Dict
 from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
 from evalscope.api.dataset import Sample
-from evalscope.api.evaluator import TaskState
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

evalscope/benchmarks/minerva_math/__init__.py ADDED Viewed

File without changes

evalscope/benchmarks/minerva_math/minerva_math_adapter.py ADDED Viewed

@@ -0,0 +1,48 @@
+from typing import Any, Dict
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+logger = get_logger()
+@register_benchmark(
+    BenchmarkMeta(
+        name='minerva_math',
+        pretty_name='Minerva-Math',
+        tags=[Tags.MATH, Tags.REASONING],
+        description='Minerva-math is a benchmark designed to evaluate the mathematical and quantitative '
+        'reasoning capabilities of LLMs. It consists of **272 problems** '
+        'sourced primarily from **MIT OpenCourseWare** '
+        'courses, covering advanced STEM subjects such as solid-state chemistry, astronomy, differential '
+        'equations, and special relativity at the **university and graduate level**.',
+        dataset_id='knoveleng/Minerva-Math',
+        subset_list=['default'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        eval_split='train',
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
+)
+class MinervaMathAdapter(DefaultDataAdapter):
+    """Data adapter registered under the name ``minerva_math``."""
+    def __init__(self, **kwargs):
+        """Forward all keyword arguments to the base adapter and switch on ``_use_llm_judge``."""
+        super().__init__(**kwargs)
+        self._use_llm_judge = True
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Build a ``Sample`` from one dataset record.
+        The record must provide ``problem``, ``solution``, ``type`` and ``idx``;
+        the first two become input and target, the last two are kept as metadata.
+        """
+        problem = record['problem']
+        solution = record['solution']
+        extra = {field: record[field] for field in ('type', 'idx')}
+        return Sample(input=problem, target=solution, metadata=extra)

evalscope/benchmarks/mm_bench/__init__.py ADDED Viewed

File without changes

evalscope/benchmarks/mm_bench/mm_bench_adapter.py ADDED Viewed

@@ -0,0 +1,99 @@
+from typing import Any, Dict, List
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, prompt
+logger = get_logger()
+# Prompt template shared by the CCBench and MMBench adapters below: it is registered
+# as their ``prompt_template`` and used to render each question with its choices.
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+@register_benchmark(
+    BenchmarkMeta(
+        name='cc_bench',
+        pretty_name='CCBench',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'CCBench is an extension of MMBench with newly design questions about Chinese traditional culture, including Calligraphy Painting, Cultural Relic, Food & Clothes, Historical Figures, Scenery & Building, Sketch Reasoning and Traditional Show.',  # noqa: E501
+        dataset_id='lmms-lab/MMBench',
+        subset_list=['cc'],
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class CCBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+    """Multiple-choice vision-language adapter registered under the name ``cc_bench``."""
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Build a ``Sample`` from one dataset record.
+        Options are read from the ``A``-``D`` fields (missing ones become empty
+        strings), the question is rendered with the shared prompt template, and
+        the image, when present, is attached as a base64 JPEG.
+        """
+        option_texts: List[str] = [record.get(letter, '') for letter in ('A', 'B', 'C', 'D')]
+        rendered = prompt(question=record['question'], choices=option_texts, template=MULT_CHOICE_PROMPT)
+        parts: List[Content] = [ContentText(text=rendered)]
+        picture = record.get('image')
+        if picture:
+            encoded = bytes_to_base64(picture['bytes'], format='jpeg', add_header=True)
+            parts.append(ContentImage(image=encoded))
+        extra = {field: record.get(field) for field in ('index', 'category', 'source')}
+        return Sample(
+            input=[ChatMessageUser(content=parts)],
+            choices=option_texts,
+            target=record.get('answer'),
+            metadata=extra,
+        )
+@register_benchmark(
+    BenchmarkMeta(
+        name='mm_bench',
+        pretty_name='MMBench',
+        # Tagged as multiple choice like the other MultiChoiceAdapter-based benchmarks.
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'MMBench is a comprehensive evaluation pipeline comprised of meticulously curated multimodal dataset and a novel circulareval strategy using ChatGPT. It is comprised of 20 ability dimensions defined by MMBench. It also contains chinese version with translated question.',  # noqa: E501
+        dataset_id='lmms-lab/MMBench',
+        subset_list=['cn', 'en'],
+        metric_list=['acc'],
+        eval_split='dev',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class MMBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+    """Multiple-choice vision-language adapter registered under the name ``mm_bench``."""
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Build a ``Sample`` from one dataset record.
+        Options come from the ``A``-``D`` fields; empty and ``'nan'`` entries are
+        dropped. A usable ``hint`` is placed on its own line before the question,
+        and the image, when present, is attached as a base64 JPEG.
+        """
+        raw_options = [record.get(letter, '') for letter in ('A', 'B', 'C', 'D')]
+        # str() keeps None / float('nan') placeholders from raising on .strip().
+        # NOTE(review): dropping an option in the middle would shift the letters of
+        # the remaining ones - presumably only trailing options are ever missing; confirm.
+        answers_list: List[str] = [
+            ans for ans in raw_options if ans is not None and str(ans).strip() not in ('', 'nan')
+        ]
+        question = record['question']
+        hint = record.get('hint')
+        # Skip a missing hint (same 'nan' placeholder as the options) instead of
+        # gluing it onto the question, and separate a real hint from the question.
+        if hint is not None and str(hint).strip() not in ('', 'nan'):
+            question_hint = f'{hint}\n{question}'
+        else:
+            question_hint = question
+        input_text = prompt(question=question_hint, choices=answers_list, template=MULT_CHOICE_PROMPT)
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        label_answer = record.get('answer')
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            choices=answers_list,
+            target=label_answer,
+            metadata={
+                'index': record.get('index'),
+                'category': record.get('category'),
+                'source': record.get('source'),
+                'L2-category': record.get('L2-category'),
+                'comment': record.get('comment'),
+                'split': record.get('split')
+            }
+        )

evalscope/benchmarks/mm_star/__init__.py ADDED Viewed

File without changes

evalscope/benchmarks/mm_star/mm_star_adapter.py ADDED Viewed

@@ -0,0 +1,73 @@
+import re
+from typing import Any, Dict, List
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+logger = get_logger()
+# Prompt text for MMStar questions; the ``{question}`` placeholder is filled in per
+# record with ``str.format``. The surrounding newlines are removed by ``.strip()``.
+MULT_CHOICE_PROMPT = r"""
+Answer the following multiple choice question.
+The last line of your response should be of the following format:
+'ANSWER: $LETTER' (without quotes)
+where LETTER is one of A,B,C,D. Think step by step before answering.
+{question}
+""".strip()
+# Names passed to the benchmark registration as ``subset_list``.
+# NOTE(review): these look like the values of the records' ``category`` field - confirm.
+SUBSET_LIST = [
+    'coarse perception', 'fine-grained perception', 'instance reasoning', 'logical reasoning', 'math',
+    'science & technology'
+]
+@register_benchmark(
+    BenchmarkMeta(
+        name='mm_star',
+        pretty_name='MMStar',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'MMStar: an elite vision-indispensible multi-modal benchmark, aiming to ensure each curated sample exhibits visual dependency, minimal data leakage, and requires advanced multi-modal capabilities.',  # noqa: E501
+        dataset_id='evalscope/MMStar',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        default_subset='val',
+        eval_split='val',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class MMStarAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+    """Multiple-choice vision-language adapter registered under the name ``mm_star``."""
+    def __init__(self, **kwargs):
+        """Forward all keyword arguments to the base adapters and set ``reformat_subset``."""
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Build a ``Sample`` from one dataset record.
+        The question is inserted into the module prompt, the image (when present)
+        is attached as a base64 JPEG, and the record's ``category`` is used as the
+        sample's ``subset_key``.
+        """
+        rendered = MULT_CHOICE_PROMPT.format(question=record['question'])
+        parts: List[Content] = [ContentText(text=rendered)]
+        picture = record.get('image')
+        if picture:
+            encoded = bytes_to_base64(picture['bytes'], format='jpeg', add_header=True)
+            parts.append(ContentImage(image=encoded))
+        category = record.get('category')
+        # Looked up once; a record without ``meta_info`` yields None for the three fields.
+        info = record.get('meta_info', {})
+        extra = {
+            'index': record.get('index'),
+            'category': category,
+            'l2_category': record.get('l2_category'),
+            'source': info.get('source'),
+            'split': info.get('split'),
+            'image_path': info.get('image_path'),
+        }
+        return Sample(
+            input=[ChatMessageUser(content=parts)],
+            choices=['A', 'B', 'C', 'D'],
+            target=record.get('answer'),
+            subset_key=category,
+            metadata=extra,
+        )

evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py CHANGED Viewed

@@ -1,15 +1,14 @@
 import ast
 from typing import Any, Dict, List
-from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
 from evalscope.api.dataset import Sample
-from evalscope.api.evaluator import TaskState
 from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.io_utils import bytes_to_base64
 from evalscope.utils.logger import get_logger
-from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, parse_answers, prompt
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, prompt
 logger = get_logger()
@@ -60,7 +59,7 @@ DATASET_FORMATS = ['standard (4 options)', 'standard (10 options)', 'vision']
     BenchmarkMeta(
         name='mmmu_pro',
         pretty_name='MMMU-PRO',
-        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
         description=
         'MMMU-Pro is an enhanced multimodal benchmark designed to rigorously assess the true understanding capabilities of advanced AI models across multiple modalities. It builds upon the original MMMU benchmark by introducing several key improvements that make it more challenging and realistic, ensuring that models are evaluated on their genuine ability to integrate and comprehend both visual and textual information.',  # noqa: E501
         dataset_id='AI-ModelScope/MMMU_Pro',
@@ -73,7 +72,7 @@ DATASET_FORMATS = ['standard (4 options)', 'standard (10 options)', 'vision']
         }
     )
 )
-class MMMUPROAdapter(VisionLanguageAdapter):
+class MMMUPROAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
     MAX_IMAGES: int = 7
     def __init__(self, *args, **kwargs):
@@ -123,7 +122,3 @@ class MMMUPROAdapter(VisionLanguageAdapter):
             subset_key=record['subject'],
             metadata=metadata,
         )
-    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
-        answers = parse_answers(task_state)
-        return ''.join(sorted(list(answers)))

evalscope/benchmarks/multi_if/__init__.py ADDED Viewed

File without changes

evalscope 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

Potentially problematic release.

evalscope 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl