evalscope 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (87) hide show
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +6 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/arguments.py +6 -0
  17. evalscope/benchmarks/ai2d/__init__.py +0 -0
  18. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  19. evalscope/benchmarks/amc/__init__.py +0 -0
  20. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  21. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  22. evalscope/benchmarks/bfcl/bfcl_adapter.py +141 -2
  23. evalscope/benchmarks/bfcl/generation.py +7 -7
  24. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  25. evalscope/benchmarks/healthbench/__init__.py +0 -0
  26. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  27. evalscope/benchmarks/healthbench/utils.py +102 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  29. evalscope/benchmarks/humaneval/utils.py +235 -0
  30. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  32. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  34. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  35. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  36. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  37. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  38. evalscope/benchmarks/mm_star/__init__.py +0 -0
  39. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  40. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  41. evalscope/benchmarks/multi_if/__init__.py +0 -0
  42. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  43. evalscope/benchmarks/multi_if/metrics.py +120 -0
  44. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  45. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  46. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  47. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  48. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  49. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  50. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  51. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  52. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  53. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  54. evalscope/config.py +24 -1
  55. evalscope/constants.py +3 -0
  56. evalscope/evaluator/evaluator.py +25 -7
  57. evalscope/metrics/metric.py +27 -2
  58. evalscope/models/model_apis.py +10 -8
  59. evalscope/models/utils/openai.py +1 -2
  60. evalscope/perf/arguments.py +2 -0
  61. evalscope/perf/plugin/api/base.py +2 -2
  62. evalscope/perf/plugin/api/default_api.py +7 -7
  63. evalscope/perf/plugin/api/openai_api.py +83 -19
  64. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  65. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  66. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  67. evalscope/perf/utils/benchmark_util.py +1 -2
  68. evalscope/report/combinator.py +0 -25
  69. evalscope/report/report.py +8 -4
  70. evalscope/run.py +1 -1
  71. evalscope/utils/function_utils.py +41 -0
  72. evalscope/utils/import_utils.py +63 -13
  73. evalscope/utils/io_utils.py +19 -11
  74. evalscope/utils/json_schema.py +23 -2
  75. evalscope/utils/logger.py +19 -0
  76. evalscope/utils/model_utils.py +1 -1
  77. evalscope/version.py +2 -2
  78. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/METADATA +6 -10
  79. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/RECORD +87 -59
  80. tests/benchmark/test_eval.py +51 -7
  81. tests/benchmark/test_sandbox.py +81 -0
  82. tests/benchmark/test_vlm.py +60 -3
  83. tests/perf/test_perf.py +40 -12
  84. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  85. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  86. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  87. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,220 @@
1
+ import json
2
+ from typing import TYPE_CHECKING, Dict, List, Tuple
3
+
4
+ from evalscope.utils.logger import get_logger
5
+
6
+ if TYPE_CHECKING:
7
+ from evalscope.api.mixin.sandbox_mixin import SandboxMixin
8
+
9
+ logger = get_logger()
10
+
11
+
12
def evaluate_in_sandbox(
    adapter: 'SandboxMixin',
    code: str,
    evaluation_sample: str,
    timeout: int = 6,
    debug: bool = False
) -> Tuple[bool, Dict]:
    """
    Run a Live Code Bench submission against its test cases inside the sandbox.

    The evaluation sample is decoded from JSON and routed to the call-based
    evaluator when it carries a truthy ``fn_name``; otherwise the stdin/stdout
    evaluator is used.

    Args:
        adapter: Object exposing the sandbox execution capability.
        code: Source code of the submission under test.
        evaluation_sample: JSON document with ``inputs``, ``outputs`` and an
            optional ``fn_name`` entry.
        timeout: Timeout forwarded to the sandbox for each execution.
        debug: When true, progress and errors are written to the logger.

    Returns:
        Tuple[bool, Dict]: ``(overall_pass, detailed_results)``. Any exception
        raised while decoding or dispatching is reported as a failure whose
        details contain ``error`` and zeroed test counters.
    """
    try:
        sample = json.loads(evaluation_sample)
        case_inputs = sample.get('inputs', [])
        case_outputs = sample.get('outputs', [])
        entry_point = sample.get('fn_name')

        if debug:
            logger.info(f'Evaluating code with {len(case_inputs)} test cases')
            logger.info(f'Function name: {entry_point}')

        # No function name means the program talks over stdin/stdout.
        if not entry_point:
            return _evaluate_stdio_in_sandbox(adapter, code, case_inputs, case_outputs, timeout, debug)

        return _evaluate_call_based_in_sandbox(
            adapter, code, case_inputs, case_outputs, entry_point, timeout, debug
        )

    except Exception as exc:
        message = str(exc)
        if debug:
            logger.error(f'Sandbox evaluation error: {message}')
        return False, {'error': message, 'total_tests': 0, 'passed_tests': 0}
55
+
56
+
57
def _build_call_based_test_code(code: str, test_input, expected_output, fn_name: str) -> str:
    """Render the Python harness that runs one call-based test case in the sandbox."""
    # Decide on the host whether the submission is LeetCode style. Embedding the
    # submission a second time inside a '''...''' literal (as was done before)
    # produced a SyntaxError whenever the submission itself contained ''' or an
    # invalid escape sequence, turning correct solutions into failures.
    if 'class Solution' in code:
        method_lookup = f'method = getattr(Solution(), {fn_name!r})  # LeetCode style'
    else:
        method_lookup = f'method = {fn_name}  # Function is directly available'

    return f"""
import json
import sys

# User's code
{code}

# Test execution for single test case
try:
    test_input = {test_input!r}
    expected_output = {expected_output!r}

    {method_lookup}

    # Parse input if it's JSON string
    if isinstance(test_input, str):
        try:
            test_input = json.loads(test_input)
        except Exception:
            pass  # Keep as string if not valid JSON

    # Call the method
    if isinstance(test_input, list):
        result = method(*test_input)
    else:
        result = method(test_input)

    # Parse expected output if it's JSON string
    if isinstance(expected_output, str):
        try:
            expected_output = json.loads(expected_output)
        except Exception:
            pass  # Keep as string if not valid JSON

    # Convert tuple to list for comparison
    if isinstance(result, tuple):
        result = list(result)

    if result == expected_output:
        print("TEST_PASSED")
    else:
        print(f"TEST_FAILED: expected {{expected_output}}, got {{result}}")

except Exception as e:
    print(f"EXECUTION_ERROR: {{str(e)}}")
    import traceback
    traceback.print_exc()
"""


def _evaluate_call_based_in_sandbox(
    adapter: 'SandboxMixin', code: str, inputs: list, outputs: list, fn_name: str, timeout: int, debug: bool
) -> Tuple[bool, Dict]:
    """
    Evaluate call-based problems in sandbox.

    Each ``(input, expected output)`` pair is executed as its own sandbox run.
    Evaluation stops at the first case that does not pass.

    Args:
        adapter: Object exposing ``execute_code_in_sandbox``.
        code: Source code of the submission.
        inputs: Test inputs; a list input is unpacked as positional arguments.
        outputs: Expected results, paired positionally with ``inputs``.
        fn_name: Name of the function (or ``Solution`` method) to call.
        timeout: Timeout forwarded to the sandbox for each execution.
        debug: When true, execution results and errors are logged.

    Returns:
        Tuple[bool, Dict]: ``(all_passed, details)`` where ``details`` holds
        ``total_tests``, ``passed_tests`` and ``failed_cases``. If an exception
        escapes the loop, ``details`` holds ``error`` instead of
        ``failed_cases``.
    """
    try:
        all_passed = True
        passed_count = 0
        failed_cases = []

        for i, (test_input, expected_output) in enumerate(zip(inputs, outputs)):
            test_code = _build_call_based_test_code(code, test_input, expected_output, fn_name)

            # Execute in sandbox
            result = adapter.execute_code_in_sandbox(code=test_code, timeout=timeout, language='python')

            if debug:
                logger.info(f'Test case {i} execution result: {result}')

            if result.get('status') != 'success':
                failed_cases.append(f'Test {i}: Sandbox execution failed - Result: {result}')
                all_passed = False
                break

            output = result.get('output', '')
            # NOTE(review): the verdict is a substring match on captured output, so a
            # submission that prints the marker itself would count as passing -
            # consider a per-run nonce if that matters for this benchmark.
            if 'TEST_PASSED' in output:
                passed_count += 1
                continue

            if 'TEST_FAILED:' in output:
                # Extract failure details from output
                for line in output.split('\n'):
                    if line.startswith('TEST_FAILED:'):
                        failed_cases.append(f"Test {i}: {line.replace('TEST_FAILED: ', '')}")
                        break
            elif 'EXECUTION_ERROR:' in output:
                # Extract error details
                for line in output.split('\n'):
                    if line.startswith('EXECUTION_ERROR:'):
                        failed_cases.append(f'Test {i}: {line}')
                        break
            else:
                failed_cases.append(f'Test {i}: Unknown error in output. Result: {result}')

            # Any non-passing verdict ends the evaluation early.
            all_passed = False
            break

        detailed_results = {'total_tests': len(inputs), 'passed_tests': passed_count, 'failed_cases': failed_cases}

        return all_passed, detailed_results

    except Exception as e:
        if debug:
            logger.error(f'Call-based evaluation error: {str(e)}')
        return False, {'error': str(e), 'total_tests': len(inputs), 'passed_tests': 0}
167
+
168
+
169
def _evaluate_stdio_in_sandbox(
    adapter: 'SandboxMixin', code: str, inputs: list, outputs: list, timeout: int, debug: bool
) -> Tuple[bool, Dict]:
    """
    Evaluate stdio-based problems in sandbox.

    For every test case the submission is executed with ``sys.stdin`` replaced
    by the test input, and the stripped output reported by the sandbox is
    compared with the stripped expected output. Evaluation stops at the first
    failing case.

    Args:
        adapter: Object exposing ``execute_code_in_sandbox``.
        code: Source code of the submission.
        inputs: Text fed to the program on stdin, one entry per test case.
        outputs: Expected stdout text, paired positionally with ``inputs``.
        timeout: Timeout forwarded to the sandbox for each execution.
        debug: When true, failures and errors are logged.

    Returns:
        Tuple[bool, Dict]: ``(all_passed, details)`` where ``details`` holds
        ``total_tests``, ``passed_tests`` and ``failed_cases``. If an exception
        escapes the loop, ``details`` holds ``error`` instead of
        ``failed_cases``.
    """
    try:
        all_passed = True
        passed_count = 0
        failed_cases = []

        for i, (test_input, expected_output) in enumerate(zip(inputs, outputs)):
            # repr() yields a literal that round-trips the input exactly. Pasting the
            # raw text between ''' quotes re-interpreted backslashes as escapes and
            # broke the harness whenever the input contained ''' itself.
            stdin_literal = repr(str(test_input))
            test_code = f"""
import sys
from io import StringIO

# Redirect stdin
sys.stdin = StringIO({stdin_literal})

# User's code
{code}
"""

            # Execute in sandbox
            result = adapter.execute_code_in_sandbox(code=test_code, timeout=timeout, language='python')

            if result.get('status') != 'success':
                if debug:
                    logger.error(f'Test case {i} execution failed: {result}')
                failed_cases.append(f'Test {i}: Execution error - Result: {result}')
                all_passed = False
                break

            # Compare output, ignoring leading/trailing whitespace on both sides
            actual_output = result.get('output', '').strip()
            expected_output = expected_output.strip()

            if actual_output == expected_output:
                passed_count += 1
            else:
                if debug:
                    logger.info(f"Test case {i} failed: expected '{expected_output}', got '{actual_output}'")
                failed_cases.append(f"Test {i}: Expected '{expected_output}', got '{actual_output}'")
                all_passed = False
                break

        detailed_results = {'total_tests': len(inputs), 'passed_tests': passed_count, 'failed_cases': failed_cases}

        return all_passed, detailed_results

    except Exception as e:
        if debug:
            logger.error(f'Stdio evaluation error: {str(e)}')
        return False, {'error': str(e), 'total_tests': len(inputs), 'passed_tests': 0}
@@ -4,7 +4,6 @@ from typing import Any, Dict
4
4
 
5
5
  from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
6
6
  from evalscope.api.dataset import Sample
7
- from evalscope.api.evaluator import TaskState
8
7
  from evalscope.api.registry import register_benchmark
9
8
  from evalscope.constants import Tags
10
9
  from evalscope.utils.logger import get_logger
File without changes
@@ -0,0 +1,48 @@
1
+ from typing import Any, Dict
2
+
3
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
4
+ from evalscope.api.dataset import Sample
5
+ from evalscope.api.registry import register_benchmark
6
+ from evalscope.constants import Tags
7
+ from evalscope.utils.logger import get_logger
8
+
9
+ logger = get_logger()
10
+
11
+
12
@register_benchmark(
    BenchmarkMeta(
        name='minerva_math',
        pretty_name='Minerva-Math',
        tags=[Tags.MATH, Tags.REASONING],
        description='Minerva-math is a benchmark designed to evaluate the mathematical and quantitative '
        'reasoning capabilities of LLMs. It consists of **272 problems** '
        'sourced primarily from **MIT OpenCourseWare** '
        'courses, covering advanced STEM subjects such as solid-state chemistry, astronomy, differential '
        'equations, and special relativity at the **university and graduate level**.',
        dataset_id='knoveleng/Minerva-Math',
        subset_list=['default'],
        metric_list=[{'acc': {'numeric': True}}],
        eval_split='train',
        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
    )
)
class MinervaMathAdapter(DefaultDataAdapter):
    """Data adapter registered for the ``minerva_math`` benchmark."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # NOTE(review): presumably switches scoring to the LLM judge - confirm
        # against the base adapter / LLM judge mixin.
        self._use_llm_judge = True

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """Convert one dataset record into a ``Sample``.

        The record's ``problem`` becomes the input, ``solution`` the target, and
        ``type`` / ``idx`` are carried along as metadata.
        """
        problem = record['problem']
        solution = record['solution']
        extra_fields = {
            'type': record['type'],
            'idx': record['idx'],
        }
        return Sample(input=problem, target=solution, metadata=extra_fields)
File without changes
@@ -0,0 +1,99 @@
1
+ from typing import Any, Dict, List
2
+
3
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
4
+ from evalscope.api.dataset import Sample
5
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
6
+ from evalscope.api.registry import register_benchmark
7
+ from evalscope.constants import Tags
8
+ from evalscope.utils.io_utils import bytes_to_base64
9
+ from evalscope.utils.logger import get_logger
10
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate, prompt
11
+
12
+ logger = get_logger()
13
+
14
+ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
15
+
16
+
17
@register_benchmark(
    BenchmarkMeta(
        name='cc_bench',
        pretty_name='CCBench',
        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
        description=
        'CCBench is an extension of MMBench with newly design questions about Chinese traditional culture, including Calligraphy Painting, Cultural Relic, Food & Clothes, Historical Figures, Scenery & Building, Sketch Reasoning and Traditional Show.',  # noqa: E501
        dataset_id='lmms-lab/MMBench',
        subset_list=['cc'],
        metric_list=['acc'],
        eval_split='test',
        prompt_template=MULT_CHOICE_PROMPT,
    )
)
class CCBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
    """Multiple-choice vision-language adapter registered for ``cc_bench``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """Convert one dataset record into a ``Sample``.

        The four options ``A``-``D`` (empty string when absent) are rendered
        into the multiple-choice prompt together with the question. When the
        record has an image, it is attached as a base64 JPEG after the text.
        """
        options: List[str] = [record.get(letter, '') for letter in ('A', 'B', 'C', 'D')]
        prompt_text = prompt(question=record['question'], choices=options, template=MULT_CHOICE_PROMPT)

        message_parts: List[Content] = [ContentText(text=prompt_text)]
        picture = record.get('image')
        if picture:
            encoded = bytes_to_base64(picture['bytes'], format='jpeg', add_header=True)
            message_parts.append(ContentImage(image=encoded))

        extra_fields = {key: record.get(key) for key in ('index', 'category', 'source')}
        return Sample(
            input=[ChatMessageUser(content=message_parts)],
            choices=options,
            target=record.get('answer'),
            metadata=extra_fields,
        )
55
+
56
+
57
@register_benchmark(
    BenchmarkMeta(
        name='mm_bench',
        pretty_name='MMBench',
        # Tagged as multiple choice, consistent with the other MultiChoiceAdapter
        # based vision benchmarks (this one was previously tagged Tags.QA).
        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
        description=
        'MMBench is a comprehensive evaluation pipeline comprised of meticulously curated multimodal dataset and a novel circulareval strategy using ChatGPT. It is comprised of 20 ability dimensions defined by MMBench. It also contains chinese version with translated question.',  # noqa: E501
        dataset_id='lmms-lab/MMBench',
        subset_list=['cn', 'en'],
        metric_list=['acc'],
        eval_split='dev',
        prompt_template=MULT_CHOICE_PROMPT,
    )
)
class MMBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
    """Multiple-choice vision-language adapter registered for ``mm_bench``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """Convert one dataset record into a ``Sample``.

        Options ``A``-``D`` that are blank or the literal string ``'nan'`` are
        dropped. A usable hint is placed on its own line before the question.
        When the record has an image, it is attached as a base64 JPEG after
        the text.
        """
        answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
        answers_list = [ans for ans in answers_list if (ans.strip() and ans != 'nan')]

        # A missing hint is treated like a missing option: skip it when it is
        # absent, blank or the 'nan' placeholder instead of gluing it (or
        # crashing on None) in front of the question.
        # NOTE(review): assumes hints use the same 'nan' placeholder as the
        # options - confirm against the dataset.
        question = record['question']
        hint = record.get('hint')
        if isinstance(hint, str) and hint.strip() and hint.strip() != 'nan':
            question_hint = f'{hint.strip()}\n{question}'
        else:
            question_hint = question

        input_text = prompt(question=question_hint, choices=answers_list, template=MULT_CHOICE_PROMPT)
        content_list: List[Content] = [ContentText(text=input_text)]
        image = record.get('image')
        if image:
            image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
            content_list.append(ContentImage(image=image_base64))
        label_answer = record.get('answer')
        return Sample(
            input=[ChatMessageUser(content=content_list)],
            choices=answers_list,
            target=label_answer,
            metadata={
                'index': record.get('index'),
                'category': record.get('category'),
                'source': record.get('source'),
                'L2-category': record.get('L2-category'),
                'comment': record.get('comment'),
                'split': record.get('split')
            }
        )
File without changes
@@ -0,0 +1,73 @@
1
+ import re
2
+ from typing import Any, Dict, List
3
+
4
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
5
+ from evalscope.api.dataset import Sample
6
+ from evalscope.api.evaluator import TaskState
7
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
8
+ from evalscope.api.registry import register_benchmark
9
+ from evalscope.constants import Tags
10
+ from evalscope.utils.io_utils import bytes_to_base64
11
+ from evalscope.utils.logger import get_logger
12
+
13
+ logger = get_logger()
14
+
15
+ MULT_CHOICE_PROMPT = r"""
16
+ Answer the following multiple choice question.
17
+ The last line of your response should be of the following format:
18
+ 'ANSWER: $LETTER' (without quotes)
19
+ where LETTER is one of A,B,C,D. Think step by step before answering.
20
+
21
+ {question}
22
+ """.strip()
23
+
24
+ SUBSET_LIST = [
25
+ 'coarse perception', 'fine-grained perception', 'instance reasoning', 'logical reasoning', 'math',
26
+ 'science & technology'
27
+ ]
28
+
29
+
30
@register_benchmark(
    BenchmarkMeta(
        name='mm_star',
        pretty_name='MMStar',
        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
        description=
        'MMStar: an elite vision-indispensible multi-modal benchmark, aiming to ensure each curated sample exhibits visual dependency, minimal data leakage, and requires advanced multi-modal capabilities.',  # noqa: E501
        dataset_id='evalscope/MMStar',
        subset_list=SUBSET_LIST,
        metric_list=['acc'],
        default_subset='val',
        eval_split='val',
        prompt_template=MULT_CHOICE_PROMPT,
    )
)
class MMStarAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
    """Multiple-choice vision-language adapter registered for ``mm_star``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # NOTE(review): presumably makes the framework regroup samples by their
        # ``subset_key`` (the record category) - confirm against the base adapter.
        self.reformat_subset = True

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """Convert one dataset record into a ``Sample``.

        The question is formatted into the module's multiple-choice prompt and
        the choices are the fixed letters A-D. When the record has an image, it
        is attached as a base64 JPEG after the text. The record's ``category``
        is used as the subset key.
        """
        prompt_text = MULT_CHOICE_PROMPT.format(question=record['question'])
        message_parts: List[Content] = [ContentText(text=prompt_text)]

        picture = record.get('image')
        if picture:
            encoded = bytes_to_base64(picture['bytes'], format='jpeg', add_header=True)
            message_parts.append(ContentImage(image=encoded))

        category = record.get('category')
        meta_info = record.get('meta_info', {})
        extra_fields = {
            'index': record.get('index'),
            'category': category,
            'l2_category': record.get('l2_category'),
            'source': meta_info.get('source'),
            'split': meta_info.get('split'),
            'image_path': meta_info.get('image_path'),
        }
        return Sample(
            input=[ChatMessageUser(content=message_parts)],
            choices=['A', 'B', 'C', 'D'],
            target=record.get('answer'),
            subset_key=category,
            metadata=extra_fields,
        )
@@ -1,15 +1,14 @@
1
1
  import ast
2
2
  from typing import Any, Dict, List
3
3
 
4
- from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
4
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
5
5
  from evalscope.api.dataset import Sample
6
- from evalscope.api.evaluator import TaskState
7
6
  from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
8
7
  from evalscope.api.registry import register_benchmark
9
8
  from evalscope.constants import Tags
10
9
  from evalscope.utils.io_utils import bytes_to_base64
11
10
  from evalscope.utils.logger import get_logger
12
- from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, parse_answers, prompt
11
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, prompt
13
12
 
14
13
  logger = get_logger()
15
14
 
@@ -60,7 +59,7 @@ DATASET_FORMATS = ['standard (4 options)', 'standard (10 options)', 'vision']
60
59
  BenchmarkMeta(
61
60
  name='mmmu_pro',
62
61
  pretty_name='MMMU-PRO',
63
- tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
62
+ tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
64
63
  description=
65
64
  'MMMU-Pro is an enhanced multimodal benchmark designed to rigorously assess the true understanding capabilities of advanced AI models across multiple modalities. It builds upon the original MMMU benchmark by introducing several key improvements that make it more challenging and realistic, ensuring that models are evaluated on their genuine ability to integrate and comprehend both visual and textual information.', # noqa: E501
66
65
  dataset_id='AI-ModelScope/MMMU_Pro',
@@ -73,7 +72,7 @@ DATASET_FORMATS = ['standard (4 options)', 'standard (10 options)', 'vision']
73
72
  }
74
73
  )
75
74
  )
76
- class MMMUPROAdapter(VisionLanguageAdapter):
75
+ class MMMUPROAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
77
76
  MAX_IMAGES: int = 7
78
77
 
79
78
  def __init__(self, *args, **kwargs):
@@ -123,7 +122,3 @@ class MMMUPROAdapter(VisionLanguageAdapter):
123
122
  subset_key=record['subject'],
124
123
  metadata=metadata,
125
124
  )
126
-
127
- def extract_answer(self, prediction: str, task_state: TaskState) -> str:
128
- answers = parse_answers(task_state)
129
- return ''.join(sorted(list(answers)))
File without changes