evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of evalscope has been flagged as potentially problematic.
Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -1,5 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ import re
  from typing import Any, Dict

  from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
@@ -12,13 +13,26 @@ from evalscope.utils.logger import get_logger
  logger = get_logger()

  PROMPT_TEMPLATE = """
- Solve the following math problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+ Solve the following math problem step by step. The last line of your response should display the answer enclosed within \\boxed{{\\text{{$ANSWER}}}}.

- {question}
+ Example:
+
+ Let's solve the problem step by step.
+
+ Problem: Eliza's rate per hour for the first 40 hours she works each week is $10. She also receives an overtime pay of 1.2 times her regular hourly rate. If Eliza worked for 45 hours this week, how much are her earnings for this week?
+
+ Step 1: Calculate Eliza's earnings for the first 40 hours. Eliza's hourly rate is $10, so her earnings for the first 40 hours are $10/hour x 40 hours = $400.
+ Step 2: Calculate Eliza's overtime pay rate. Eliza's overtime pay rate is 1.2 times her regular hourly rate, so her overtime pay rate is $10/hour x 1.2 = $12/hour.
+ Step 3: Calculate Eliza's earnings for the overtime hours. Eliza worked for 45 hours, so her overtime hours are 45 hours - 40 hours = 5 hours. Her earnings for the overtime hours are $12/hour x 5 hours = $60.
+ Step 4: Calculate Eliza's total earnings for the week. Eliza's total earnings for the week are her earnings for the first 40 hours plus her earnings for the overtime hours, which is $400 + $60 = $460.

- Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
+ Answer:
+ \\boxed{{\\text{{460}}}}

- Reasoning:
+ question:
+ {question}
+
+ Remember to put your answer on its own line at the end in the form "\\boxed{{\\text{{$ANSWER}}}}" (without quotes), where $ANSWER is replaced by the actual answer to the problem.
  """.lstrip() # noqa: E501

  FEWSHOT_TEMPLATE = """
@@ -69,6 +83,11 @@ class GSM8KAdapter(DefaultDataAdapter):
  return ''

  def extract_answer(self, prediction: str, task_state: TaskState):
+ boxed_match = re.search(r'\\boxed\\{\\text\\{([^}]*)\\}\\}', prediction)
+ if boxed_match:
+ result = boxed_match.group(1).strip()
+ return result.strip()
+
  from evalscope.filters.extraction import RegexFilter

  regex = RegexFilter(regex_pattern=r'(-?[0-9.,]{2,})|(-?[0-9]+)', group_select=-1)
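As a quick, hedged illustration of the new answer format: the sketch below is not the packaged code; the sample completion and the simplified regex are invented here to show how an answer wrapped in \boxed{\text{...}} (the format the revised prompt requests) can be pulled out of a model response.

import re

# Illustrative sketch only: extract the final answer from a completion that follows
# the revised GSM8K prompt, which asks for \boxed{\text{$ANSWER}} on the last line.
completion = "Step 4: total earnings are $400 + $60 = $460.\nAnswer:\n\\boxed{\\text{460}}"

# Simplified pattern written for this sketch (not copied from the adapter).
match = re.search(r'\\boxed\{\\text\{([^}]*)\}\}', completion)
print(match.group(1).strip() if match else None)  # -> 460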
evalscope/benchmarks/hallusion_bench/__init__.py (file without changes)
evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py (new file)
@@ -0,0 +1,158 @@
+ from collections import defaultdict
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='hallusion_bench',
+ pretty_name='HallusionBench',
+ tags=[Tags.MULTI_MODAL, Tags.HALLUCINATION, Tags.YES_NO],
+ description=
+ 'HallusionBench is an advanced diagnostic benchmark designed to evaluate image-context reasoning, analyze models\' tendencies for language hallucination and visual illusion in large vision-language models (LVLMs).', # noqa: E501
+ dataset_id='lmms-lab/HallusionBench',
+ metric_list=['aAcc', 'qAcc', 'fAcc'],
+ eval_split='image',
+ prompt_template='{question}\nPlease answer YES or NO without an explanation.',
+ )
+ )
+ class HallusionBenchAdapter(VisionLanguageAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+ input_text = self.prompt_template.format(question=record['question'])
+ content_list: List[Content] = [ContentText(text=input_text)]
+ image = record.get('image')
+ if image:
+ image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+ content_list.append(ContentImage(image=image_base64))
+ answer = 'NO' if str(record.get('answer', '0')) == '1' else 'YES'
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ target=answer,
+ metadata={
+ 'category': record.get('category'),
+ 'subcategory': record.get('subcategory'),
+ 'visual_input': record.get('visual_input'),
+ 'set_id': record.get('set_id'),
+ 'figure_id': record.get('figure_id'),
+ 'question_id': record.get('question_id'),
+ }
+ )
+
+ def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+ # Check if the reference answer is in the filtered prediction
+ result = 1 if reference in filtered_prediction.strip().upper() else 0
+ score.value = {'acc': result}
+ return score
+
+ def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+
+ def compute_aAcc(scores: List[SampleScore]):
+ total = len(scores)
+ if total == 0:
+ return 0.0, 0
+ correct = sum(ss.score.main_value for ss in scores)
+ return (correct / total), total
+
+ def compute_group_accuracy(scores: List[SampleScore], group_type: str):
+ # group_type: 'figure' or 'question'
+ groups = defaultdict(list)
+ for ss in scores:
+ md = ss.sample_metadata
+ subcategory = md.get('subcategory')
+ set_id = md.get('set_id')
+ group_id = md.get('figure_id') if group_type == 'figure' else md.get('question_id')
+ if subcategory is None or set_id is None or group_id is None:
+ # Skip incomplete records for this grouping
+ continue
+ key = f'{subcategory}_{set_id}_{group_id}'
+ groups[key].append(ss.score.main_value)
+ if not groups:
+ return 0.0, 0
+ num_correct_groups = sum(1 for vals in groups.values() if all(vals))
+ num_groups = len(groups)
+ return (num_correct_groups / num_groups), num_groups
+
+ def compute_metrics(scores: List[SampleScore]) -> Dict[str, Dict[str, float]]:
+ a_acc, a_n = compute_aAcc(scores)
+ f_acc, f_n = compute_group_accuracy(scores, 'figure')
+ q_acc, q_n = compute_group_accuracy(scores, 'question')
+ return {
+ 'aAcc': {
+ 'score': a_acc,
+ 'num': a_n
+ },
+ 'fAcc': {
+ 'score': f_acc,
+ 'num': f_n
+ },
+ 'qAcc': {
+ 'score': q_acc,
+ 'num': q_n
+ },
+ }
+
+ outputs: List[AggScore] = []
+
+ # By subcategory
+ subcategories = sorted({ss.sample_metadata.get('subcategory') for ss in sample_scores})
+ for subcategory in subcategories:
+ subset = [ss for ss in sample_scores if ss.sample_metadata.get('subcategory') == subcategory]
+ stats = compute_metrics(subset)
+ for metric in ['aAcc', 'fAcc', 'qAcc']:
+ outputs.append(
+ AggScore(
+ score=stats[metric]['score'],
+ metric_name=metric,
+ aggregation_name=str(subcategory),
+ num=stats[metric]['num'],
+ )
+ )
+
+ # By category
+ categories = sorted({ss.sample_metadata.get('category') for ss in sample_scores})
+ for category in categories:
+ subset = [ss for ss in sample_scores if ss.sample_metadata.get('category') == category]
+ stats = compute_metrics(subset)
+ for metric in ['aAcc', 'fAcc', 'qAcc']:
+ outputs.append(
+ AggScore(
+ score=stats[metric]['score'],
+ metric_name=metric,
+ aggregation_name=str(category),
+ num=stats[metric]['num'],
+ )
+ )
+
+ # Overall
+ overall = compute_metrics(sample_scores)
+ for metric in ['aAcc', 'fAcc', 'qAcc']:
+ outputs.append(
+ AggScore(
+ score=overall[metric]['score'],
+ metric_name=metric,
+ aggregation_name='Overall',
+ num=overall[metric]['num'],
+ )
+ )
+
+ return outputs
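To make the grouped metrics above concrete, here is a minimal sketch of the qAcc/fAcc rule with invented sample data: a (subcategory, set_id, question_id) group only counts as correct when every variant inside it is answered correctly, which is what compute_group_accuracy enforces with its all(vals) check.

from collections import defaultdict

# Invented toy data: two question groups, one of which contains a wrong variant.
samples = [
    {'subcategory': 'illusion', 'set_id': 0, 'question_id': 0, 'correct': 1},
    {'subcategory': 'illusion', 'set_id': 0, 'question_id': 0, 'correct': 0},
    {'subcategory': 'chart', 'set_id': 1, 'question_id': 2, 'correct': 1},
]

groups = defaultdict(list)
for s in samples:
    groups[(s['subcategory'], s['set_id'], s['question_id'])].append(s['correct'])

# A group scores 1 only if all of its variants are correct.
q_acc = sum(all(vals) for vals in groups.values()) / len(groups)
print(q_acc)  # -> 0.5: the first group has a wrong variant, the second is fully correct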
evalscope/benchmarks/hle/hle_adapter.py
@@ -57,8 +57,9 @@ Your judgment must focus only on if there are meaningful differences between [co
  'humanities/social science (9%), computer science/artificial intelligence (10%), '
  'engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions '
  'require the ability to understand both text and images, i.e., multi-modality. '
- '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. '
- 'To evaluate the performance of model without multi-modality capabilities, please set the extra_params["include_multi_modal"] to False.', # noqa: E501
+ '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. \n'
+ '**To evaluate the performance of model without multi-modality capabilities, '
+ 'please set the `extra_params["include_multi_modal"]` to `False`.**', # noqa: E501
  dataset_id='cais/hle',
  subset_list=SUBSET_LIST,
  metric_list=['acc'],
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -21,7 +21,8 @@ logger = get_logger()
  pretty_name='HumanEval',
  tags=[Tags.CODING],
  description=
- 'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',
+ 'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior. '
+ '**By default the code is executed in local environment. We recommend using sandbox execution to safely run and evaluate the generated code, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/sandbox.html) for more details.**', # noqa: E501
  dataset_id='opencompass/humaneval',
  subset_list=['openai_humaneval'],
  metric_list=['Pass@1'],
evalscope/benchmarks/infovqa/__init__.py (file without changes)
evalscope/benchmarks/infovqa/infovqa_adapter.py (new file)
@@ -0,0 +1,66 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ PROMPT = """Answer the question according to the image using a single word or phrase.
+ {question}
+ The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question.""" # noqa: E501
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='infovqa',
+ pretty_name='InfoVQA',
+ tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+ description=
+ 'InfoVQA (Information Visual Question Answering) is a benchmark designed to evaluate how well AI models can answer questions based on information-dense images, such as charts, graphs, diagrams, maps, and infographics.', # noqa: E501
+ dataset_id='lmms-lab/DocVQA',
+ subset_list=['InfographicVQA'],
+ metric_list=['anls'],
+ eval_split='validation',
+ prompt_template=PROMPT,
+ )
+ )
+ class InfoVQAAdapter(VisionLanguageAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.add_aggregation_name = False
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+ input_text = PROMPT.format(question=record['question'])
+ content_list: List[Content] = [ContentText(text=input_text)]
+ image = record.get('image')
+ if image:
+ image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+ content_list.append(ContentImage(image=image_base64))
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ target=json.dumps(record.get('answers')), # answers is a list
+ metadata={
+ 'questionId': record.get('questionId'),
+ 'answer_type': record.get('answer_type'),
+ 'image_url': record.get('image_url'),
+ 'ocr': record.get('ocr'),
+ }
+ )
+
+ def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+ import re
+
+ pattern = r'ANSWER:\s*(.*)'
+ match = re.search(pattern, prediction)
+ if match:
+ return match.group(1).strip()
+ return prediction.strip()
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -1,3 +1,4 @@
+ # flake8: noqa: E501
  from typing import Any, Dict

  from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
@@ -19,7 +20,8 @@ logger = get_logger()
  pretty_name='Live-Code-Bench',
  tags=[Tags.CODING],
  description=
- 'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.', # noqa: E501
+ 'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions. '
+ '**By default the code is executed in local environment. We recommend using sandbox execution to safely run and evaluate the generated code, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/sandbox.html) for more details.**',
  dataset_id='AI-ModelScope/code_generation_lite',
  subset_list=['release_latest'],
  metric_list=['Pass@1'],
evalscope/benchmarks/math_verse/__init__.py (file without changes)
evalscope/benchmarks/math_verse/math_verse_adapter.py (new file)
@@ -0,0 +1,100 @@
+ # flake8: noqa: E501
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ MULTI_CHOICE_TYPE = 'multi-choice'
+ OPEN_TYPE = 'free-form'
+
+ OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.'
+
+ MULT_CHOICE_PROMPT = """
+ Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+ {question}
+ """
+
+ SUBSET_LIST = ['Text Dominant', 'Text Lite', 'Vision Intensive', 'Vision Dominant', 'Vision Only']
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='math_verse',
+ pretty_name='MathVerse',
+ dataset_id='evalscope/MathVerse',
+ tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+ description=
+ 'MathVerse, an all-around visual math benchmark designed for an equitable and in-depth evaluation of MLLMs. 2,612 high-quality, multi-subject math problems with diagrams from publicly available sources. Each problem is then transformed by human annotators into six distinct versions, each offering varying degrees of information content in multi-modality, contributing to 15K test samples in total. This approach allows MathVerse to comprehensively assess whether and how much MLLMs can truly understand the visual diagrams for mathematical reasoning.',
+ subset_list=SUBSET_LIST,
+ metric_list=[{
+ 'acc': {
+ 'numeric': True
+ }
+ }],
+ default_subset='testmini',
+ eval_split='testmini',
+ prompt_template=OPEN_PROMPT,
+ )
+ )
+ class MathVerseAdapter(VisionLanguageAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.reformat_subset = True
+ self._use_llm_judge = True
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ """
+ Convert a dataset record to a Sample. Unifies handling for both multi-choice and free-form.
+ Builds the content list inline and appends image content if provided.
+
+ Args:
+ record: Raw dataset record.
+
+ Returns:
+ Sample: The standardized sample ready for evaluation.
+ """
+ question_type = record.get('question_type', OPEN_TYPE)
+ question: str = record.get('question', '')
+ content_list: list[Content] = []
+
+ # Choose prompt text based on type; keep a single unified flow for creating Sample
+ if question_type == MULTI_CHOICE_TYPE:
+ prompt_text = MULT_CHOICE_PROMPT.format(question=question).strip()
+ else:
+ prompt_text = OPEN_PROMPT.format(question=question).strip()
+
+ content_list.append(ContentText(text=prompt_text))
+
+ # Append image if exists
+ image = record.get('image')
+ if image and isinstance(image, dict):
+ image_bytes = image.get('bytes')
+ if image_bytes:
+ image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+ content_list.append(ContentImage(image=image_base64))
+
+ metadata: Dict[str, Any] = {
+ 'sample_index': record.get('sample_index'),
+ 'problem_index': record.get('problem_index'),
+ 'problem_version': record.get('problem_version'),
+ 'question_type': question_type,
+ 'query_wo': record.get('query_wo'),
+ 'query_cot': record.get('query_cot'),
+ 'question_for_eval': record.get('question_for_eval'),
+ }
+
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ target=record['answer'],
+ subset_key=record['problem_version'],
+ metadata=metadata,
+ )
evalscope/benchmarks/math_vision/__init__.py (file without changes)
evalscope/benchmarks/math_vision/math_vision_adapter.py (new file)
@@ -0,0 +1,111 @@
+ # flake8: noqa: E501
+ import re
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+ logger = get_logger()
+
+ OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}} without units.'
+
+ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+ SUBSET_LIST = ['level 1', 'level 2', 'level 3', 'level 4', 'level 5']
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='math_vision',
+ pretty_name='MathVision',
+ dataset_id='evalscope/MathVision',
+ tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+ description=
+ 'The MATH-Vision (MATH-V) dataset, a meticulously curated collection of 3,040 high-quality mathematical problems with visual contexts sourced from real math competitions.',
+ subset_list=SUBSET_LIST,
+ metric_list=[{
+ 'acc': {
+ 'numeric': True
+ }
+ }],
+ eval_split='test',
+ prompt_template=OPEN_PROMPT,
+ )
+ )
+ class MathVisionAdapter(VisionLanguageAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.reformat_subset = True
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ if len(record['options']) > 0:
+ question_type = 'multi_choice'
+ else:
+ question_type = 'free_form'
+ content_list, answers_list = MathVisionAdapter.create_content_and_answers_list(record, question_type)
+ metadata = {
+ 'id': record['id'],
+ 'image': record['image'],
+ 'solution': record['solution'],
+ 'level': record['level'],
+ 'question_type': question_type,
+ 'subject': record['subject']
+ }
+ if question_type == 'multi_choice':
+ label_answer = record['answer']
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ choices=answers_list,
+ target=label_answer,
+ subset_key=f'level {record["level"]}',
+ metadata=metadata
+ )
+ elif question_type == 'free_form':
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ target=record['answer'],
+ subset_key=f'level {record["level"]}',
+ metadata=metadata
+ )
+ else:
+ raise ValueError(f'Unexpected question_type: {question_type}')
+
+ @staticmethod
+ def create_content_and_answers_list(record: Dict[str, Any], question_type) -> tuple[List[Content], List[str]]:
+ """
+ Create a list of content elements and a list of answers from a record.
+
+ Args:
+ record (dict): The record containing question, images, and options.
+ question_type (str): The type of this question
+
+
+ Returns:
+ tuple: A tuple containing:
+ - content_list (list): A list of content elements (text and images).
+ - answers_list (list): A list of possible answers (for multiple-choice questions).
+ """
+
+ # Replace <image1>, <image2> ... to [image1], [image2], ... from question text
+ question = re.sub(r'<image(\d+)>', r'[image\1]', record['question']).strip()
+
+ if question_type == 'multi_choice':
+ answers_list = record['options']
+ input_text = prompt(question=question, choices=answers_list, template=MULT_CHOICE_PROMPT)
+ content_list: List[Content] = [ContentText(text=input_text)]
+ else:
+ answers_list: List[str] = []
+ content_list: List[Content] = [ContentText(text=OPEN_PROMPT.format(question=question))]
+ image = record['decoded_image']
+ if image:
+ image_base64 = bytes_to_base64(image['bytes'], format='jpg', add_header=True)
+ content_list.append(ContentImage(image=image_base64))
+ return content_list, answers_list
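A small aside on the <imageN> rewrite in create_content_and_answers_list above; the question string below is invented for illustration, while the substitution call mirrors the one used in the adapter.

import re

# Rewrites <image1>, <image2>, ... placeholders into bracketed [imageN] markers.
question = 'As shown in <image1> and <image2>, find the shaded area.'
print(re.sub(r'<image(\d+)>', r'[image\1]', question))
# -> As shown in [image1] and [image2], find the shaded area.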
evalscope/benchmarks/math_vista/math_vista_adapter.py
@@ -4,7 +4,6 @@ from typing import Any, Dict

  from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
  from evalscope.api.dataset import Sample
- from evalscope.api.evaluator import TaskState
  from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
  from evalscope.api.registry import register_benchmark
  from evalscope.constants import Tags
@@ -14,15 +13,7 @@ from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers,

  logger = get_logger()

- SUBSET_LIST = ['default']
-
- OPEN_PROMPT = """
- Solve the following problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
-
- {question}
-
- Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
- """
+ OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}} without units.'

  MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT

@@ -38,8 +29,11 @@ OPEN_TYPE = 'free_form'
  tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
  description=
  'MathVista is a consolidated Mathematical reasoning benchmark within Visual contexts. It consists of three newly created datasets, IQTest, FunctionQA, and PaperQA, which address the missing visual domains and are tailored to evaluate logical reasoning on puzzle test figures, algebraic reasoning over functional plots, and scientific reasoning with academic paper figures, respectively. It also incorporates 9 MathQA datasets and 19 VQA datasets from the literature, which significantly enrich the diversity and complexity of visual perception and mathematical reasoning challenges within our benchmark. In total, MathVista includes 6,141 examples collected from 31 different datasets.',
- subset_list=SUBSET_LIST,
- metric_list=['acc'],
+ metric_list=[{
+ 'acc': {
+ 'numeric': True
+ }
+ }],
  eval_split='testmini',
  prompt_template=OPEN_PROMPT,
  )
@@ -86,20 +80,6 @@ class MathVistaAdapter(VisionLanguageAdapter):
  logger.warning(f"Answer '{value}' not found in options: {options}. This may cause evaluation issues.")
  return value

- def extract_answer(self, prediction: str, task_state: TaskState) -> str:
- question_type = task_state.metadata['question_type']
- if question_type == MULTI_CHOICE_TYPE:
- answers = parse_answers(task_state)
- return ''.join(sorted(list(answers)))
- elif question_type == OPEN_TYPE:
- pattern = r'ANSWER:\s*(.*)'
- match = re.search(pattern, prediction)
- if match:
- return match.group(1).strip()
- return ''
- else:
- raise ValueError(f'Unsupported question type: {question_type}')
-
  @staticmethod
  def create_content_and_answers_list(record: dict[str, Any], ) -> tuple[list[Content], list[str]]:
  """
evalscope/benchmarks/mm_bench/mm_bench_adapter.py
@@ -35,7 +35,7 @@ class CCBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):

  def record_to_sample(self, record: Dict[str, Any]) -> Sample:
  answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
- input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+ input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
  content_list: List[Content] = [ContentText(text=input_text)]
  image = record.get('image')
  if image:
@@ -77,7 +77,7 @@ class MMBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
  answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
  answers_list = [ans for ans in answers_list if (ans.strip() and ans != 'nan')]
  question_hint = record['hint'] + record['question']
- input_text = prompt(question=question_hint, choices=answers_list, template=MULT_CHOICE_PROMPT)
+ input_text = prompt(question=question_hint, choices=answers_list, template=self.prompt_template)
  content_list: List[Content] = [ContentText(text=input_text)]
  image = record.get('image')
  if image:
evalscope/benchmarks/mmmu/mmmu_adapter.py
@@ -122,7 +122,7 @@ class MMMUAdapter(VisionLanguageAdapter):
  match = re.search(pattern, prediction)
  if match:
  return match.group(1).strip()
- return ''
+ return prediction.strip()
  else:
  raise ValueError(f'Unsupported question type: {question_type}')

evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -36,7 +36,7 @@ Don't give information outside the document or repeat your findings."""
  tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
  description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
  'It requires the model to find specific information within a large corpus of text. '
- '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)', # noqa: E501
+ '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html)', # noqa: E501
  dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
  metric_list=['acc'],
  subset_list=['english', 'chinese'],
evalscope/benchmarks/ner/__init__.py (file without changes)