evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (155)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/app/ui/multi_model.py +6 -1
  17. evalscope/app/ui/single_model.py +8 -2
  18. evalscope/app/utils/data_utils.py +3 -2
  19. evalscope/app/utils/visualization.py +2 -2
  20. evalscope/arguments.py +6 -0
  21. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  22. evalscope/benchmarks/amc/__init__.py +0 -0
  23. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  24. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  25. evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
  26. evalscope/benchmarks/bfcl/generation.py +7 -7
  27. evalscope/benchmarks/blink/__init__.py +0 -0
  28. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  29. evalscope/benchmarks/chartqa/__init__.py +0 -0
  30. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  31. evalscope/benchmarks/chartqa/utils.py +38 -0
  32. evalscope/benchmarks/docvqa/__init__.py +0 -0
  33. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  34. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  35. evalscope/benchmarks/general_arena/utils.py +2 -1
  36. evalscope/benchmarks/healthbench/__init__.py +0 -0
  37. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  38. evalscope/benchmarks/healthbench/utils.py +102 -0
  39. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  40. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  41. evalscope/benchmarks/humaneval/utils.py +235 -0
  42. evalscope/benchmarks/infovqa/__init__.py +0 -0
  43. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  44. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  45. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  46. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  47. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  48. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  49. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  50. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  51. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  52. evalscope/benchmarks/mm_star/__init__.py +0 -0
  53. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  54. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  55. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  56. evalscope/benchmarks/multi_if/__init__.py +0 -0
  57. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  58. evalscope/benchmarks/multi_if/metrics.py +120 -0
  59. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  60. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  61. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  62. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  63. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  64. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  65. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  66. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  67. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  68. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  69. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  74. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  75. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  76. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  77. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  78. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  79. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  80. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  81. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  82. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  83. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  84. evalscope/config.py +24 -1
  85. evalscope/constants.py +3 -0
  86. evalscope/evaluator/evaluator.py +25 -7
  87. evalscope/metrics/metric.py +78 -2
  88. evalscope/metrics/metrics.py +16 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  102. evalscope/models/model_apis.py +10 -8
  103. evalscope/models/utils/openai.py +1 -2
  104. evalscope/perf/arguments.py +2 -0
  105. evalscope/perf/plugin/api/base.py +2 -2
  106. evalscope/perf/plugin/api/default_api.py +7 -7
  107. evalscope/perf/plugin/api/openai_api.py +83 -19
  108. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  109. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  110. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  111. evalscope/perf/utils/benchmark_util.py +1 -2
  112. evalscope/report/__init__.py +9 -1
  113. evalscope/report/combinator.py +45 -20
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +1 -1
  116. evalscope/utils/function_utils.py +41 -0
  117. evalscope/utils/import_utils.py +63 -13
  118. evalscope/utils/io_utils.py +19 -11
  119. evalscope/utils/json_schema.py +25 -2
  120. evalscope/utils/logger.py +19 -0
  121. evalscope/utils/model_utils.py +1 -1
  122. evalscope/utils/multi_choices.py +16 -1
  123. evalscope/version.py +2 -2
  124. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
  125. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
  126. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  127. tests/__init__.py +0 -1
  128. tests/benchmark/__init__.py +0 -1
  129. tests/benchmark/test_eval.py +0 -385
  130. tests/benchmark/test_image_edit.py +0 -65
  131. tests/benchmark/test_t2i.py +0 -142
  132. tests/benchmark/test_vlm.py +0 -80
  133. tests/cli/__init__.py +0 -1
  134. tests/cli/test_all.py +0 -269
  135. tests/cli/test_collection.py +0 -99
  136. tests/cli/test_custom.py +0 -268
  137. tests/cli/test_reasoning.py +0 -81
  138. tests/common.py +0 -73
  139. tests/perf/__init__.py +0 -1
  140. tests/perf/test_perf.py +0 -178
  141. tests/rag/test_clip_benchmark.py +0 -87
  142. tests/rag/test_mteb.py +0 -213
  143. tests/rag/test_ragas.py +0 -128
  144. tests/swift/__init__.py +0 -1
  145. tests/swift/test_run_swift_eval.py +0 -146
  146. tests/swift/test_run_swift_vlm_eval.py +0 -128
  147. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  148. tests/test_run_all.py +0 -12
  149. tests/utils.py +0 -13
  150. tests/vlm/__init__.py +0 -1
  151. tests/vlm/test_vlmeval.py +0 -102
  152. {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
  153. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  154. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  155. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/benchmarks/bbh/bbh_adapter.py
@@ -141,35 +141,61 @@ class BBHAdapter(DefaultDataAdapter):
     @classmethod
     def _extract_mc_answer(cls, ans: str) -> str:
         """
-        Extract the answer from the model output for Multiple choice task.
+        Extract normalized answer for BBH multiple-choice tasks.
+        Handles formats like:
+        - "answer is (A)"
+        - "The answer is A."
+        - Extra text after answer.
+        Always uses the *last* occurrence of "answer is".
         """
-        ans_line = ans.split('answer is ')
-        if len(ans_line) != 1:
-            ans = ans_line[1].strip()
-        match = re.search(r'\(([A-Z])\)*', ans)
+        ans = ans.strip()
+
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()
+
+        # Remove trailing period
+        if ans.endswith('.'):
+            ans = ans[:-1].strip()
+
+        # Capture uppercase letter inside parentheses (A) (B) ...
+        match = re.search(r'\(([A-Z])\)', ans)
         if match:
             return match.group(1)
-        match = re.search(r'([A-Z])', ans)
+
+        # Capture single uppercase letter
+        match = re.search(r'\b([A-Z])\b', ans)
         if match:
             return match.group(1)
+
         return ans

     @classmethod
     def _extract_ff_answer(cls, ans: str):
         """
-        Extract the answer from the model output for Free-form task.
+        Extract the normalized answer for BBH free-form tasks.
+        Handles patterns like:
+        - "answer is XXX."
+        - "The answer is **valid**."
+        - Extra trailing dots / line breaks.
+        - Bold-marked answers (**xxx**).
+        Always uses the *last* occurrence of "answer is".
         """
-        pattern = r'answer is\s+(.*?)\.'
+        ans = ans.strip()

-        match = re.search(pattern, ans)
-        if match:
-            res = match.group(1)
-            return res
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()

-        ans_line = ans.split('answer is ')
-        if len(ans_line) != 1:
-            ans = ans_line[1].strip()
-        ans = ans.split('\n')[0]
+        # Remove trailing period
         if ans.endswith('.'):
-            ans = ans[:-1]
+            ans = ans[:-1].strip()
+
+        # If answer is in bold (**xxx**), prefer the content inside
+        match = re.search(r'\*\*(.*?)\*\*', ans)
+        if match:
+            ans = match.group(1).strip()
+
         return ans
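
Note: the following is a minimal standalone sketch (not the packaged class) that mirrors the new free-form extraction logic shown above; the sample strings are hypothetical.

import re

def extract_ff_answer(ans: str) -> str:
    # Sketch mirroring the updated BBH free-form extraction in the diff above.
    ans = ans.strip()
    parts = ans.split('So the answer is ')
    if len(parts) > 1:
        ans = parts[-1].strip()
        ans = ans.split('\n')[0].strip()
    if ans.endswith('.'):
        ans = ans[:-1].strip()
    match = re.search(r'\*\*(.*?)\*\*', ans)  # prefer bold-marked content
    if match:
        ans = match.group(1).strip()
    return ans

assert extract_ff_answer('Reasoning... So the answer is **valid**.\nExtra text') == 'valid'
assert extract_ff_answer('So the answer is 42.') == '42'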
evalscope/benchmarks/bfcl/bfcl_adapter.py
@@ -1,7 +1,7 @@
 import json
 import re
 import traceback
-from typing import Any, Dict
+from typing import Any, Dict, List

 from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
 from evalscope.api.dataset import Sample
@@ -11,6 +11,7 @@ from evalscope.api.metric import Score
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
+from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
 from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

@@ -67,10 +68,12 @@ class BFCLAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True)
+        check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True, feature_name=self.pretty_name)

         self.category_map = SUBJECT_MAPPING
         self.reformat_subset = True
+        self.add_overall_metric = False
+        self.add_aggregation_name = False

         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)
@@ -252,3 +255,104 @@ class BFCLAdapter(DefaultDataAdapter):
             score.metadata = {'error': traceback.format_exc()}
             score.main_score_name = 'acc'
             return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+
+        Track the number of each category.
+        - step1: simple, java, javascript unweighted average as simple_ast
+        - step2.1: simple_ast, multiple, parallel, parallel_multiple unweighted average as ast_non_live
+        - step2.2: live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average as ast_live
+        - step2.3: irrelevance as hallucination_non_live
+        - step2.4: live_irrelevance, live_relevance weighted average as hallucination_live
+        - step2.5: multi_turn_base as multi_turn_base
+        - step2.6: multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average as multi_turn_augmented
+        - step3.1: ast_non_live, hallucination_non_live unweighted average as non_live
+        - step3.2: ast_live, hallucination_live weighted average as live
+        - step3.3: multi_turn_base, multi_turn_augmented unweighted average as multi_turn
+        - step4: non_live, live, multi_turn unweighted average as overall
+        Args:
+            report (Report): The generated evaluation report.
+            output_dir (str): The directory to save the report.
+
+        Returns:
+            None
+        """  # noqa: E501
+        for metric in report.metrics:
+            # Collect all subsets in a dictionary for easy access
+            subset_dict: Dict[str, Subset] = {}
+            for category in metric.categories:
+                for subset in category.subsets:
+                    subset_dict[subset.name] = subset
+
+            # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
+            simple_subsets = ['simple', 'java', 'javascript']
+            simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+            subset_dict['simple_ast'] = simple_ast
+
+            # Step 2.1: Calculate ast_non_live
+            # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
+            ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+            ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+            subset_dict['ast_non_live'] = ast_non_live
+
+            # Step 2.2: Calculate ast_live
+            # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
+            live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+            ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
+            subset_dict['ast_live'] = ast_live
+
+            # Step 2.3: hallucination_non_live (irrelevance)
+            if 'irrelevance' in subset_dict:
+                subset_dict['hallucination_non_live'] = subset_dict['irrelevance']
+            else:
+                subset_dict['hallucination_non_live'] = Subset(name='hallucination_non_live', score=0, num=0)
+
+            # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
+            hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
+            hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+            subset_dict['hallucination_live'] = hallucination_live
+
+            # Step 2.5: multi_turn_base
+            if 'multi_turn_base' not in subset_dict:
+                subset_dict['multi_turn_base'] = Subset(name='multi_turn_base', score=0, num=0)
+
+            # Step 2.6: Calculate multi_turn_augmented
+            # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
+            multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
+            multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+            subset_dict['multi_turn_augmented'] = multi_turn_augmented
+
+            # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
+            non_live_subsets = ['ast_non_live', 'hallucination_non_live']
+            non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+            subset_dict['non_live'] = non_live
+
+            # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
+            live_agg_subsets = ['ast_live', 'hallucination_live']
+            live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
+            subset_dict['live'] = live
+
+            # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
+            multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
+            multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+            subset_dict['multi_turn'] = multi_turn
+
+            # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
+            overall_subsets = ['non_live', 'live', 'multi_turn']
+            overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
+            subset_dict['overall'] = overall
+
+            # Add computed scores to the category
+            computed_subset_names = ['non_live', 'live', 'multi_turn', 'overall']
+
+            # Add the computed scores as new subsets in the metric
+            dummy_subsets = []
+            for subset_name in computed_subset_names:
+                if subset_name in subset_dict:
+                    subset = subset_dict[subset_name]
+                    subset.name = subset_name.upper()
+                    dummy_subsets.append(subset)
+            dummy_category = Category(name='-', subsets=dummy_subsets)
+            metric.categories.append(dummy_category)
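
Note: to make the aggregation arithmetic in the docstring concrete, here is an illustrative sketch of the two averaging rules; the packaged unweighted_average_from_subsets / weighted_average_from_subsets helpers operate on Subset objects rather than plain tuples, and the numbers below are hypothetical.

def unweighted_average(scores):
    # Plain mean of per-subset scores; every subset counts equally.
    return sum(s for s, _ in scores) / len(scores)

def weighted_average(scores):
    # Mean weighted by each subset's sample count (num).
    total = sum(n for _, n in scores)
    return sum(s * n for s, n in scores) / total if total else 0.0

live_parts = [(0.80, 200), (0.60, 50)]   # hypothetical (score, num) pairs, e.g. live_simple / live_parallel
print(unweighted_average(live_parts))    # 0.7
print(weighted_average(live_parts))      # 0.76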
evalscope/benchmarks/bfcl/generation.py
@@ -72,7 +72,8 @@ def generate_turn(model: Model, row: dict[str, Any]):

            # Handle the response based on the model output structure
            message = model_output.message
-           model_usage += model_output.usage
+           if model_output.usage is not None:
+               model_usage += model_output.usage

            current_messages.append(message)
            if isinstance(message, str):
@@ -115,7 +116,7 @@ def generate_turn(model: Model, row: dict[str, Any]):

            n_steps += 1
            if n_steps > MAXIMUM_STEP_LIMIT:
-               logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+               logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
                break

        all_model_responses.append(current_responses)
@@ -145,9 +146,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
            new_tools = row['missing_functions'][str(turn_idx)]
            for new_tool in new_tools:
                cur_tool = new_tool[0]
-               # change type to object
-               if cur_tool['parameters']['type'] != 'object':
-                   cur_tool['parameters']['type'] = 'object'
+               cur_tool['parameters']['type'] = 'object'
                tools.append({
                    'type': 'function',
                    'function': cur_tool,
@@ -172,7 +171,8 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):

            # Handle the response based on the model output structure
            message = model_output.message
-           model_usage += model_output.usage
+           if model_output.usage is not None:
+               model_usage += model_output.usage

            current_messages.append(message)
            if isinstance(message, str):
@@ -214,7 +214,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):

            n_steps += 1
            if n_steps > MAXIMUM_STEP_LIMIT:
-               logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+               logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
                break

        all_model_responses.append(current_responses)
evalscope/benchmarks/blink/__init__.py: File without changes
evalscope/benchmarks/blink/blink_adapter.py
@@ -0,0 +1,61 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import format_letter_choices
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = r"""
+Answer the following multiple choice question. The last line of your response should be of the following format:
+'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+{question}
+""".strip()
+
+SUBSET_LIST = [
+    'Art_Style', 'Counting', 'Forensic_Detection', 'Functional_Correspondence', 'IQ_Test', 'Jigsaw',
+    'Multi-view_Reasoning', 'Object_Localization', 'Relative_Depth', 'Relative_Reflectance', 'Semantic_Correspondence',
+    'Spatial_Relation', 'Visual_Correspondence', 'Visual_Similarity'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='blink',
+        pretty_name='BLINK',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'BLINK is a benchmark designed to evaluate the core visual perception abilities of multimodal large language models (MLLMs). It transforms 14 classic computer vision tasks into 3,807 multiple-choice questions, accompanied by single or multiple images and visual prompts.',  # noqa: E501
+        dataset_id='evalscope/BLINK',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='val',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class BLINKAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+    MAX_IMAGES: int = 4
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        choices = record.get('choices')
+        input_text = MULT_CHOICE_PROMPT.format(question=record['prompt'], letters=format_letter_choices(choices))
+        content_list: List[Content] = [ContentText(text=input_text)]
+
+        for i in range(1, self.MAX_IMAGES + 1):
+            image = record.get(f'image_{i}')
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        label_answer = record['answer'].strip('(').strip(')')
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=choices, target=label_answer)
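
Note: the target-letter normalization above is plain string stripping, so both label formats a BLINK record may carry resolve to a bare letter; a quick illustrative check:

assert '(B)'.strip('(').strip(')') == 'B'   # parenthesized labels
assert 'B'.strip('(').strip(')') == 'B'     # already-bare labels pass through unchanged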
evalscope/benchmarks/chartqa/__init__.py: File without changes
evalscope/benchmarks/chartqa/chartqa_adapter.py
@@ -0,0 +1,80 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+OPEN_PROMPT = """
+{question}
+
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the a single word answer to the problem.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='chartqa',
+        pretty_name='ChartQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'ChartQA is a benchmark designed to evaluate question-answering capabilities about charts (e.g., bar charts, line graphs, pie charts), focusing on both visual and logical reasoning.',  # noqa: E501
+        dataset_id='lmms-lab/ChartQA',
+        subset_list=['human_test', 'augmented_test'],
+        metric_list=['relaxed_acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class ChartQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.add_aggregation_name = False
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        image_data = record['image']
+        image_base64 = bytes_to_base64(image_data['bytes'], format='png', add_header=True)
+
+        content_list: List[Content] = [
+            ContentText(text=OPEN_PROMPT.format(question=question)),
+            ContentImage(image=image_base64)
+        ]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            subset_key=record['type'],  # 'human_test' or 'augmented_split'
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        from .utils import relaxed_correctness
+
+        score = relaxed_correctness(filtered_prediction, reference)
+        score = 1.0 if score else 0.0
+
+        return Score(
+            value={'relaxed_acc': score},
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+        )
evalscope/benchmarks/chartqa/utils.py
@@ -0,0 +1,38 @@
+def relaxed_correctness(prediction: str, target: str, max_relative_change: float = 0.05) -> bool:
+    """Calculates relaxed correctness.
+
+    The correctness tolerates certain error ratio defined by max_relative_change.
+    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
+    “Following Methani et al. (2020), we use a relaxed accuracy measure for the
+    numeric answers to allow a minor inaccuracy that may result from the automatic
+    data extraction process. We consider an answer to be correct if it is within
+    5% of the gold answer. For non-numeric answers, we still need an exact match
+    to consider an answer to be correct.”
+
+    This funcion is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
+    Args:
+        target: List of target string.
+        prediction: List of predicted string.
+        max_relative_change: Maximum relative change.
+
+    Returns:
+        Whether the prediction was correct given the specified tolerance.
+    """  # noqa: E501
+
+    def _to_float(text: str):
+        try:
+            if text.endswith('%'):
+                # Convert percentages to floats.
+                return float(text.rstrip('%')) / 100.0
+            else:
+                return float(text)
+        except ValueError:
+            return None
+
+    prediction_float = _to_float(prediction)
+    target_float = _to_float(target)
+    if prediction_float is not None and target_float:
+        relative_change = abs(prediction_float - target_float) / abs(target_float)
+        return relative_change <= max_relative_change
+    else:
+        return prediction.lower() == target.lower()
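
Note: illustrative calls with hypothetical values, assuming the function is imported from the chartqa/utils.py module listed above:

from evalscope.benchmarks.chartqa.utils import relaxed_correctness

relaxed_correctness('10.3%', '10%')    # True:  |0.103 - 0.10| / 0.10 = 0.03 <= 0.05
relaxed_correctness('12', '10')        # False: relative change 0.20 > 0.05
relaxed_correctness('Paris', 'paris')  # True:  non-numeric answers fall back to case-insensitive exact match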
evalscope/benchmarks/docvqa/__init__.py: File without changes
evalscope/benchmarks/docvqa/docvqa_adapter.py
@@ -0,0 +1,67 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT = """Answer the question according to the image using a single word or phrase.
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='docvqa',
+        pretty_name='DocVQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'DocVQA (Document Visual Question Answering) is a benchmark designed to evaluate AI systems on their ability to answer questions based on the content of document images, such as scanned pages, forms, or invoices. Unlike general visual question answering, it requires understanding not just the text extracted by OCR, but also the complex layout, structure, and visual elements of a document.',  # noqa: E501
+        dataset_id='lmms-lab/DocVQA',
+        subset_list=['DocVQA'],
+        metric_list=['anls'],
+        eval_split='validation',
+        prompt_template=PROMPT,
+    )
+)
+class DocVQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answers')),  # answers is a list
+            metadata={
+                'questionId': record.get('questionId'),
+                'question_types': record.get('question_types'),
+                'docId': record.get('docId'),
+                'ucsf_document_id': record.get('ucsf_document_id'),
+                'ucsf_document_page_no': record.get('ucsf_document_page_no'),
+            }
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        import re
+
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()
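
Note: DocVQA stores the multi-reference answer list as a JSON string in the sample target, so the round trip is standard json; the values below are hypothetical, and presumably the ANLS scorer decodes the list again.

import json

answers = ['yes', 'Yes.']                # hypothetical DocVQA reference answers
target = json.dumps(answers)             # what record_to_sample stores as the Sample target
assert json.loads(target) == answers     # the list can be recovered for ANLS matching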
evalscope/benchmarks/drop/drop_adapter.py
@@ -54,7 +54,7 @@ class DROPAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        if self.few_shot_num != 0:
+        if self.few_shot_num != 0 and self.few_shot_num != 3:
             self.few_shot_num = 3
             logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
         else:
evalscope/benchmarks/general_arena/utils.py
@@ -34,7 +34,8 @@ def process_review_item(review_result: ReviewResult) -> list:
        'Index': str(review_result.index),
        'Input': review_result.input,
        'Question': review_result.input,  # Use input as question
-       'Generated': prediction if prediction != extracted_prediction else extracted_prediction,
+       'Generated':
+       prediction if prediction != extracted_prediction else extracted_prediction or '',  # Ensure no None value
        'Gold': target,
        'Pred': extracted_prediction,
        'Score': sample_score.score.model_dump(exclude_none=True),
evalscope/benchmarks/healthbench/__init__.py: File without changes