evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +78 -2
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +45 -20
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -385
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -80
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -178
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/benchmarks/bbh/bbh_adapter.py

@@ -141,35 +141,61 @@ class BBHAdapter(DefaultDataAdapter):
     @classmethod
     def _extract_mc_answer(cls, ans: str) -> str:
         """
-        Extract
+        Extract normalized answer for BBH multiple-choice tasks.
+        Handles formats like:
+        - "answer is (A)"
+        - "The answer is A."
+        - Extra text after answer.
+        Always uses the *last* occurrence of "answer is".
         """
-
-
-
-
+        ans = ans.strip()
+
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()
+
+        # Remove trailing period
+        if ans.endswith('.'):
+            ans = ans[:-1].strip()
+
+        # Capture uppercase letter inside parentheses (A) (B) ...
+        match = re.search(r'\(([A-Z])\)', ans)
         if match:
             return match.group(1)
-
+
+        # Capture single uppercase letter
+        match = re.search(r'\b([A-Z])\b', ans)
         if match:
             return match.group(1)
+
         return ans

     @classmethod
     def _extract_ff_answer(cls, ans: str):
         """
-        Extract the answer
+        Extract the normalized answer for BBH free-form tasks.
+        Handles patterns like:
+        - "answer is XXX."
+        - "The answer is **valid**."
+        - Extra trailing dots / line breaks.
+        - Bold-marked answers (**xxx**).
+        Always uses the *last* occurrence of "answer is".
         """
-
+        ans = ans.strip()

-
-        if
-
-
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()

-
-        if len(ans_line) != 1:
-            ans = ans_line[1].strip()
-            ans = ans.split('\n')[0]
+        # Remove trailing period
         if ans.endswith('.'):
-            ans = ans[:-1]
+            ans = ans[:-1].strip()
+
+        # If answer is in bold (**xxx**), prefer the content inside
+        match = re.search(r'\*\*(.*?)\*\*', ans)
+        if match:
+            ans = match.group(1).strip()
+
         return ans
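For orientation, a minimal usage sketch of the two extraction helpers added above. The sample strings are made up for illustration, and the import path is inferred from the file layout in the listing.

from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter

# Multiple-choice: the last 'So the answer is ...' clause wins, and the letter
# is pulled out of '(B)' or a bare 'B'.
BBHAdapter._extract_mc_answer('Let us think step by step. So the answer is (B).')  # -> 'B'

# Free-form: the trailing period is dropped and bold markers are unwrapped.
BBHAdapter._extract_ff_answer('So the answer is **valid**.')  # -> 'valid'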
evalscope/benchmarks/bfcl/bfcl_adapter.py

@@ -1,7 +1,7 @@
 import json
 import re
 import traceback
-from typing import Any, Dict
+from typing import Any, Dict, List

 from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
 from evalscope.api.dataset import Sample
@@ -11,6 +11,7 @@ from evalscope.api.metric import Score
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
+from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
 from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

@@ -67,10 +68,12 @@ class BFCLAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True)
+        check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True, feature_name=self.pretty_name)

         self.category_map = SUBJECT_MAPPING
         self.reformat_subset = True
+        self.add_overall_metric = False
+        self.add_aggregation_name = False

         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)
@@ -252,3 +255,104 @@ class BFCLAdapter(DefaultDataAdapter):
             score.metadata = {'error': traceback.format_exc()}
             score.main_score_name = 'acc'
         return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+
+        Track the number of each category.
+        - step1: simple, java, javascript unweighted average as simple_ast
+        - step2.1: simple_ast, multiple, parallel, parallel_multiple unweighted average as ast_non_live
+        - step2.2: live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average as ast_live
+        - step2.3: irrelevance as hallucination_non_live
+        - step2.4: live_irrelevance, live_relevance weighted average as hallucination_live
+        - step2.5: multi_turn_base as multi_turn_base
+        - step2.6: multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average as multi_turn_augmented
+        - step3.1: ast_non_live, hallucination_non_live unweighted average as non_live
+        - step3.2: ast_live, hallucination_live weighted average as live
+        - step3.3: multi_turn_base, multi_turn_augmented unweighted average as multi_turn
+        - step4: non_live, live, multi_turn unweighted average as overall
+        Args:
+            report (Report): The generated evaluation report.
+            output_dir (str): The directory to save the report.
+
+        Returns:
+            None
+        """  # noqa: E501
+        for metric in report.metrics:
+            # Collect all subsets in a dictionary for easy access
+            subset_dict: Dict[str, Subset] = {}
+            for category in metric.categories:
+                for subset in category.subsets:
+                    subset_dict[subset.name] = subset
+
+            # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
+            simple_subsets = ['simple', 'java', 'javascript']
+            simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+            subset_dict['simple_ast'] = simple_ast
+
+            # Step 2.1: Calculate ast_non_live
+            # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
+            ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+            ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+            subset_dict['ast_non_live'] = ast_non_live
+
+            # Step 2.2: Calculate ast_live
+            # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
+            live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+            ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
+            subset_dict['ast_live'] = ast_live
+
+            # Step 2.3: hallucination_non_live (irrelevance)
+            if 'irrelevance' in subset_dict:
+                subset_dict['hallucination_non_live'] = subset_dict['irrelevance']
+            else:
+                subset_dict['hallucination_non_live'] = Subset(name='hallucination_non_live', score=0, num=0)
+
+            # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
+            hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
+            hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+            subset_dict['hallucination_live'] = hallucination_live
+
+            # Step 2.5: multi_turn_base
+            if 'multi_turn_base' not in subset_dict:
+                subset_dict['multi_turn_base'] = Subset(name='multi_turn_base', score=0, num=0)
+
+            # Step 2.6: Calculate multi_turn_augmented
+            # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
+            multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
+            multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+            subset_dict['multi_turn_augmented'] = multi_turn_augmented
+
+            # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
+            non_live_subsets = ['ast_non_live', 'hallucination_non_live']
+            non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+            subset_dict['non_live'] = non_live
+
+            # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
+            live_agg_subsets = ['ast_live', 'hallucination_live']
+            live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
+            subset_dict['live'] = live
+
+            # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
+            multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
+            multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+            subset_dict['multi_turn'] = multi_turn
+
+            # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
+            overall_subsets = ['non_live', 'live', 'multi_turn']
+            overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
+            subset_dict['overall'] = overall
+
+            # Add computed scores to the category
+            computed_subset_names = ['non_live', 'live', 'multi_turn', 'overall']
+
+            # Add the computed scores as new subsets in the metric
+            dummy_subsets = []
+            for subset_name in computed_subset_names:
+                if subset_name in subset_dict:
+                    subset = subset_dict[subset_name]
+                    subset.name = subset_name.upper()
+                    dummy_subsets.append(subset)
+            dummy_category = Category(name='-', subsets=dummy_subsets)
+            metric.categories.append(dummy_category)
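The aggregation above alternates between unweighted and weighted averages of subsets. The helpers come from evalscope.report and their bodies are not part of this diff, so the following is only an illustrative sketch of the distinction, assuming a Subset carries an aggregate score plus a sample count num as used in the code above.

from dataclasses import dataclass
from typing import Dict, List

@dataclass
class SubsetSketch:
    """Stand-in for evalscope.report.Subset: an aggregate score plus a sample count."""
    name: str
    score: float
    num: int

def unweighted_average_sketch(names: List[str], subsets: Dict[str, SubsetSketch]) -> SubsetSketch:
    # Each subset counts equally, regardless of how many samples it holds.
    found = [subsets[n] for n in names if n in subsets]
    score = sum(s.score for s in found) / len(found) if found else 0.0
    return SubsetSketch(name='+'.join(names), score=score, num=sum(s.num for s in found))

def weighted_average_sketch(names: List[str], subsets: Dict[str, SubsetSketch]) -> SubsetSketch:
    # Subsets contribute in proportion to their sample counts.
    found = [subsets[n] for n in names if n in subsets]
    total = sum(s.num for s in found)
    score = sum(s.score * s.num for s in found) / total if total else 0.0
    return SubsetSketch(name='+'.join(names), score=score, num=total)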
evalscope/benchmarks/bfcl/generation.py

@@ -72,7 +72,8 @@ def generate_turn(model: Model, row: dict[str, Any]):

         # Handle the response based on the model output structure
         message = model_output.message
-
+        if model_output.usage is not None:
+            model_usage += model_output.usage

         current_messages.append(message)
         if isinstance(message, str):
@@ -115,7 +116,7 @@ def generate_turn(model: Model, row: dict[str, Any]):

         n_steps += 1
         if n_steps > MAXIMUM_STEP_LIMIT:
-            logger.
+            logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
             break

     all_model_responses.append(current_responses)
@@ -145,9 +146,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
            new_tools = row['missing_functions'][str(turn_idx)]
            for new_tool in new_tools:
                cur_tool = new_tool[0]
-
-                if cur_tool['parameters']['type'] != 'object':
-                    cur_tool['parameters']['type'] = 'object'
+                cur_tool['parameters']['type'] = 'object'
                tools.append({
                    'type': 'function',
                    'function': cur_tool,
@@ -172,7 +171,8 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):

         # Handle the response based on the model output structure
         message = model_output.message
-
+        if model_output.usage is not None:
+            model_usage += model_output.usage

         current_messages.append(message)
         if isinstance(message, str):
@@ -214,7 +214,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):

         n_steps += 1
         if n_steps > MAXIMUM_STEP_LIMIT:
-            logger.
+            logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
             break

     all_model_responses.append(current_responses)
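The unconditional cur_tool['parameters']['type'] = 'object' reflects the OpenAI-style function-calling schema, where the top-level parameters field is expected to be a JSON-Schema object. The appended tool entry then has roughly the shape sketched below; the function name and fields are hypothetical, since the real definitions come from the dataset's missing_functions entries.

# Illustrative shape of a tool entry appended above (field values invented).
cur_tool = {
    'name': 'get_weather',               # hypothetical function name
    'description': 'Look up the weather.',  # hypothetical description
    'parameters': {
        'type': 'object',                # forced to 'object' by the change above
        'properties': {'city': {'type': 'string'}},
        'required': ['city'],
    },
}
tools = [{'type': 'function', 'function': cur_tool}]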
evalscope/benchmarks/blink/__init__.py
File without changes
evalscope/benchmarks/blink/blink_adapter.py

@@ -0,0 +1,61 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import format_letter_choices
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = r"""
+Answer the following multiple choice question. The last line of your response should be of the following format:
+'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+{question}
+""".strip()
+
+SUBSET_LIST = [
+    'Art_Style', 'Counting', 'Forensic_Detection', 'Functional_Correspondence', 'IQ_Test', 'Jigsaw',
+    'Multi-view_Reasoning', 'Object_Localization', 'Relative_Depth', 'Relative_Reflectance', 'Semantic_Correspondence',
+    'Spatial_Relation', 'Visual_Correspondence', 'Visual_Similarity'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='blink',
+        pretty_name='BLINK',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'BLINK is a benchmark designed to evaluate the core visual perception abilities of multimodal large language models (MLLMs). It transforms 14 classic computer vision tasks into 3,807 multiple-choice questions, accompanied by single or multiple images and visual prompts.',  # noqa: E501
+        dataset_id='evalscope/BLINK',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='val',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class BLINKAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+    MAX_IMAGES: int = 4
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        choices = record.get('choices')
+        input_text = MULT_CHOICE_PROMPT.format(question=record['prompt'], letters=format_letter_choices(choices))
+        content_list: List[Content] = [ContentText(text=input_text)]
+
+        for i in range(1, self.MAX_IMAGES + 1):
+            image = record.get(f'image_{i}')
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        label_answer = record['answer'].strip('(').strip(')')
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=choices, target=label_answer)
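For reference, the fields read by record_to_sample above imply roughly the record shape sketched below; the values are invented, and only the keys are taken from the code.

# Hypothetical BLINK record, using only the fields the adapter reads.
record = {
    'prompt': 'Which of the two candidate images matches the reference?',
    'choices': ['First image', 'Second image'],
    'answer': '(A)',                 # parentheses are stripped to obtain the target letter
    'image_1': {'bytes': b'...'},    # raw JPEG bytes, base64-encoded with a data-URI header
    'image_2': {'bytes': b'...'},
    # image_3 / image_4 may be absent; up to MAX_IMAGES = 4 are attached
}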
evalscope/benchmarks/chartqa/__init__.py
File without changes
evalscope/benchmarks/chartqa/chartqa_adapter.py

@@ -0,0 +1,80 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+OPEN_PROMPT = """
+{question}
+
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the a single word answer to the problem.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='chartqa',
+        pretty_name='ChartQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'ChartQA is a benchmark designed to evaluate question-answering capabilities about charts (e.g., bar charts, line graphs, pie charts), focusing on both visual and logical reasoning.',  # noqa: E501
+        dataset_id='lmms-lab/ChartQA',
+        subset_list=['human_test', 'augmented_test'],
+        metric_list=['relaxed_acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class ChartQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.add_aggregation_name = False
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        image_data = record['image']
+        image_base64 = bytes_to_base64(image_data['bytes'], format='png', add_header=True)
+
+        content_list: List[Content] = [
+            ContentText(text=OPEN_PROMPT.format(question=question)),
+            ContentImage(image=image_base64)
+        ]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            subset_key=record['type'],  # 'human_test' or 'augmented_split'
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        from .utils import relaxed_correctness
+
+        score = relaxed_correctness(filtered_prediction, reference)
+        score = 1.0 if score else 0.0
+
+        return Score(
+            value={'relaxed_acc': score},
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+        )
evalscope/benchmarks/chartqa/utils.py

@@ -0,0 +1,38 @@
+def relaxed_correctness(prediction: str, target: str, max_relative_change: float = 0.05) -> bool:
+    """Calculates relaxed correctness.
+
+    The correctness tolerates certain error ratio defined by max_relative_change.
+    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
+    “Following Methani et al. (2020), we use a relaxed accuracy measure for the
+    numeric answers to allow a minor inaccuracy that may result from the automatic
+    data extraction process. We consider an answer to be correct if it is within
+    5% of the gold answer. For non-numeric answers, we still need an exact match
+    to consider an answer to be correct.”
+
+    This funcion is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
+    Args:
+        target: List of target string.
+        prediction: List of predicted string.
+        max_relative_change: Maximum relative change.
+
+    Returns:
+        Whether the prediction was correct given the specified tolerance.
+    """  # noqa: E501
+
+    def _to_float(text: str):
+        try:
+            if text.endswith('%'):
+                # Convert percentages to floats.
+                return float(text.rstrip('%')) / 100.0
+            else:
+                return float(text)
+        except ValueError:
+            return None
+
+    prediction_float = _to_float(prediction)
+    target_float = _to_float(target)
+    if prediction_float is not None and target_float:
+        relative_change = abs(prediction_float - target_float) / abs(target_float)
+        return relative_change <= max_relative_change
+    else:
+        return prediction.lower() == target.lower()
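A quick illustration of the 5% tolerance implemented above; the values are made up, and the import path follows the file layout in the listing.

from evalscope.benchmarks.chartqa.utils import relaxed_correctness

relaxed_correctness('41', '40')        # True: |41 - 40| / 40 = 2.5% <= 5%
relaxed_correctness('43', '40')        # False: 7.5% relative error
relaxed_correctness('25%', '0.25')     # True: '25%' is parsed as 0.25
relaxed_correctness('Paris', 'paris')  # True: non-numeric answers fall back to case-insensitive exact match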
evalscope/benchmarks/docvqa/__init__.py
File without changes
evalscope/benchmarks/docvqa/docvqa_adapter.py

@@ -0,0 +1,67 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT = """Answer the question according to the image using a single word or phrase.
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='docvqa',
+        pretty_name='DocVQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'DocVQA (Document Visual Question Answering) is a benchmark designed to evaluate AI systems on their ability to answer questions based on the content of document images, such as scanned pages, forms, or invoices. Unlike general visual question answering, it requires understanding not just the text extracted by OCR, but also the complex layout, structure, and visual elements of a document.',  # noqa: E501
+        dataset_id='lmms-lab/DocVQA',
+        subset_list=['DocVQA'],
+        metric_list=['anls'],
+        eval_split='validation',
+        prompt_template=PROMPT,
+    )
+)
+class DocVQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answers')),  # answers is a list
+            metadata={
+                'questionId': record.get('questionId'),
+                'question_types': record.get('question_types'),
+                'docId': record.get('docId'),
+                'ucsf_document_id': record.get('ucsf_document_id'),
+                'ucsf_document_page_no': record.get('ucsf_document_page_no'),
+            }
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        import re
+
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()
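The adapter registers an anls metric; the scorer itself is outside this excerpt. As a reference point, here is a minimal sketch of ANLS (Average Normalized Levenshtein Similarity) as commonly defined for DocVQA, computed per prediction against a list of gold answers. This is an illustration, not the evalscope implementation.

def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance.
    dp = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        prev, dp[0] = dp[0], i
        for j, cb in enumerate(b, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (ca != cb))
    return dp[len(b)]

def anls_sketch(prediction: str, answers: list, threshold: float = 0.5) -> float:
    # Best normalized similarity over the gold answers, zeroed below the threshold.
    best = 0.0
    for ans in answers:
        pred, ref = prediction.strip().lower(), ans.strip().lower()
        dist = levenshtein(pred, ref)
        similarity = 1.0 - dist / max(len(pred), len(ref), 1)
        best = max(best, similarity)
    return best if best >= threshold else 0.0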
evalscope/benchmarks/drop/drop_adapter.py

@@ -54,7 +54,7 @@ class DROPAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        if self.few_shot_num != 0:
+        if self.few_shot_num != 0 and self.few_shot_num != 3:
            self.few_shot_num = 3
            logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
        else:
evalscope/benchmarks/general_arena/utils.py

@@ -34,7 +34,8 @@ def process_review_item(review_result: ReviewResult) -> list:
        'Index': str(review_result.index),
        'Input': review_result.input,
        'Question': review_result.input,  # Use input as question
-        'Generated':
+        'Generated':
+        prediction if prediction != extracted_prediction else extracted_prediction or '',  # Ensure no None value
        'Gold': target,
        'Pred': extracted_prediction,
        'Score': sample_score.score.model_dump(exclude_none=True),
evalscope/benchmarks/healthbench/__init__.py
File without changes