evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of evalscope might be problematic.
- evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +78 -2
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +45 -20
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -385
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -80
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -178
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py (new file)

@@ -0,0 +1,163 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.messages.content import Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'OE_MM_maths_en_COMP',
+    'OE_MM_maths_zh_CEE',
+    'OE_MM_maths_zh_COMP',
+    'OE_MM_physics_en_COMP',
+    'OE_MM_physics_zh_CEE',
+    'OE_TO_maths_en_COMP',
+    'OE_TO_maths_zh_CEE',
+    'OE_TO_maths_zh_COMP',
+    'OE_TO_physics_en_COMP',
+    'OE_TO_physics_zh_CEE',
+    'TP_MM_maths_en_COMP',
+    'TP_MM_maths_zh_CEE',
+    'TP_MM_maths_zh_COMP',
+    'TP_MM_physics_en_COMP',
+    'TP_TO_maths_en_COMP',
+    'TP_TO_maths_zh_CEE',
+    'TP_TO_maths_zh_COMP',
+    'TP_TO_physics_en_COMP',
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='olympiad_bench',
+        pretty_name='OlympiadBench',
+        tags=[Tags.MATH, Tags.REASONING],
+        description='OlympiadBench is an Olympiad-level bilingual multimodal '
+        'scientific benchmark, featuring 8,476 problems from '
+        'Olympiad-level mathematics and physics competitions, '
+        'including the Chinese college entrance exam. '
+        'In the subsets: `OE` stands for `Open-Ended`, '
+        '`TP` stands for `Theorem Proving`, '
+        '`MM` stands for `Multimodal`, '
+        '`TO` stands for `Text-Only`, '
+        '`CEE` stands for `Chinese Entrance Exam`, '
+        '`COMP` stands for `Comprehensive`. '
+        '**Note: The `TP` subsets can\'t be evaluated with auto-judge for now**.',
+        dataset_id='AI-ModelScope/OlympiadBench',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='train',
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
+)
+class OlympiadBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Generate prompt for a single item."""
+        from .utils import OlympiadBenchPrompter
+
+        question = record.get('question', '')
+        language = record.get('language', 'English')
+        subject = record.get('subject', 'Math')
+        question_type = record.get('question_type', '')
+        answer_type = record.get('answer_type', '')
+        is_multiple_answer = record.get('is_multiple_answer', False)
+        unit = record.get('unit', '')
+        # Generate prompt
+        prompt = OlympiadBenchPrompter().make_prompt(
+            problem=question,
+            language=language,
+            subject=subject,
+            question_type=question_type,
+            answer_type=answer_type,
+            is_multiple_answer=is_multiple_answer,
+            unit=unit,
+        )
+        # Construct content list
+        content_list: List[Content] = []
+        # Add images if available
+        for i in range(9):
+            image = record.get(f'image_{i+1}')
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='jpg', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+                prompt = prompt.replace(f'<image_{i+1}>', f'[image_{i+1}]')  # replace html tag
+        # Add text content
+        content_list.insert(0, ContentText(text=prompt))
+
+        final_answer = record.get('final_answer', [])
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=','.join(final_answer) if final_answer else '',
+            metadata={
+                'id': record.get('id', ''),
+                'subfield': record.get('subfield', ''),
+                'context': record.get('context', ''),
+                'solution': record.get('solution', []),
+                'final_answer': record.get('final_answer', []),
+                'is_multiple_answer': is_multiple_answer,
+                'unit': unit,
+                'answer_type': answer_type,
+                'question_type': question_type,
+                'language': language,
+                'subject': subject,
+                'error': record.get('error', None),
+            },
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
+        import re
+
+        if task_state.metadata['language'] == 'Chinese':
+            matches = re.findall('所以最终答案是(.*)', prediction)
+        else:
+            matches = re.findall('So the final answer is (.*)', prediction)
+
+        # If found matches, take the last one, otherwise return the whole text
+        if matches:
+            return matches[-1].strip()
+        return prediction
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        from .utils import MathJudger
+
+        judger = MathJudger()
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        question = task_state.metadata
+        model_answer = filtered_prediction
+        # Get precision/error threshold from reference if available
+        answer_type = question['answer_type']
+        try:
+            if 'Tuple' in answer_type:  # no need_human_evaluate cases in the data that can currently be auto-judged
+                judge_result = judger.judge(model_answer, question['final_answer'][0])
+            else:
+                if question['error']:
+                    if ',' in question['error']:
+                        precisions = question['error'].split(',')
+                        precisions = [float(p) if p else 1e-8 for p in precisions]
+                        judge_result = judger.judge(model_answer, question['final_answer'][0], precisions)
+                    else:
+                        precision = float(question['error'])
+                        judge_result = judger.judge(model_answer, question['final_answer'][0], precision)
+                else:
+                    judge_result = judger.judge(model_answer, question['final_answer'][0])
+        except Exception as e:
+            logger.warning(f'Error in judging answer: {e}')
+            judge_result = False
+
+        score.value = {'acc': float(judge_result)}
+        return score