evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +10 -45
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/metrics/metric.py +51 -0
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +52 -2
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/METADATA +6 -32
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/RECORD +51 -54
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/blink}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
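The file list above shows that 1.1.0 adds several new multimodal benchmark adapters (blink, chartqa, docvqa, infovqa, ocr_bench, ocr_bench_v2) along with metric and report changes. As a minimal sketch only — assuming these adapters register under their directory names as dataset identifiers, and that the TaskConfig/run_task API used in the removed tests below is unchanged in 1.1.0 — one of the new benchmarks could be invoked roughly like this:

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

# Hypothetical example: 'docvqa' is assumed to be the dataset name registered
# by the new evalscope/benchmarks/docvqa adapter added in this release.
task_cfg = TaskConfig(
    model='qwen-vl-max',  # any OpenAI-compatible VLM endpoint
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key='YOUR_API_KEY',
    eval_type=EvalType.SERVICE,
    datasets=['docvqa'],
    limit=5,  # small smoke-test run
)
run_task(task_cfg=task_cfg)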
tests/cli/test_custom.py
DELETED
@@ -1,268 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-
-from tests.utils import test_level_list
-
-env = dotenv_values('.env')
-
-import os
-import subprocess
-import unittest
-
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, OutputType
-from evalscope.run import run_task
-from evalscope.utils.import_utils import is_module_installed
-from evalscope.utils.logger import get_logger
-
-os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-
-logger = get_logger()
-
-
-class TestRunCustom(unittest.TestCase):
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_custom_task(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='Qwen/Qwen3-0.6B',
-            datasets=[
-                'general_mcq',
-                'general_qa'
-            ],
-            dataset_args={
-                'general_mcq': {
-                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
-                    'subset_list': [
-                        'example'  # subset name: the * in the *_dev.csv files above
-                    ],
-                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                    'subset_list': [
-                        'example'  # subset name: the * in the *_dev.csv files above
-                    ]
-                }
-            },
-        )
-        res = run_task(task_cfg=task_cfg)
-        print(res)
-
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_local_dataset(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='qwen-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=[
-                # 'mmlu',
-                # 'race',
-                'trivia_qa',
-                # 'cmmlu',
-                # 'humaneval',
-                # 'gsm8k',
-                # 'bbh',
-                # 'competition_math',
-                # 'arc',
-                # 'ceval',
-            ],
-            dataset_args={
-                'mmlu': {
-                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                    'few_shot_num': 0,
-                    'dataset_id': 'data/data/mmlu',
-                },
-                'ceval': {
-                    'subset_list': [
-                        'computer_network', 'operating_system', 'computer_architecture'
-                    ],
-                    'few_shot_num': 0,
-                    'dataset_id': 'data/data/ceval',
-                },
-                'cmmlu': {
-                    'subset_list': ['elementary_chinese'],
-                    'dataset_id': 'data/data/cmmlu',
-                    'few_shot_num': 0
-                },
-                'bbh': {
-                    'subset_list': ['word_sorting', 'movie_recommendation'],
-                },
-                'humaneval': {
-                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                },
-                'trivia_qa': {
-                    'dataset_id': 'data/data/trivia_qa',
-                },
-            },
-            eval_batch_size=10,
-            limit=5,
-            debug=True,
-            stream=True,
-            generation_config={
-                'temperature': 0,
-                'n': 1,
-                'max_tokens': 4096,
-            },
-            ignore_errors=False,
-        )
-
-        run_task(task_cfg=task_cfg)
-
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_general_no_answer(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='qwen2.5-7b-instruct',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=[
-                'general_qa',
-            ],
-            dataset_args={
-                'general_qa': {
-                    'dataset_id': 'custom_eval/text/qa',
-                    'subset_list': [
-                        'arena',
-                        # 'example'
-                    ],
-                }
-            },
-            eval_batch_size=10,
-            limit=10,
-            debug=True,
-            stream=True,
-            generation_config={
-                'temperature': 0,
-                'n': 1,
-                'max_tokens': 4096,
-            },
-            ignore_errors=False,
-            judge_model_args={
-                'model_id': 'qwen2.5-7b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 4096
-                },
-                'score_type': 'numeric',
-                'prompt_template': """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
-Begin your evaluation by providing a short explanation. Be as objective as possible.
-After providing your explanation, you must rate the response on a scale of 0 (worst) to 100 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\"
-
-[Question]
-{question}
-
-[Response]
-{pred}
-"""
-            },
-            judge_worker_num=5,
-            judge_strategy=JudgeStrategy.LLM,
-        )
-
-        run_task(task_cfg=task_cfg)
-
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_general_with_answer(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='qwen-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=[
-                'general_qa',
-            ],
-            dataset_args={
-                'general_qa': {
-                    'dataset_id': 'custom_eval/text/qa',
-                    'subset_list': [
-                        'example'
-                    ],
-                }
-            },
-            eval_batch_size=10,
-            limit=10,
-            debug=True,
-            stream=True,
-            generation_config={
-                'temperature': 0,
-                'n': 1,
-                'max_tokens': 4096,
-            },
-            ignore_errors=False,
-            judge_model_args={
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 4096
-                },
-                'score_type': 'pattern',
-            },
-            judge_worker_num=1,
-            judge_strategy=JudgeStrategy.LLM_RECALL,
-            use_cache='outputs/20250818_170420'
-        )
-
-        run_task(task_cfg=task_cfg)
-
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_general_arena(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model_id='Arena',
-            datasets=[
-                'general_arena',
-            ],
-            dataset_args={
-                'general_arena': {
-                    'extra_params': {
-                        'models': [
-                            {
-                                'name': 'qwen2.5-7b',
-                                'report_path': 'outputs/20250819_165034/reports/qwen2.5-7b-instruct'
-                            },
-                            {
-                                'name': 'qwen2.5-72b',
-                                'report_path': 'outputs/20250819_164926/reports/qwen2.5-72b-instruct'
-                            }
-                        ],
-                        'baseline': 'qwen2.5-72b'
-                    }
-                }
-            },
-            eval_batch_size=10,
-            limit=10,
-            debug=True,
-            stream=True,
-            ignore_errors=False,
-            judge_model_args={
-                'model_id': 'qwen-plus',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 8000
-                },
-            },
-            judge_worker_num=5,
-            # use_cache='outputs/20250819_173546'
-        )
-
-        run_task(task_cfg=task_cfg)
tests/cli/test_reasoning.py
DELETED
@@ -1,81 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-
-env = dotenv_values('.env')
-
-import unittest
-from unittest import TestCase
-
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, OutputType
-from evalscope.run import run_task
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class TestReasoning(TestCase):
-    """Benchmark evaluation test cases."""
-
-    def setUp(self):
-        """Setup common test configuration."""
-        self.base_config = {
-            'model': 'Qwen3-0.6B',
-            'api_url': 'http://0.0.0.0:8801/v1',
-            'api_key': env.get('DASHSCOPE_API_KEY'),
-            'eval_type': EvalType.SERVICE,
-            'eval_batch_size': 5,
-            'limit': 5,
-            'generation_config': {
-                'max_tokens': 4096,
-                'temperature': 0.0,
-                'seed': 42,
-                'parallel_tool_calls': True,
-                'extra_body': {'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
-            },
-            'judge_strategy': JudgeStrategy.AUTO,
-            'judge_worker_num': 5,
-            'judge_model_args': {
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 4096,
-                }
-            },
-            'debug': True,
-        }
-
-    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-        """Helper method to run test for a specific dataset."""
-        config = self.base_config.copy()
-        config['datasets'] = [dataset_name]
-
-        if use_mock:
-            config['eval_type'] = EvalType.MOCK_LLM
-
-        # apply configuration overrides
-        config.update(config_overrides)
-
-        if dataset_args:
-            config['dataset_args'] = {dataset_name: dataset_args}
-
-        task_cfg = TaskConfig(**config)
-        run_task(task_cfg=task_cfg)
-
-    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-        """Helper method to test dataset loading."""
-
-        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
-
-    # Math & Reasoning datasets
-    def test_gsm8k(self):
-        """Test GSM8K math reasoning dataset."""
-        self._run_dataset_test('gsm8k')
-
-
-if __name__ == '__main__':
-    # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
-    # Run all tests: python -m unittest test_eval.TestBenchmark
-    unittest.main()
tests/common.py
DELETED
@@ -1,73 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-
-env = dotenv_values('.env')
-
-import unittest
-from unittest import TestCase
-
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy
-from evalscope.run import run_task
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class TestBenchmark(TestCase):
-    """Benchmark evaluation test cases."""
-
-    def setUp(self):
-        """Setup common test configuration."""
-        self.base_config = {
-            'model': 'qwen-plus',
-            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-            'api_key': env.get('DASHSCOPE_API_KEY'),
-            'eval_type': EvalType.SERVICE,
-            'eval_batch_size': 5,
-            'limit': 5,
-            'generation_config': {
-                'max_tokens': 4096,
-                'temperature': 0.0,
-                'seed': 42,
-                'parallel_tool_calls': True
-            },
-            'judge_strategy': JudgeStrategy.AUTO,
-            'judge_worker_num': 5,
-            'judge_model_args': {
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 4096,
-                }
-            },
-            'debug': True,
-        }
-
-    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-        """Helper method to run test for a specific dataset."""
-        config = self.base_config.copy()
-        config['datasets'] = [dataset_name]
-
-        if not env.get('DASHSCOPE_API_KEY'):
-            use_mock = True
-            logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
-
-        if use_mock:
-            config['eval_type'] = EvalType.MOCK_LLM
-
-        # apply configuration overrides
-        config.update(config_overrides)
-
-        if dataset_args:
-            config['dataset_args'] = {dataset_name: dataset_args}
-
-        task_cfg = TaskConfig(**config)
-        run_task(task_cfg=task_cfg)
-
-    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-        """Helper method to test dataset loading."""
-
-        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
tests/perf/__init__.py
DELETED
@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
tests/perf/test_perf.py
DELETED
@@ -1,206 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-
-env = dotenv_values('.env')
-import unittest
-
-from evalscope.perf.main import run_perf_benchmark
-from tests.utils import test_level_list
-
-
-class TestPerf(unittest.TestCase):
-
-    def setUp(self) -> None:
-        pass
-
-    def tearDown(self) -> None:
-        pass
-
-
-    def test_run_perf(self):
-        task_cfg = {
-            'url': 'http://127.0.0.1:8001/v1/chat/completions',
-            'parallel': 1,
-            'model': 'qwen2.5',
-            'number': 15,
-            'api': 'openai',
-            'dataset': 'openqa',
-            # 'stream': True,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-
-    def test_run_perf_stream(self):
-        task_cfg = {
-            'url': 'http://127.0.0.1:8801/v1/chat/completions',
-            'parallel': 1,
-            'model': 'Qwen2.5-0.5B-Instruct',
-            'number': 15,
-            'api': 'openai',
-            'dataset': 'openqa',
-            'stream': True,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-
-    def test_run_perf_speed_benchmark(self):
-        task_cfg = {
-            'url': 'http://127.0.0.1:8001/v1/completions',
-            'parallel': 1,
-            'model': 'qwen2.5',
-            'api': 'openai',
-            'dataset': 'speed_benchmark',
-            'min_tokens': 2048,
-            'max_tokens': 2048,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-
-    def test_run_perf_local(self):
-        task_cfg = {
-            'parallel': 1,
-            'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-            'number': 5,
-            'api': 'local',
-            'dataset': 'openqa',
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-
-    def test_run_perf_local_stream(self):
-        task_cfg = {
-            'parallel': 1,
-            'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-            'number': 5,
-            'api': 'local',
-            'dataset': 'openqa',
-            'stream': True,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-
-    def test_run_perf_local_speed_benchmark(self):
-        task_cfg = {
-            'parallel': 1,
-            'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-            'api': 'local_vllm',
-            'dataset': 'speed_benchmark',
-            'min_tokens': 2048,
-            'max_tokens': 2048,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-
-    def test_run_perf_local_random(self):
-        from evalscope.perf.arguments import Arguments
-        task_cfg = Arguments(
-            parallel=20,
-            model='Qwen3-1.7B',
-            url='http://127.0.0.1:8801/v1/completions',
-            api='openai',
-            dataset='random',
-            min_tokens=1024,
-            max_tokens=1024,
-            prefix_length=0,
-            min_prompt_length=1024,
-            max_prompt_length=1024,
-            number=20,
-            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-            seed=None,
-            extra_args={'ignore_eos': True}
-        )
-        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-        print(metrics_result)
-        print(percentile_result)
-
-    def test_run_completion_endpoint(self):
-        if not env.get('DASHSCOPE_API_KEY'):
-            self.skipTest('DASHSCOPE_API_KEY is not set.')
-            return
-
-        from evalscope.perf.arguments import Arguments
-        task_cfg = Arguments(
-            parallel=[1, 2],
-            number=[2, 4],
-            model='qwen2.5-coder-7b-instruct',
-            url='https://dashscope.aliyuncs.com/compatible-mode/v1/completions',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            api='openai',
-            dataset='random',
-            min_tokens=100,
-            max_tokens=100,
-            prefix_length=0,
-            min_prompt_length=1024,
-            max_prompt_length=1024,
-            stream=False,
-            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-            seed=None,
-            extra_args={'ignore_eos': True}
-        )
-        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-        print(metrics_result)
-        print(percentile_result)
-
-
-    def test_run_perf_multi_parallel(self):
-        if not env.get('DASHSCOPE_API_KEY'):
-            self.skipTest('DASHSCOPE_API_KEY is not set.')
-            return
-
-        from evalscope.perf.arguments import Arguments
-        task_cfg = Arguments(
-            parallel=[1, 2],
-            number=[2, 4],
-            model='qwen-plus',
-            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            api='openai',
-            dataset='random',
-            min_tokens=100,
-            max_tokens=100,
-            prefix_length=0,
-            min_prompt_length=1024,
-            max_prompt_length=1024,
-            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-            seed=None,
-            extra_args={'ignore_eos': True}
-        )
-        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-        print(metrics_result)
-        print(percentile_result)
-
-
-    def test_run_perf_random_vl(self):
-        from evalscope.perf.arguments import Arguments
-        task_cfg = Arguments(
-            parallel=[1, 2],
-            number=[2, 4],
-            model='qwen-vl-max',
-            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            api='openai',
-            dataset='random_vl',
-            min_tokens=100,
-            max_tokens=100,
-            prefix_length=0,
-            min_prompt_length=100,
-            max_prompt_length=100,
-            image_height=512,
-            image_width=512,
-            image_num=2,
-            tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',
-            seed=None,
-            extra_args={'ignore_eos': True}
-        )
-        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-        print(metrics_result)
-        print(percentile_result)
-
-if __name__ == '__main__':
-    unittest.main(buffer=False)