evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
tests/cli/test_custom.py
ADDED
@@ -0,0 +1,261 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+from tests.utils import test_level_list
+
+env = dotenv_values('.env')
+
+import os
+import subprocess
+import unittest
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
+from evalscope.run import run_task
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.logger import get_logger
+
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
+
+logger = get_logger()
+
+
+class TestRunCustom(unittest.TestCase):
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_custom_task(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='Qwen/Qwen3-0.6B',
+            datasets=[
+                'general_mcq',
+                'general_qa'
+            ],
+            dataset_args={
+                'general_mcq': {
+                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
+                    'subset_list': [
+                        'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
+                    ],
+                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
+                    'subset_list': [
+                        'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
+                    ]
+                }
+            },
+        )
+        res = run_task(task_cfg=task_cfg)
+        print(res)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_local_dataset(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key= env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                # 'mmlu',
+                # 'race',
+                'trivia_qa',
+                # 'cmmlu',
+                # 'humaneval',
+                # 'gsm8k',
+                # 'bbh',
+                # 'competition_math',
+                # 'arc',
+                # 'ceval',
+            ],
+            dataset_args={
+                'mmlu': {
+                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
+                    'few_shot_num': 0,
+                    'dataset_id': 'data/data/mmlu',
+                },
+                'ceval': {
+                    'subset_list': [
+                        'computer_network', 'operating_system', 'computer_architecture'
+                    ],
+                    'few_shot_num': 0,
+                    'dataset_id': 'data/data/ceval',
+                },
+                'cmmlu': {
+                    'subset_list': ['elementary_chinese'],
+                    'dataset_id': 'data/data/cmmlu',
+                    'few_shot_num': 0
+                },
+                'bbh': {
+                    'subset_list': ['word_sorting', 'movie_recommendation'],
+                },
+                'humaneval': {
+                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+                },
+                'trivia_qa': {
+                    'dataset_id': 'data/data/trivia_qa',
+                },
+            },
+            eval_batch_size=10,
+            limit=5,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_no_answer(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen2.5-72b-instruct',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key= env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'general_qa',
+            ],
+            dataset_args={
+                'general_qa': {
+                    'dataset_id': 'custom_eval/text/qa',
+                    'subset_list': [
+                        'arena',
+                        'example'
+                    ],
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                },
+                'score_type': 'numeric',
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.AUTO,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_with_answer(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key= env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'general_qa',
+            ],
+            dataset_args={
+                'general_qa': {
+                    'dataset_id': 'custom_eval/text/qa',
+                    'subset_list': [
+                        'example'
+                    ],
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                },
+                'score_type': 'pattern',
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.LLM,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_arena(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model_id='Arena',
+            datasets=[
+                'general_arena',
+            ],
+            dataset_args={
+                'general_arena': {
+                    'extra_params':{
+                        'models':[
+                            {
+                                'name': 'qwen2.5-0.5b',
+                                'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
+                            },
+                            {
+                                'name': 'qwen2.5-7b',
+                                'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
+                            },
+                            {
+                                'name': 'qwen2.5-72b',
+                                'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
+                            }
+                        ],
+                        'baseline': 'qwen2.5-7b'
+                    }
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen-plus',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 8000
+                },
+            },
+            judge_worker_num=5,
+            use_cache='outputs/20250702_165727'
+        )
+
+        run_task(task_cfg=task_cfg)

tests/cli/test_run.py
CHANGED
@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 
 import os
@@ -8,9 +10,9 @@ import subprocess
 import unittest
 
 from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
@@ -182,35 +184,6 @@ class TestRun(unittest.TestCase):
         run_task(task_cfg=task_cfg)
 
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_custom_task(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='Qwen/Qwen3-0.6B',
-            datasets=[
-                'general_mcq',
-                'general_qa'
-            ],
-            dataset_args={
-                'general_mcq': {
-                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
-                    'subset_list': [
-                        'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
-                    ],
-                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                    'subset_list': [
-                        'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
-                    ]
-                }
-            },
-        )
-        res = run_task(task_cfg=task_cfg)
-        print(res)
-
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_one_task(self):
         from evalscope.config import TaskConfig
@@ -293,7 +266,7 @@ class TestRun(unittest.TestCase):
             # 'musr',
             # 'process_bench',
             # 'race',
-
+            'trivia_qa',
             # 'cmmlu',
             # 'humaneval',
             # 'gsm8k',
@@ -306,7 +279,7 @@ class TestRun(unittest.TestCase):
             # 'ceval',
             # 'hellaswag',
             # 'general_mcq',
-            'general_qa',
+            # 'general_qa',
             # 'super_gpqa',
             # 'mmlu_redux',
             # 'maritime_bench',
@@ -315,6 +288,9 @@ class TestRun(unittest.TestCase):
             # 'tool_bench',
             # 'frames',
             # 'bfcl_v3',
+            # 'truthful_qa',
+            # 'tau_bench',
+            # 'hle'
         ],
         dataset_args={
             'mmlu': {
@@ -323,7 +299,7 @@ class TestRun(unittest.TestCase):
             },
             'mmlu_pro': {
                 'subset_list': ['math', 'health'],
-                'few_shot_num':
+                'few_shot_num': 0
             },
             'ceval': {
                 'subset_list': [
@@ -354,7 +330,6 @@ class TestRun(unittest.TestCase):
             },
             'musr': {
                 'subset_list': ['murder_mysteries'],
-                'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
             },
             'general_mcq': {
                 'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
@@ -378,59 +353,42 @@ class TestRun(unittest.TestCase):
             'mmlu_redux':{
                 'subset_list': ['abstract_algebra']
             },
+            'frames':{
+                'local_path': 'data/iic/frames',
+            },
             'bfcl_v3': {
                 'subset_list': ['parallel'],
                 'extra_params': {
                     # 'is_fc_model': False,
                 }
             },
+            'tau_bench': {
+                'extra_params': {
+                    'user_model': 'qwen-plus',
+                    'api_key': env.get('DASHSCOPE_API_KEY'),
+                    'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                }
+            },
+            'hle': {
+                'subset_list': ['Math', 'Other'],
+            },
         },
         eval_batch_size=10,
-        limit=
-        debug=True,
+        limit=10,
+        # debug=True,
         stream=True,
         generation_config={
-            'temperature': 0,
+            'temperature': 0.6,
             'n': 1,
             'max_tokens': 4096,
             # 'extra_headers':{'key': 'value'},
         },
         ignore_errors=False,
-        use_cache='outputs/test_2'
     )

     run_task(task_cfg=task_cfg)


-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_batch_eval(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='LLM-Research/Llama-3.2-1B-Instruct',
-            datasets=[
-                # 'math_500',
-                # 'aime24',
-                # 'competition_math'
-                # 'arc',
-                'gsm8k'
-                # 'truthful_qa'
-            ],
-            dataset_args={
-                'competition_math': {
-                    'subset_list': ['Level 4', 'Level 5']
-                }
-            },
-            eval_batch_size=2,
-            limit=5,
-            generation_config={
-                'max_new_tokens': 2048,
-                'temperature': 0.7,
-                'num_return_sequences': 2,
-            }
-        )
-
-        run_task(task_cfg=task_cfg)

     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_judge_model(self):
@@ -442,7 +400,7 @@ class TestRun(unittest.TestCase):
         api_key= env.get('DASHSCOPE_API_KEY'),
         eval_type=EvalType.SERVICE,
         datasets=[
-            'math_500',
+            # 'math_500',
             # 'aime24',
             # 'competition_math',
             # 'arc',
@@ -459,6 +417,7 @@ class TestRun(unittest.TestCase):
             # 'docmath',
             # 'needle_haystack',
             # 'ifeval',
+            'hle'
         ],
         dataset_args={
             'needle_haystack': {
@@ -491,7 +450,10 @@ class TestRun(unittest.TestCase):
             },
             'frames': {
                 'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
-            }
+            },
+            'hle': {
+                'subset_list': ['Math', 'Other'],
+            },
         },
         eval_batch_size=10,
         limit=3,
@@ -514,6 +476,7 @@ class TestRun(unittest.TestCase):
         },
         timeout=60000,
         stream=True,
+        use_cache='outputs/20250714_150626'
         # analysis_report=True,
         # debug=True,
         # use_cache='outputs/20250616_161931'
@@ -521,5 +484,6 @@ class TestRun(unittest.TestCase):
 
     run_task(task_cfg=task_cfg)
 
+
 if __name__ == '__main__':
     unittest.main()

tests/perf/test_perf.py
CHANGED
@@ -7,7 +7,7 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest
 
 from evalscope.perf.main import run_perf_benchmark
-from
+from tests.utils import test_level_list
 
 
 class TestPerf(unittest.TestCase):
@@ -35,9 +35,9 @@
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_perf_stream(self):
         task_cfg = {
-            'url': 'http://127.0.0.1:
+            'url': 'http://127.0.0.1:8801/v1/chat/completions',
             'parallel': 1,
-            'model': '
+            'model': 'Qwen2.5-0.5B-Instruct',
             'number': 15,
             'api': 'openai',
             'dataset': 'openqa',
@@ -126,7 +126,7 @@
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=[1, 2],
-            number=[2,
+            number=[2, 4],
             model='qwen2.5-7b-instruct',
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
@@ -145,5 +145,32 @@
         print(metrics_result)
         print(percentile_result)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_random_vl(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=[1, 2],
+            number=[2, 4],
+            model='qwen-vl-max',
+            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            api='openai',
+            dataset='kontext_bench',
+            min_tokens=100,
+            max_tokens=100,
+            prefix_length=0,
+            min_prompt_length=100,
+            max_prompt_length=100,
+            image_height=512,
+            image_width=512,
+            image_num=2,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            extra_args={'ignore_eos': True}
+        )
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)
+
 if __name__ == '__main__':
     unittest.main(buffer=False)

tests/rag/test_clip_benchmark.py
CHANGED
@@ -6,8 +6,9 @@ import subprocess
 import unittest
 
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger()
 

tests/rag/test_mteb.py
CHANGED
@@ -3,9 +3,11 @@
 import unittest
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

tests/rag/test_ragas.py
CHANGED
@@ -2,11 +2,13 @@
 import os
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 import unittest
 
 from evalscope import TaskConfig, run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

tests/swift/test_run_swift_eval.py
CHANGED
@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.opencompass import OpenCompassBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/swift/test_run_swift_vlm_eval.py
CHANGED
@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/swift/test_run_swift_vlm_jugde_eval.py
CHANGED
@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/utils.py
ADDED
@@ -0,0 +1,13 @@
+import os
+
+TEST_LEVEL_LIST = [0, 1]
+# Example: export TEST_LEVEL_LIST=0,1
+TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
+
+
+def test_level_list():
+    global TEST_LEVEL_LIST
+    if TEST_LEVEL_LIST_STR in os.environ:
+        TEST_LEVEL_LIST = [int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')]
+
+    return TEST_LEVEL_LIST
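For context, a minimal usage sketch of the new tests/utils.py helper (illustration only, not part of the diff; the test class and method names below are invented, while test_level_list, the TEST_LEVEL_LIST environment variable, and the skipUnless pattern are taken from the files shown above):

import unittest

from tests.utils import test_level_list  # helper added in this release


class TestExample(unittest.TestCase):  # hypothetical test case for illustration

    # Runs only when level 0 is enabled, e.g. `export TEST_LEVEL_LIST=0,1`.
    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_something(self):
        self.assertTrue(True)
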
tests/vlm/test_vlmeval.py
CHANGED
@@ -1,12 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 import unittest
 
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -62,7 +64,11 @@ class TestVLMEval(unittest.TestCase):
         task_cfg = {
             'eval_backend': 'VLMEvalKit',
             'eval_config': {
-                'data': [
+                'data': [
+                    # 'SEEDBench_IMG',
+                    # 'ChartQA_TEST',
+                    'MMDU'
+                ],
                 'limit': 5,
                 'mode': 'all',
                 'model': [
|