evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +20 -5
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +1 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/evaluator/evaluator.py +15 -12
- evalscope/metrics/__init__.py +6 -0
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
- evalscope/metrics/llm_judge.py +105 -20
- evalscope/metrics/metrics.py +1 -1
- evalscope/models/adapters/base_adapter.py +0 -2
- evalscope/models/adapters/server_adapter.py +2 -2
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/perf/arguments.py +2 -16
- evalscope/perf/main.py +1 -1
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +45 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +50 -2
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +13 -37
- tests/perf/test_perf.py +2 -2
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
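The change that recurs throughout the test diffs below is a pair of import moves: `is_module_installed` now lives in `evalscope.utils.import_utils`, and the per-level test gating helper is taken from a new `tests/utils.py` module. A minimal before/after sketch, with paths taken from the diffs that follow:

# EvalScope 0.16.2
# from evalscope.utils import is_module_installed

# EvalScope 0.17.0
from evalscope.utils.import_utils import is_module_installed
from tests.utils import test_level_list  # new helper added in this release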
tests/cli/test_custom.py
ADDED

@@ -0,0 +1,261 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+from tests.utils import test_level_list
+
+env = dotenv_values('.env')
+
+import os
+import subprocess
+import unittest
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
+from evalscope.run import run_task
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.logger import get_logger
+
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
+
+logger = get_logger()
+
+
+class TestRunCustom(unittest.TestCase):
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_custom_task(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='Qwen/Qwen3-0.6B',
+            datasets=[
+                'general_mcq',
+                'general_qa'
+            ],
+            dataset_args={
+                'general_mcq': {
+                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
+                    'subset_list': [
+                        'example'  # evaluation subset name: the * in the *_dev.csv files above
+                    ],
+                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # custom dataset path
+                    'subset_list': [
+                        'example'  # evaluation subset name: the * in the *_dev.csv files above
+                    ]
+                }
+            },
+        )
+        res = run_task(task_cfg=task_cfg)
+        print(res)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_local_dataset(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                # 'mmlu',
+                # 'race',
+                'trivia_qa',
+                # 'cmmlu',
+                # 'humaneval',
+                # 'gsm8k',
+                # 'bbh',
+                # 'competition_math',
+                # 'arc',
+                # 'ceval',
+            ],
+            dataset_args={
+                'mmlu': {
+                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
+                    'few_shot_num': 0,
+                    'dataset_id': 'data/data/mmlu',
+                },
+                'ceval': {
+                    'subset_list': [
+                        'computer_network', 'operating_system', 'computer_architecture'
+                    ],
+                    'few_shot_num': 0,
+                    'dataset_id': 'data/data/ceval',
+                },
+                'cmmlu': {
+                    'subset_list': ['elementary_chinese'],
+                    'dataset_id': 'data/data/cmmlu',
+                    'few_shot_num': 0
+                },
+                'bbh': {
+                    'subset_list': ['word_sorting', 'movie_recommendation'],
+                },
+                'humaneval': {
+                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+                },
+                'trivia_qa': {
+                    'dataset_id': 'data/data/trivia_qa',
+                },
+            },
+            eval_batch_size=10,
+            limit=5,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_no_answer(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen2.5-72b-instruct',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'general_qa',
+            ],
+            dataset_args={
+                'general_qa': {
+                    'dataset_id': 'custom_eval/text/qa',
+                    'subset_list': [
+                        'arena',
+                        'example'
+                    ],
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                },
+                'score_type': 'numeric',
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.AUTO,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_with_answer(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'general_qa',
+            ],
+            dataset_args={
+                'general_qa': {
+                    'dataset_id': 'custom_eval/text/qa',
+                    'subset_list': [
+                        'example'
+                    ],
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                },
+                'score_type': 'pattern',
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.LLM,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_arena(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model_id='Arena',
+            datasets=[
+                'general_arena',
+            ],
+            dataset_args={
+                'general_arena': {
+                    'extra_params': {
+                        'models': [
+                            {
+                                'name': 'qwen2.5-0.5b',
+                                'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
+                            },
+                            {
+                                'name': 'qwen2.5-7b',
+                                'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
+                            },
+                            {
+                                'name': 'qwen2.5-72b',
+                                'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
+                            }
+                        ],
+                        'baseline': 'qwen2.5-7b'
+                    }
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen-plus',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 8000
+                },
+            },
+            judge_worker_num=5,
+            use_cache='outputs/20250702_165727'
+        )
+
+        run_task(task_cfg=task_cfg)
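A minimal sketch of running one of the new custom-dataset tests on its own; the runner invocation is standard unittest rather than part of the diff, and it assumes the `custom_eval/text/mcq` data referenced above exists locally. The environment variable must be set before the module is imported, because the skip decorators are evaluated at import time:

import os
import unittest

os.environ['TEST_LEVEL_LIST'] = '0'  # enable level-0 tests before importing the module

from tests.cli.test_custom import TestRunCustom

suite = unittest.TestSuite()
suite.addTest(TestRunCustom('test_run_custom_task'))
unittest.TextTestRunner(verbosity=2).run(suite)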
tests/cli/test_run.py
CHANGED

@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 
 import os
@@ -8,9 +10,9 @@ import subprocess
 import unittest
 
 from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
@@ -182,35 +184,6 @@ class TestRun(unittest.TestCase):
         run_task(task_cfg=task_cfg)
 
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_custom_task(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='Qwen/Qwen3-0.6B',
-            datasets=[
-                'general_mcq',
-                'general_qa'
-            ],
-            dataset_args={
-                'general_mcq': {
-                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
-                    'subset_list': [
-                        'example'  # evaluation subset name: the * in the *_dev.csv files above
-                    ],
-                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',  # custom dataset path
-                    'subset_list': [
-                        'example'  # evaluation subset name: the * in the *_dev.csv files above
-                    ]
-                }
-            },
-        )
-        res = run_task(task_cfg=task_cfg)
-        print(res)
-
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_one_task(self):
         from evalscope.config import TaskConfig
@@ -286,7 +259,7 @@ class TestRun(unittest.TestCase):
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
-
+                'iquiz',
                 # 'ifeval',
                 # 'mmlu',
                 # 'mmlu_pro',
@@ -305,7 +278,7 @@ class TestRun(unittest.TestCase):
                 # 'arc',
                 # 'ceval',
                 # 'hellaswag',
-                'general_mcq',
+                # 'general_mcq',
                 # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
@@ -315,6 +288,7 @@ class TestRun(unittest.TestCase):
                 # 'tool_bench',
                 # 'frames',
                 # 'bfcl_v3',
+                # 'truthful_qa',
             ],
             dataset_args={
                 'mmlu': {
@@ -354,7 +328,6 @@ class TestRun(unittest.TestCase):
                 },
                 'musr': {
                     'subset_list': ['murder_mysteries'],
-                    'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
                 },
                 'general_mcq': {
                     'local_path': 'custom_eval/text/mcq',  # custom dataset path
@@ -378,6 +351,9 @@ class TestRun(unittest.TestCase):
                 'mmlu_redux':{
                     'subset_list': ['abstract_algebra']
                 },
+                'frames':{
+                    'local_path': 'data/iic/frames',
+                },
                 'bfcl_v3': {
                     'subset_list': ['parallel'],
                     'extra_params': {
@@ -385,9 +361,9 @@ class TestRun(unittest.TestCase):
                     }
                 },
             },
-            eval_batch_size=
+            eval_batch_size=1,
             limit=5,
-            debug=True,
+            # debug=True,
             stream=True,
            generation_config={
                'temperature': 0,
@@ -396,7 +372,6 @@ class TestRun(unittest.TestCase):
                # 'extra_headers':{'key': 'value'},
            },
            ignore_errors=False,
-            # use_cache='outputs/20250616_153756'
        )
 
        run_task(task_cfg=task_cfg)
@@ -521,5 +496,6 @@ class TestRun(unittest.TestCase):
 
        run_task(task_cfg=task_cfg)
 
+
 if __name__ == '__main__':
    unittest.main()
tests/perf/test_perf.py
CHANGED

@@ -7,7 +7,7 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest
 
 from evalscope.perf.main import run_perf_benchmark
-from
+from tests.utils import test_level_list
 
 
 class TestPerf(unittest.TestCase):
@@ -126,7 +126,7 @@ class TestPerf(unittest.TestCase):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=[1, 2],
-            number=[2,
+            number=[2, 4],
             model='qwen2.5-7b-instruct',
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
tests/rag/test_clip_benchmark.py
CHANGED

@@ -6,8 +6,9 @@ import subprocess
 import unittest
 
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger()
 
tests/rag/test_mteb.py
CHANGED

@@ -3,9 +3,11 @@
 import unittest
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
tests/rag/test_ragas.py
CHANGED

@@ -2,11 +2,13 @@
 import os
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 import unittest
 
 from evalscope import TaskConfig, run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
tests/swift/test_run_swift_eval.py
CHANGED

@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.opencompass import OpenCompassBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/swift/test_run_swift_vlm_eval.py
CHANGED

@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/swift/test_run_swift_vlm_jugde_eval.py
CHANGED

@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 
tests/utils.py
ADDED

@@ -0,0 +1,13 @@
+import os
+
+TEST_LEVEL_LIST = [0, 1]
+# Example: export TEST_LEVEL_LIST=0,1
+TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
+
+
+def test_level_list():
+    global TEST_LEVEL_LIST
+    if TEST_LEVEL_LIST_STR in os.environ:
+        TEST_LEVEL_LIST = [int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')]
+
+    return TEST_LEVEL_LIST
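The helper reads `TEST_LEVEL_LIST` from the environment on every call, so the level selection can be changed per process without touching the code. A small usage sketch:

import os

os.environ['TEST_LEVEL_LIST'] = '0'  # run only level-0 tests in this process

from tests.utils import test_level_list

print(test_level_list())  # -> [0]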
tests/vlm/test_vlmeval.py
CHANGED

@@ -1,12 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 import unittest
 
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -62,7 +64,11 @@ class TestVLMEval(unittest.TestCase):
         task_cfg = {
             'eval_backend': 'VLMEvalKit',
             'eval_config': {
-                'data': [
+                'data': [
+                    # 'SEEDBench_IMG',
+                    # 'ChartQA_TEST',
+                    'MMDU'
+                ],
                 'limit': 5,
                 'mode': 'all',
                 'model': [
evalscope/evaluator/rating_eval.py
REMOVED

@@ -1,157 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import pandas as pd
-import pyarrow as pa
-from typing import List, Union
-
-from evalscope.constants import MetricMembers
-from evalscope.utils.arena_utils import compute_elo
-from evalscope.utils.io_utils import jsonl_to_list
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-DEFAULT_COLUMNS_MAPPING = {'model_a': 'model_a', 'model_b': 'model_b', 'win': 'win', 'tstamp': 'ts', 'language': 'lang'}
-
-
-class RatingEvaluate(object):
-
-    def __init__(self, metrics: list, baseline_model: str = None, **kwargs):
-        self.metrics = metrics
-        self.baseline_model = baseline_model
-        self.kwargs = kwargs
-
-    def preprocess(self, raw_data_df: pd.DataFrame, **kwargs):
-
-        # Get battles data
-        raw_data_df = raw_data_df.sort_values(ascending=True, by=['tstamp'])
-        battles = raw_data_df[raw_data_df['anony']].reset_index(drop=True)
-
-        return battles
-
-    def compute_elo_rating(self, raw_data):
-        battles = self.preprocess(raw_data_df=raw_data)
-        elo_ratings = compute_elo(battles)
-        col_model = 'Model'
-        col_elo_rating = 'Elo_Rating'
-        elo_ratings_res = pd.DataFrame([[n, elo_ratings[n]] for n in elo_ratings.keys()],
-                                       columns=[col_model, col_elo_rating]).sort_values(
-                                           col_elo_rating, ascending=False).reset_index(drop=True)
-        elo_ratings_res = elo_ratings_res.round({col_elo_rating: 1})
-        return elo_ratings_res
-
-    def get_single_pairwise_rating(self, row: pd.Series):
-        tie = False
-        if 'win' in row:
-            win = row['win']
-            if win == 'tie':
-                tie = True
-            else:
-                if win == 'model_a':
-                    winner = row['model_a']
-                    loser = row['model_b']
-                else:
-                    winner = row['model_b']
-                    loser = row['model_a']
-        elif 'win_1' in row:
-            win_1 = row['win_1']
-            win_2 = row['win_2']
-            if win_1 == 'tie' or win_1 != win_2:
-                tie = True
-            else:
-                if win_1 == 'model_a':
-                    winner = row['model_a']
-                    loser = row['model_b']
-                else:
-                    winner = row['model_b']
-                    loser = row['model_a']
-        else:
-            raise ValueError('Unsupported data format')
-
-        if tie:
-            return [{
-                'model': row['model_a'],
-                'win': 0,
-                'loss': 0,
-                'tie': 1
-            }, {
-                'model': row['model_b'],
-                'win': 0,
-                'loss': 0,
-                'tie': 1
-            }]
-        else:
-            return [{'model': winner, 'win': 1, 'loss': 0, 'tie': 0}, {'model': loser, 'win': 0, 'loss': 1, 'tie': 0}]
-
-    def compute_pairwise_rating(self, raw_data):
-        df_all = self.preprocess(raw_data_df=raw_data)
-        model_list = (df_all['model_a'].unique().tolist() + df_all['model_b'].unique().tolist())
-        model_list = list(set(model_list))
-
-        list_res = []
-        # traverse df row by row
-        for index, row in df_all.iterrows():
-            if self.baseline_model is not None:
-                if self.baseline_model not in [row['model_a'], row['model_b']]:
-                    logger.warning(
-                        f'One of the models in the battle should be the baseline model: {self.baseline_model}')
-                    continue
-            rating = self.get_single_pairwise_rating(row)
-            list_res = list_res + rating
-
-        df = pd.DataFrame(list_res)
-        df = df.groupby(['model']).sum()
-
-        # remove baseline model
-        if self.baseline_model is not None:
-            df = df[df.index != self.baseline_model]
-        # add win rate
-        df['win_rate'] = df['win'] / (df['win'] + df['loss'] + df['tie'])
-        df['loss_rate'] = df['loss'] / (df['win'] + df['loss'] + df['tie'])
-        df['tie_rate'] = df['tie'] / (df['win'] + df['loss'] + df['tie'])
-        return df.sort_values(by='win_rate', ascending=False)
-
-    def compute_score_rating(self, raw_data):
-        df_all = self.preprocess(raw_data_df=raw_data)
-        df = df_all[['model', 'score']]
-
-        df_score = df.groupby(['model']).mean()
-        return df_score.sort_values(by='score', ascending=False)
-
-    def eval_samples(self, data_list: list):
-        res_all = []
-
-        raw_data: pd.DataFrame = None
-
-        if len(data_list) > 0:
-            raw_data = data_list[0]
-
-        for metric in self.metrics:
-
-            if metric == MetricMembers.ELO:
-                res = self.compute_elo_rating(raw_data)
-                res_all.append(res)
-
-            elif metric == MetricMembers.PAIRWISE:
-                res = self.compute_pairwise_rating(raw_data)
-                res_all.append(res)
-
-            elif metric == MetricMembers.SCORE:
-                res = self.compute_score_rating(raw_data)
-                res_all.append(res)
-
-            else:
-                raise ValueError(f'Unsupported metric: {metric}')
-
-        return res_all
-
-    def run(self, prompts: Union[str, list], **kwargs) -> List[pd.DataFrame]:
-        """
-        Load the predicted samples and evaluate them in arena mode.
-        """
-        # raw_data = pd.read_json(prompts)
-        data_list = jsonl_to_list(prompts)
-        data_df = pa.Table.from_pylist(data_list).to_pandas()
-        res_list = self.eval_samples([data_df])
-
-        return res_list
evalscope/evaluator/reviewer/__init__.py
REMOVED

@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.