evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +15 -18
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +12 -11
- evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +59 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
- evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +85 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +14 -5
- evalscope/config.py +15 -2
- evalscope/constants.py +14 -0
- evalscope/evaluator/evaluator.py +51 -13
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/benchmark.py +5 -0
- evalscope/perf/http_client.py +15 -5
- evalscope/perf/main.py +1 -0
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +28 -2
- tests/cli/test_run.py +201 -32
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
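Most of the new surface in 0.13.0 is exercised by the reworked tests/cli/test_run.py below: new benchmark adapters (chinese_simple_qa, simple_qa, super_gpqa, live_code_bench), an LLM judge (evalscope/metrics/llm_judge.py, with JudgeStrategy exported from evalscope/constants.py), and per-dataset response filters (evalscope/utils/filters.py). As orientation before the diff, here is a minimal sketch of the new filters dataset argument, mirroring the ifeval entry used in the updated test; the model name and the small limit are copied from that test, and combining them this way is an illustration rather than a configuration shipped in the release.

from evalscope.config import TaskConfig
from evalscope.run import run_task

# Sketch (not taken verbatim from the release): apply a response filter before scoring.
# 'remove_until': '</think>' is the filter wired into the ifeval dataset_args in the
# test below; the name suggests it strips model output up to that marker before metrics run.
task_cfg = TaskConfig(
    model='qwen/Qwen2.5-0.5B-Instruct',
    datasets=['ifeval'],
    dataset_args={
        'ifeval': {
            'filters': {
                'remove_until': '</think>'
            }
        }
    },
    limit=2,  # smoke-test sized run, as in the test suite
)
run_task(task_cfg=task_cfg)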
tests/cli/test_run.py
CHANGED

@@ -1,10 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
 import os
 import subprocess
-import torch
 import unittest
 
-from evalscope.
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger

@@ -71,21 +75,104 @@ class TestRun(unittest.TestCase):
 
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_task(self):
-        task_cfg =
-
-
-
-
-
-
-
-
-
-
-
-
-
+        task_cfg = TaskConfig(
+            model='qwen/Qwen2.5-0.5B-Instruct',
+            datasets=[
+                'iquiz',
+                # 'ifeval',
+                # 'mmlu',
+                # 'mmlu_pro',
+                # 'musr',
+                # 'process_bench',
+                # 'race',
+                # 'trivia_qa',
+                # 'cmmlu',
+                # 'humaneval',
+                # 'super_gpqa',
+                # 'gsm8k',
+                # 'bbh',
+                # 'competition_math',
+                # 'math_500',
+                'aime24',
+                'gpqa',
+                # 'arc',
+                # 'ceval',
+                # 'hellaswag',
+                # 'general_mcq',
+                # 'general_qa'
+            ],
+            dataset_args={
+                'mmlu': {
+                    'subset_list': ['elementary_mathematics'],
+                    'few_shot_num': 0
+                },
+                'mmlu_pro': {
+                    'subset_list': ['math', 'health'],
+                    'few_shot_num': 4
+                },
+                'ceval': {
+                    'subset_list': [
+                        'computer_network', 'operating_system', 'computer_architecture'
+                    ],
+                    'few_shot_num': 0
+                },
+                'cmmlu': {
+                    'subset_list': ['elementary_chinese'],
+                    'few_shot_num': 0
+                },
+                'bbh': {
+                    'subset_list': ['word_sorting', 'movie_recommendation'],
+                },
+                'gpqa': {
+                    'subset_list': ['gpqa_diamond'],
+                    'few_shot_num': 0
+                },
+                'humaneval': {
+                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+                },
+                'competition_math': {
+                    'subset_list': ['Level 1']
+                },
+                'process_bench': {
+                    'subset_list': ['gsm8k'],
+                },
+                'musr': {
+                    'subset_list': ['murder_mysteries']
+                },
+                'general_mcq': {
+                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
+                    'subset_list': [
+                        'example'  # evaluation subset name, the * in the *_dev.csv files above
+                    ],
+                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # custom dataset path
+                    'subset_list': [
+                        'example',  # evaluation subset name, the * in the *_dev.csv files above
+                        # 'test'
+                    ],
+                    'metric_list': ['AverageBLEU']
+                },
+                'super_gpqa': {
+                    'subset_list': ['Philosophy', 'Education'],
+                    'few_shot_num': 0
+                },
+                'ifeval': {
+                    'filters': {
+                        'remove_until': '</think>'
+                    }
+                }
+            },
+            limit=2,
+            eval_batch_size=2,
+            generation_config={
+                'max_new_tokens': 2048,
+                'temperature': 0.7,
+                'num_return_sequences': 1,
+            },
+            # debug=True
+        )
         run_task(task_cfg=task_cfg)
 
 

@@ -141,12 +228,12 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
-            api_url='
-            api_key='
+            model='qwen2.5-7b-instruct',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
-                'iquiz',
+                # 'iquiz',
                 # 'ifeval',
                 # 'mmlu',
                 # 'mmlu_pro',

@@ -164,11 +251,14 @@ class TestRun(unittest.TestCase):
                 # 'gpqa',
                 # 'arc',
                 # 'ceval',
-
+                'hellaswag',
+                # 'general_mcq',
+                # 'general_qa'
+                # 'super_gpqa',
             ],
             dataset_args={
                 'mmlu': {
-                    'subset_list': ['elementary_mathematics'],
+                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
                     'few_shot_num': 0
                 },
                 'mmlu_pro': {

@@ -189,8 +279,9 @@ class TestRun(unittest.TestCase):
                     'subset_list': ['word_sorting', 'movie_recommendation'],
                 },
                 'gpqa': {
-                    'subset_list': ['gpqa_diamond'],
-                    'few_shot_num': 0
+                    # 'subset_list': ['gpqa_diamond'],
+                    'few_shot_num': 0,
+                    'local_path': './data/data/gpqa',
                 },
                 'humaneval': {
                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],

@@ -204,17 +295,36 @@ class TestRun(unittest.TestCase):
                 'musr': {
                     'subset_list': ['murder_mysteries']
                 },
+                'general_mcq': {
+                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
+                    'subset_list': [
+                        'example'  # evaluation subset name, the * in the *_dev.csv files above
+                    ],
+                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # custom dataset path
+                    'subset_list': [
+                        'example',  # evaluation subset name, the * in the *_dev.csv files above
+                        # 'test'
+                    ],
+                    'metric_list': ['AverageBLEU']
+                },
+                'super_gpqa': {
+                    # 'subset_list': ['Philosophy', 'Education'],
+                    'few_shot_num': 0
+                }
             },
-            eval_batch_size=
-            limit=
-            debug=True,
-            stream=
+            eval_batch_size=32,
+            limit=15,
+            # debug=True,
+            stream=False,
             generation_config={
-                'temperature': 0
+                'temperature': 0,
                 'n': 1,
-                'max_tokens':
+                'max_tokens': 4096,
             },
-            # use_cache='
+            # use_cache='./outputs/20250212_150525',
         )
 
         run_task(task_cfg=task_cfg)

@@ -250,5 +360,64 @@ class TestRun(unittest.TestCase):
 
         run_task(task_cfg=task_cfg)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_judge_model(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen2.5-7b-instruct',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                # 'math_500',
+                'aime24',
+                # 'competition_math',
+                # 'arc',
+                # 'gsm8k'
+                # 'truthful_qa',
+                # 'simple_qa',
+                # # 'chinese_simpleqa',
+                # 'live_code_bench',
+                # 'humaneval'
+                # 'general_qa'
+            ],
+            dataset_args={
+                'competition_math': {
+                    'subset_list': ['Level 4']
+                },
+                'live_code_bench': {
+                    'subset_list': ['v4_v5'],
+                    'extra_params': {
+                        'start_date': '2024-12-01',
+                        'end_date': '2025-01-01'
+                    },
+                    'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # custom dataset path
+                    'subset_list': [
+                        'example',  # evaluation subset name, the * in the *_dev.csv files above
+                        # 'test'
+                    ]
+                },
+            },
+            eval_batch_size=5,
+            limit=5,
+            judge_strategy=JudgeStrategy.AUTO,
+            judge_model_args={
+                'model_id': 'qwen2.5-7b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+            },
+            generation_config={
+                'max_new_tokens': 2048,
+                'temperature': 0.0,
+                'seed': 42,
+            }
+        )
+
+        run_task(task_cfg=task_cfg)
+
 if __name__ == '__main__':
     unittest.main()
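The new live_code_bench adapter accepts an extra_params date window, and the judge-model settings (judge_strategy, judge_model_args) appear in test_run_judge_model above. Pulled out of the unittest scaffolding, a standalone sketch might look like the following. The endpoint, model id, key handling, limits, and all dictionary values are copied from that test; running live_code_bench uncommented is an assumption (the shipped test keeps it commented out), and the environment-specific local_path from the test is omitted.

from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.run import run_task

# Expects DASHSCOPE_API_KEY in a local .env file, as in the updated tests.
env = dotenv_values('.env')

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=env.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['live_code_bench'],  # assumption: enabled here, commented out in the shipped test
    dataset_args={
        'live_code_bench': {
            'subset_list': ['v4_v5'],
            'extra_params': {
                'start_date': '2024-12-01',  # date window passed through to the adapter
                'end_date': '2025-01-01'
            },
        }
    },
    judge_strategy=JudgeStrategy.AUTO,  # judge settings taken verbatim from test_run_judge_model
    judge_model_args={
        'model_id': 'qwen2.5-7b-instruct',
        'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'api_key': env.get('DASHSCOPE_API_KEY'),
    },
    eval_batch_size=5,
    limit=5,
)
run_task(task_cfg=task_cfg)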