evalscope 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +4 -5
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +26 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +1 -1
- evalscope/config.py +6 -3
- evalscope/constants.py +1 -0
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +19 -46
- evalscope/perf/benchmark.py +64 -90
- evalscope/perf/main.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/db_util.py +5 -2
- evalscope/run.py +14 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/METADATA +42 -78
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/RECORD +45 -37
- tests/cli/test_all.py +33 -24
- tests/cli/test_run.py +69 -22
- tests/perf/test_perf.py +23 -0
- tests/rag/test_ragas.py +4 -1
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
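
The headline additions in the manifest above are three new benchmark adapters (alpaca_eval, arena_hard, mmlu_redux) and a random prompt dataset for the perf tool. As a quick orientation, here is a minimal sketch of driving one of the new adapters through the public run_task API; the dataset name and subset_list value are taken from the test changes shown below, while the model choice and limit are illustrative placeholders:

# Minimal sketch: smoke-test the new mmlu_redux adapter.
# 'mmlu_redux' and 'abstract_algebra' come from tests/cli/test_run.py below;
# the model and limit are placeholders, not part of this release's diff.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    datasets=['mmlu_redux'],
    dataset_args={'mmlu_redux': {'subset_list': ['abstract_algebra']}},
    limit=10,  # evaluate a small sample only
)
run_task(task_cfg=task_cfg)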
tests/cli/test_run.py
CHANGED
(Old-side text that the diff viewer truncated is marked with "…".)

@@ -203,15 +203,16 @@ class TestRun(unittest.TestCase):
         print(res)

     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def …
+    def test_run_one_task(self):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='…
+            model='Qwen/Qwen2.5-0.5B-Instruct',
             datasets=[
+                'iquiz',
                 # 'math_500',
                 # 'aime24',
-                'competition_math'
+                # 'competition_math'
             ],
             dataset_args={
                 'competition_math': {
@@ -223,12 +224,39 @@ class TestRun(unittest.TestCase):

         run_task(task_cfg=task_cfg)

+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_task_loop(self):
+        os.environ['CUDA_VISIBLE_DEVICES'] = '2'
+        from evalscope.config import TaskConfig
+
+        task_cfg1 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model1',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg2 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model2',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg3 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model3',
+            datasets=['iquiz'],
+            limit=10
+        )
+
+        run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
+
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_server_model(self):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='…
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
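
The new test_run_task_loop above pairs with the change to evalscope/run.py (+14 -2) in this release: run_task evidently now accepts a list of TaskConfig objects, with model_id keeping the runs' outputs apart. A condensed sketch of the same pattern, assuming list input works as the test implies:

# Condensed form of the multi-config pattern from test_run_task_loop.
from evalscope.config import TaskConfig
from evalscope.run import run_task

configs = [
    TaskConfig(
        model='Qwen/Qwen2.5-0.5B-Instruct',
        model_id=f'model{i}',  # distinguishes the three runs' outputs
        datasets=['iquiz'],
        limit=10,
    )
    for i in range(1, 4)
]
run_task(task_cfg=configs)  # each config is evaluated in turn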
@@ -251,10 +279,11 @@ class TestRun(unittest.TestCase):
                 # 'gpqa',
                 # 'arc',
                 # 'ceval',
-                'hellaswag',
+                # 'hellaswag',
                 # 'general_mcq',
-
+                'general_qa'
                 # 'super_gpqa',
+                # 'mmlu_redux'
             ],
             dataset_args={
                 'mmlu': {
@@ -308,23 +337,26 @@ class TestRun(unittest.TestCase):
                         'example',  # evaluation dataset name: the * in the *_dev.csv files above
                         # 'test'
                     ],
-                    'metric_list': ['…
+                    'metric_list': ['AverageRouge']
                 },
                 'super_gpqa': {
                     # 'subset_list': ['Philosophy', 'Education'],
                     'few_shot_num': 0
-                }
+                },
+                'mmlu_redux':{
+                    'subset_list': ['abstract_algebra']
+                },
             },
             eval_batch_size=32,
             limit=15,
-
+            debug=True,
             stream=False,
             generation_config={
                 'temperature': 0,
-                'n': …
+                'n': 2,
                 'max_tokens': 4096,
             },
-
+            use_cache='outputs/20250326_202848',
         )

         run_task(task_cfg=task_cfg)
@@ -365,32 +397,33 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='qwen2.5-…
+            model='qwen2.5-0.5b-instruct',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
                 # 'math_500',
-                'aime24',
+                # 'aime24',
                 # 'competition_math',
                 # 'arc',
                 # 'gsm8k'
                 # 'truthful_qa',
                 # 'simple_qa',
-                # …
+                # 'chinese_simpleqa',
                 # 'live_code_bench',
-                # 'humaneval'
-                # 'general_qa'
+                # 'humaneval',
+                # 'general_qa',
+                # 'alpaca_eval',
+                'arena_hard'
             ],
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4']
                 },
                 'live_code_bench': {
-                    'subset_list': ['v4_v5'],
                     'extra_params': {
-                        'start_date': '2024-…
-                        'end_date': '2025-…
+                        'start_date': '2024-08-01',
+                        'end_date': '2025-02-28'
                     },
                     'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
                 },
@@ -401,20 +434,34 @@ class TestRun(unittest.TestCase):
                         # 'test'
                     ]
                 },
+                'chinese_simpleqa': {
+                    'subset_list': [
+                        '中华文化'
+                    ]
+                },
             },
             eval_batch_size=5,
-            limit=…
+            limit=10,
             judge_strategy=JudgeStrategy.AUTO,
+            judge_worker_num=5,
             judge_model_args={
                 'model_id': 'qwen2.5-7b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                }
             },
             generation_config={
-                'max_new_tokens': …
+                'max_new_tokens': 20000,
                 'temperature': 0.0,
                 'seed': 42,
-                …
+                'n': 1
+            },
+            timeout=60000,
+            stream=True,
+            # use_cache='outputs/20250320_143658'
         )

         run_task(task_cfg=task_cfg)
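
The final hunk switches the test to arena_hard, one of the new judge-based benchmarks, and wires up an LLM judge. Below is a self-contained sketch of just that judge setup, with values copied from the test above; the constants import path and the .env file holding DASHSCOPE_API_KEY are assumptions inferred from the surrounding test code:

# Hedged sketch: LLM-as-judge configuration for the new arena_hard benchmark.
from dotenv import dotenv_values
from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy  # assumed import path
from evalscope.run import run_task

env = dotenv_values('.env')  # expects a line like DASHSCOPE_API_KEY=sk-...

task_cfg = TaskConfig(
    model='qwen2.5-0.5b-instruct',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=env.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['arena_hard'],
    limit=10,
    judge_strategy=JudgeStrategy.AUTO,  # invoke the judge only where needed
    judge_worker_num=5,                 # parallel judge calls
    judge_model_args={
        'model_id': 'qwen2.5-7b-instruct',
        'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'api_key': env.get('DASHSCOPE_API_KEY'),
        'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
    },
)
run_task(task_cfg=task_cfg)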
tests/perf/test_perf.py
CHANGED
@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+from dotenv import dotenv_values

+env = dotenv_values('.env')
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest

@@ -96,6 +98,27 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)

+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_local_random(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=20,
+            model='Qwen2.5-0.5B-Instruct',
+            url='http://127.0.0.1:8801/v1/chat/completions',
+            api='openai',
+            dataset='random',
+            min_tokens=1024,
+            max_tokens=1024,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            number=40,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            debug= True,
+        )
+        run_perf_benchmark(task_cfg)
+

 if __name__ == '__main__':
     unittest.main(buffer=False)
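
The new test drives the random_dataset.py plugin added in this release (+51 lines), which builds prompts of a controlled token length; that is why tokenizer_path is required. An illustrative sketch of the underlying idea, not the plugin's actual code: sample random token ids and decode them back into text.

# Illustrative only: the rough idea behind a random-token prompt generator.
# This is NOT evalscope's implementation; names and structure are invented.
import random
from transformers import AutoTokenizer

def random_prompt(tokenizer, prompt_len: int) -> str:
    # Decoding random ids yields text of roughly prompt_len tokens;
    # re-tokenizing may not round-trip to exactly the same count.
    ids = [random.randrange(tokenizer.vocab_size) for _ in range(prompt_len)]
    return tokenizer.decode(ids)

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
print(random_prompt(tokenizer, 1024)[:200])

Note seed=None in the test: without a fixed seed, each run benchmarks a fresh set of random prompts.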
tests/rag/test_ragas.py
CHANGED
@@ -1,5 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
 import unittest

 from evalscope.run import run_task

@@ -63,7 +66,7 @@ class TestRAGAS(unittest.TestCase):
             'eval': {
                 'testset_file': 'outputs/testset_chinese_with_answer.json',
                 'critic_llm': {
-                    'model_name_or_path': '…
+                    'model_name_or_path': 'Qwen/Qwen2.5-7B-Instruct',
                 },
                 'embeddings': {
                     'model_name_or_path': 'AI-ModelScope/m3e-base',
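
Both this file and test_perf.py now read credentials through python-dotenv rather than relying on pre-exported environment variables. A minimal sketch of the expected setup; the .env contents shown are placeholders:

# Hedged sketch of the dotenv pattern the updated tests use.
# .env in the working directory is assumed to contain, e.g.:
#   DASHSCOPE_API_KEY=sk-your-key-here
from dotenv import dotenv_values

env = dotenv_values('.env')             # parses .env into a dict; os.environ is untouched
api_key = env.get('DASHSCOPE_API_KEY')  # None if the key is missing
assert api_key, 'DASHSCOPE_API_KEY missing from .env'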
The four remaining files (LICENSE, WHEEL, entry_points.txt, and top_level.txt under the dist-info directory) are unchanged between 0.13.0 and 0.13.2.