evalscope 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl
- evalscope/arguments.py +10 -0
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +4 -2
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
- evalscope/benchmarks/tool_bench/utils.py +202 -0
- evalscope/benchmarks/utils.py +3 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/collections/evaluator.py +76 -26
- evalscope/config.py +46 -15
- evalscope/evaluator/evaluator.py +43 -15
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +3 -3
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +15 -19
- evalscope/perf/arguments.py +14 -5
- evalscope/perf/benchmark.py +0 -6
- evalscope/perf/main.py +65 -15
- evalscope/perf/utils/benchmark_util.py +33 -15
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/log_utils.py +1 -1
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/app.py +47 -34
- evalscope/report/utils.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA +45 -21
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD +46 -36
- tests/cli/test_all.py +3 -0
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +28 -12
- tests/perf/test_perf.py +23 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
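
Beyond housekeeping, the release adds three benchmark adapters (drop, tool_bench, winogrande), a rich_display module for perf runs, and a deprecation_utils helper. A minimal sketch of enabling the new benchmarks, assuming they register under the dataset names exercised in tests/cli/test_run.py below ('drop', 'winogrande', 'tool_bench'); the model and limit are placeholders:

# Sketch only: dataset names are taken from tests/cli/test_run.py;
# model and limit are placeholders, not prescribed by this release.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen3-1.7B',
    datasets=['drop', 'winogrande', 'tool_bench'],
    limit=10,  # small smoke-test sample, as in the tests
)
run_task(task_cfg=task_cfg)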
tests/cli/test_run.py
CHANGED
@@ -73,6 +73,12 @@ class TestRun(unittest.TestCase):
         logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
         logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_yaml_config(self):
+        from evalscope import run_task
+
+        run_task(task_cfg='examples/tasks/eval_native.yaml')
+
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_task(self):
         task_cfg = TaskConfig(
@@ -209,12 +215,16 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen3-1.7B',
             datasets=[
-
+                'iquiz',
                 # 'math_500',
-                'aime24',
+                # 'aime24',
                 # 'competition_math',
                 # 'mmlu',
+                # 'simple_qa',
             ],
+            model_args={
+                'device_map': 'auto',
+            },
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
@@ -232,7 +242,8 @@ class TestRun(unittest.TestCase):
                 'top_p': 0.8,  # top-p sampling (recommended in the Qwen report)
                 'top_k': 20,  # top-k sampling (recommended in the Qwen report)
                 'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
-            }
+            },
+            judge_strategy=JudgeStrategy.AUTO,
         )
 
         run_task(task_cfg=task_cfg)
@@ -276,7 +287,7 @@ class TestRun(unittest.TestCase):
             datasets=[
                 # 'iquiz',
                 # 'ifeval',
-                'mmlu',
+                # 'mmlu',
                 # 'mmlu_pro',
                 # 'musr',
                 # 'process_bench',
@@ -294,10 +305,13 @@ class TestRun(unittest.TestCase):
                 # 'ceval',
                 # 'hellaswag',
                 # 'general_mcq',
-                # 'general_qa'
+                # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
-                # 'maritime_bench'
+                # 'maritime_bench',
+                # 'drop',
+                # 'winogrande',
+                'tool_bench',
             ],
             dataset_args={
                 'mmlu': {
@@ -363,14 +377,16 @@ class TestRun(unittest.TestCase):
                 },
             },
             eval_batch_size=32,
-            limit=
+            limit=10,
             debug=True,
             stream=False,
             generation_config={
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
-            }
+            },
+            # ignore_errors=True,
+            use_cache='outputs/20250519_142106'
         )
 
         run_task(task_cfg=task_cfg)
@@ -423,12 +439,12 @@ class TestRun(unittest.TestCase):
                 # 'gsm8k'
                 # 'truthful_qa',
                 # 'simple_qa',
-
+                'chinese_simpleqa',
                 # 'live_code_bench',
                 # 'humaneval',
                 # 'general_qa',
                 # 'alpaca_eval',
-                'arena_hard'
+                # 'arena_hard'
             ],
             dataset_args={
                 'competition_math': {
@@ -454,7 +470,7 @@ class TestRun(unittest.TestCase):
                     ]
                 },
             },
-            eval_batch_size=
+            eval_batch_size=10,
             limit=10,
             judge_strategy=JudgeStrategy.AUTO,
             judge_worker_num=5,
@@ -475,7 +491,7 @@ class TestRun(unittest.TestCase):
             },
             timeout=60000,
             stream=True,
-
+            use_cache='outputs/20250519_142551'
         )
 
         run_task(task_cfg=task_cfg)
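
The new test_run_yaml_config suggests run_task now accepts a path to a YAML task file (likely backed by the config.py changes listed above). A minimal sketch, assuming the YAML mirrors TaskConfig fields; the actual contents of examples/tasks/eval_native.yaml are not part of this diff:

from evalscope import run_task

# Hypothetical YAML contents (the real examples/tasks/eval_native.yaml
# is not shown in this diff), mirroring TaskConfig fields:
#   model: Qwen/Qwen3-1.7B
#   datasets:
#     - iquiz
#   limit: 10
run_task(task_cfg='examples/tasks/eval_native.yaml')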
tests/perf/test_perf.py
CHANGED
@@ -121,6 +121,29 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_multi_parallel(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=[1, 2],
+            number=[2, 5],
+            model='qwen2.5-7b-instruct',
+            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            api='openai',
+            dataset='random',
+            min_tokens=100,
+            max_tokens=100,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            extra_args={'ignore_eos': True}
+        )
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)
 
 if __name__ == '__main__':
     unittest.main(buffer=False)
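
The new test_run_perf_multi_parallel exercises list-valued parallel/number arguments (see perf/arguments.py and the new rich_display.py above), apparently running one benchmark pass per concurrency level. A minimal sketch against a generic OpenAI-compatible endpoint, assuming run_perf_benchmark is importable from evalscope.perf.main; the endpoint, model name, and sweep values are placeholders:

# Sketch only: endpoint, model name, and sweep values are placeholders;
# the list-valued parallel/number usage mirrors tests/perf/test_perf.py.
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

task_cfg = Arguments(
    parallel=[1, 2, 4],  # one benchmark pass per concurrency level
    number=[2, 4, 8],    # requests to issue at each level
    model='my-model',
    url='http://127.0.0.1:8000/v1/chat/completions',
    api='openai',
    dataset='random',
    max_tokens=128,
    min_prompt_length=1024,
    max_prompt_length=1024,
    tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',  # used to build the random prompts
)
metrics_result, percentile_result = run_perf_benchmark(task_cfg)
print(metrics_result)
print(percentile_result)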