evalscope-0.15.1-py3-none-any.whl → evalscope-0.16.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Note: this release has been flagged as potentially problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +67 -59
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +12 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +101 -18
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/utils.py +28 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +94 -32
- evalscope/config.py +54 -17
- evalscope/evaluator/evaluator.py +80 -41
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +15 -8
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +17 -25
- evalscope/perf/arguments.py +16 -7
- evalscope/perf/benchmark.py +0 -15
- evalscope/perf/main.py +72 -15
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +34 -16
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +61 -4
- evalscope/run.py +12 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -32
- tests/cli/test_collection.py +8 -6
- tests/cli/test_run.py +43 -17
- tests/perf/test_perf.py +23 -0
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
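
The headline changes in 0.16.1 are six new benchmark adapters (docmath, drop, frames, needle_haystack, tool_bench, winogrande), the visualization app split out of evalscope/report into its own evalscope/app package, multi-parallel support in the perf tooling (see the new evalscope/perf/utils/rich_display.py), and a new evalscope/utils/deprecation_utils.py. Based on the updated tests below, a minimal smoke run against one of the new benchmarks might look like the following sketch; the qwen-plus model name and the DashScope endpoint are copied from tests/cli/test_run.py rather than being package defaults, and the EvalType import path is an assumption consistent with the package's constants module:

    # Sketch assembled from the updated tests in this diff, not documented API.
    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType
    from evalscope.run import run_task

    task_cfg = TaskConfig(
        model='qwen-plus',              # served model name, as in the tests
        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
        api_key='<DASHSCOPE_API_KEY>',  # the tests read this from the environment
        eval_type=EvalType.SERVICE,
        datasets=['docmath'],           # new in 0.16.x; also 'drop', 'frames',
                                        # 'needle_haystack', 'tool_bench', 'winogrande'
        limit=5,                        # small cap for a smoke test, as the tests use
    )
    run_task(task_cfg=task_cfg)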
tests/cli/test_all.py
CHANGED
@@ -12,40 +12,46 @@ from evalscope.run import run_task
 from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger

-os.environ['
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

 logger = get_logger()

 datasets=[
-    'iquiz',
-    'ifeval',
-    'mmlu',
-    'mmlu_pro',
-    'musr',
-    'process_bench',
-    'race',
-    'trivia_qa',
-    'cmmlu',
-    'humaneval',
-    'gsm8k',
-    'bbh',
-    'competition_math',
-    'math_500',
-    'aime24',
-    'gpqa',
-    'arc',
-    'ceval',
-    'hellaswag',
-    'general_mcq',
-    'general_qa',
-    'super_gpqa',
-    'live_code_bench',
-    'mmlu_redux',
-    'simple_qa',
-    'chinese_simpleqa',
-    'alpaca_eval',
-    'arena_hard',
-    'maritime_bench',
+    # 'iquiz',
+    # 'ifeval',
+    # 'mmlu',
+    # 'mmlu_pro',
+    # 'musr',
+    # 'process_bench',
+    # 'race',
+    # 'trivia_qa',
+    # 'cmmlu',
+    # 'humaneval',
+    # 'gsm8k',
+    # 'bbh',
+    # 'competition_math',
+    # 'math_500',
+    # 'aime24',
+    # 'gpqa',
+    # 'arc',
+    # 'ceval',
+    # 'hellaswag',
+    # 'general_mcq',
+    # 'general_qa',
+    # 'super_gpqa',
+    # 'live_code_bench',
+    # 'mmlu_redux',
+    # 'simple_qa',
+    # 'chinese_simpleqa',
+    # 'alpaca_eval',
+    # 'arena_hard',
+    # 'maritime_bench',
+    # 'drop',
+    # 'winogrande',
+    # 'tool_bench',
+    'frames',
+    'docmath',
+    'needle_haystack'
 ]

 dataset_args={
@@ -128,7 +134,7 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
@@ -142,9 +148,10 @@ class TestRun(unittest.TestCase):
                 'n': 1,
                 'max_tokens': 4096,
             },
+            judge_worker_num=5,
             judge_strategy=JudgeStrategy.AUTO,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
             }
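
The second and third hunks also firm up the judge setup: judge_worker_num=5 is new, and the judge model id is now spelled out in full. Collected in one place for reference (values copied from the hunks; the JudgeStrategy import path is an assumption consistent with the package's constants module):

    from evalscope.constants import JudgeStrategy

    judge_settings = dict(
        judge_worker_num=5,                 # newly added: concurrent judge requests
        judge_strategy=JudgeStrategy.AUTO,  # presumably picks rule-based or LLM judging per dataset
        judge_model_args={
            'model_id': 'qwen2.5-72b-instruct',
            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
            'api_key': '<DASHSCOPE_API_KEY>',
        },
    )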
tests/cli/test_collection.py
CHANGED
@@ -72,13 +72,15 @@ class TestCollection(unittest.TestCase):
                 'local_path': 'outputs/mixed_data_test.jsonl'
                 # 'local_path': 'outputs/weighted_mixed_data.jsonl'
             }},
-            limit=
-            judge_strategy=JudgeStrategy.
+            limit=5,
+            judge_strategy=JudgeStrategy.AUTO,
             judge_model_args={
-                'model_id': 'qwen2.5-
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': os.getenv('DASHSCOPE_API_KEY'),
-            }
+                # 'model_id': 'qwen2.5-72b-instruct',
+                # 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                # 'api_key': os.getenv('DASHSCOPE_API_KEY'),
+            },
+            analysis_report=True,
+            # use_cache='outputs/20250522_204520'
         )
         res = run_task(task_cfg=task_cfg)
         print(res)
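
Beyond commenting out the judge-model arguments, this hunk enables analysis_report=True and leaves a use_cache hint pointing at an earlier run's output directory. A hedged sketch of that resume-and-report pattern; the 'data_collection' dataset name is an assumption for mixed-collection runs, and the timestamped path is the test's own example:

    from evalscope.config import TaskConfig

    task_cfg = TaskConfig(
        datasets=['data_collection'],   # assumed name for mixed-collection runs
        dataset_args={'data_collection': {
            'local_path': 'outputs/mixed_data_test.jsonl',  # from the hunk
        }},
        limit=5,
        analysis_report=True,                  # produce the post-run analysis report
        # use_cache='outputs/20250522_204520'  # reuse predictions from a prior run
    )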
tests/cli/test_run.py
CHANGED
@@ -13,7 +13,7 @@ from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger

-os.environ['
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

 logger = get_logger()

@@ -73,6 +73,12 @@ class TestRun(unittest.TestCase):
         logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
         logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')

+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_yaml_config(self):
+        from evalscope import run_task
+
+        run_task(task_cfg='examples/tasks/eval_native.yaml')
+
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_task(self):
         task_cfg = TaskConfig(
@@ -209,12 +215,16 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen3-1.7B',
             datasets=[
-
+                'iquiz',
                 # 'math_500',
-                'aime24',
+                # 'aime24',
                 # 'competition_math',
                 # 'mmlu',
+                # 'simple_qa',
             ],
+            model_args={
+                'device_map': 'auto',
+            },
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
@@ -232,7 +242,8 @@ class TestRun(unittest.TestCase):
                 'top_p': 0.8,  # top-p sampling (value recommended in the Qwen report)
                 'top_k': 20,  # top-k sampling (value recommended in the Qwen report)
                 'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
-            }
+            },
+            judge_strategy=JudgeStrategy.AUTO,
         )

         run_task(task_cfg=task_cfg)
@@ -276,7 +287,7 @@ class TestRun(unittest.TestCase):
             datasets=[
                 # 'iquiz',
                 # 'ifeval',
-                'mmlu',
+                # 'mmlu',
                 # 'mmlu_pro',
                 # 'musr',
                 # 'process_bench',
@@ -294,10 +305,14 @@ class TestRun(unittest.TestCase):
                 # 'ceval',
                 # 'hellaswag',
                 # 'general_mcq',
-                # 'general_qa'
+                # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
-                # 'maritime_bench'
+                # 'maritime_bench',
+                # 'drop',
+                # 'winogrande',
+                # 'tool_bench',
+                'frames',
             ],
             dataset_args={
                 'mmlu': {
@@ -363,14 +378,17 @@ class TestRun(unittest.TestCase):
                 },
             },
             eval_batch_size=32,
-            limit=
+            limit=10,
             debug=True,
             stream=False,
             generation_config={
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
-
+                # 'extra_headers':{'key': 'value'},
+            },
+            # ignore_errors=True,
+            # use_cache='outputs/20250519_142106'
         )

         run_task(task_cfg=task_cfg)
@@ -411,16 +429,16 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
                 # 'math_500',
-
+                'aime24',
                 # 'competition_math',
                 # 'arc',
-                # 'gsm8k'
+                # 'gsm8k',
                 # 'truthful_qa',
                 # 'simple_qa',
                 # 'chinese_simpleqa',
@@ -428,7 +446,10 @@ class TestRun(unittest.TestCase):
                 # 'humaneval',
                 # 'general_qa',
                 # 'alpaca_eval',
-                'arena_hard'
+                # 'arena_hard',
+                # 'frames',
+                # 'docmath',
+                # 'needle_haystack',
             ],
             dataset_args={
                 'competition_math': {
@@ -453,13 +474,16 @@ class TestRun(unittest.TestCase):
                         '中华文化'
                     ]
                 },
+                'frames': {
+                    'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
+                }
             },
-            eval_batch_size=
-            limit=
+            eval_batch_size=10,
+            limit=1,
             judge_strategy=JudgeStrategy.AUTO,
             judge_worker_num=5,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'generation_config': {
@@ -475,7 +499,9 @@ class TestRun(unittest.TestCase):
             },
             timeout=60000,
             stream=True,
-
+            analysis_report=True,
+            # debug=True,
+            # use_cache='outputs/20250602_135859'
         )

         run_task(task_cfg=task_cfg)
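
The most notable addition here is test_run_yaml_config, which passes a YAML path straight to run_task instead of building a TaskConfig in code. The diff does not show the YAML schema; presumably examples/tasks/eval_native.yaml maps one-to-one onto TaskConfig keyword arguments:

    # New in tests/cli/test_run.py: run_task accepts a YAML task file directly.
    from evalscope import run_task

    # Keys inside the file presumably mirror TaskConfig kwargs
    # (model, datasets, dataset_args, limit, generation_config, ...).
    run_task(task_cfg='examples/tasks/eval_native.yaml')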
tests/perf/test_perf.py
CHANGED
@@ -121,6 +121,29 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)

+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_multi_parallel(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=[1, 2],
+            number=[2, 5],
+            model='qwen2.5-7b-instruct',
+            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            api='openai',
+            dataset='random',
+            min_tokens=100,
+            max_tokens=100,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            extra_args={'ignore_eos': True}
+        )
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)

 if __name__ == '__main__':
     unittest.main(buffer=False)
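
The new test_run_perf_multi_parallel exercises list-valued parallel and number arguments: one stress-test invocation now sweeps several concurrency levels, which is presumably what the new evalscope/perf/utils/rich_display.py renders. A trimmed sketch with provider specifics replaced by placeholders; the run_perf_benchmark import path is an assumption based on the changed evalscope/perf/main.py:

    from evalscope.perf.arguments import Arguments
    from evalscope.perf.main import run_perf_benchmark  # assumed location

    task_cfg = Arguments(
        parallel=[1, 2],   # one benchmark pass per concurrency level
        number=[2, 5],     # request count paired with each parallel level
        model='<served-model-name>',
        url='<openai-compatible-endpoint>/chat/completions',
        api='openai',
        dataset='random',  # synthetic prompts; lengths controlled below
        min_prompt_length=1024,
        max_prompt_length=1024,
        min_tokens=100,
        max_tokens=100,
    )
    # Returns aggregate metrics plus percentile breakdowns, as the test unpacks them.
    metrics_result, percentile_result = run_perf_benchmark(task_cfg)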
tests/rag/test_mteb.py
CHANGED
@@ -46,11 +46,11 @@ class TestMTEB(unittest.TestCase):
             ],
             'eval': {
                 'tasks': [
-
-
+                    'TNews',
+                    'CLSClusteringS2S',
                     'T2Reranking',
-
-
+                    'T2Retrieval',
+                    'ATEC',
                 ],
                 'verbosity': 2,
                 'overwrite_results': True,
@@ -85,7 +85,7 @@ class TestMTEB(unittest.TestCase):
             ],
             'verbosity': 2,
             'overwrite_results': True,
-            'limits':
+            'limits': 10,
         },
     },
 )
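
The CMTEB test now pins five tasks covering one category each: classification (TNews), clustering (CLSClusteringS2S), reranking (T2Reranking), retrieval (T2Retrieval), and sentence-pair similarity (ATEC), and sets 'limits': 10, which presumably caps the samples evaluated per task. The eval block in isolation (the surrounding model and backend config is unchanged by this diff):

    eval_config = {
        'tasks': [
            'TNews',             # classification
            'CLSClusteringS2S',  # clustering
            'T2Reranking',       # reranking
            'T2Retrieval',       # retrieval
            'ATEC',              # sentence-pair similarity
        ],
        'verbosity': 2,
        'overwrite_results': True,
        'limits': 10,  # presumably: evaluate at most 10 samples per task
    }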