evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of evalscope has been flagged by the registry as a potentially problematic release.
- evalscope/app/app.py +20 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/utils/embedding.py +2 -4
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +2 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/frames_adapter.py +1 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
- evalscope/benchmarks/needle_haystack/utils.py +2 -2
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/collections/evaluator.py +50 -28
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +6 -5
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +78 -17
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +16 -3
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/report/combinator.py +38 -12
- evalscope/report/utils.py +24 -1
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/version.py +2 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
- tests/aigc/test_t2i.py +8 -8
- tests/cli/test_all.py +40 -33
- tests/cli/test_collection.py +4 -3
- tests/cli/test_run.py +36 -21
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +46 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
tests/cli/test_run.py
CHANGED
```diff
@@ -63,7 +63,7 @@ class TestRun(unittest.TestCase):
             f'--model {model} ' \
             f'--datasets {datasets} ' \
             f'--limit {limit} ' \
-            f'--generation-config do_sample=
+            f'--generation-config do_sample=true,temperature=0.6,max_length=65535,max_new_tokens=65535,max_tokens=65535,n=1,top_p=0.95,top_k=20 ' \
             f"""--dataset-args \'{dataset_args}\' """
 
         logger.info(f'Start to run command: {cmd_with_args}')
@@ -187,8 +187,11 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
-            datasets=[
+            model='Qwen/Qwen3-0.6B',
+            datasets=[
+                'general_mcq',
+                'general_qa'
+            ],
             dataset_args={
                 'general_mcq': {
                     'local_path': 'custom_eval/text/mcq',  # custom dataset path
@@ -215,16 +218,14 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen3-1.7B',
             datasets=[
-                'iquiz',
+                # 'iquiz',
                 # 'math_500',
                 # 'aime24',
                 # 'competition_math',
                 # 'mmlu',
                 # 'simple_qa',
+                'truthful_qa',
             ],
-            model_args={
-                'device_map': 'auto',
-            },
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
@@ -304,7 +305,7 @@ class TestRun(unittest.TestCase):
                 # 'arc',
                 # 'ceval',
                 # 'hellaswag',
-
+                'general_mcq',
                 # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
@@ -312,7 +313,8 @@ class TestRun(unittest.TestCase):
                 # 'drop',
                 # 'winogrande',
                 # 'tool_bench',
-                'frames',
+                # 'frames',
+                # 'bfcl_v3',
             ],
             dataset_args={
                 'mmlu': {
@@ -370,25 +372,31 @@ class TestRun(unittest.TestCase):
                     'metric_list': ['AverageRouge']
                 },
                 'super_gpqa': {
-
+                    'subset_list': ['Philosophy', 'Education'],
                     'few_shot_num': 0
                 },
                 'mmlu_redux':{
                     'subset_list': ['abstract_algebra']
                 },
+                'bfcl_v3': {
+                    'subset_list': ['parallel'],
+                    'extra_params': {
+                        # 'is_fc_model': False,
+                    }
+                },
             },
-            eval_batch_size=
-            limit=
+            eval_batch_size=10,
+            limit=5,
             debug=True,
-            stream=
+            stream=True,
             generation_config={
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
                 # 'extra_headers':{'key': 'value'},
             },
-
-            # use_cache='outputs/
+            ignore_errors=False,
+            # use_cache='outputs/20250616_153756'
         )
 
         run_task(task_cfg=task_cfg)
@@ -434,8 +442,8 @@ class TestRun(unittest.TestCase):
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
-
-                'aime24',
+                'math_500',
+                # 'aime24',
                 # 'competition_math',
                 # 'arc',
                 # 'gsm8k',
@@ -450,8 +458,15 @@ class TestRun(unittest.TestCase):
                 # 'frames',
                 # 'docmath',
                 # 'needle_haystack',
+                # 'ifeval',
             ],
             dataset_args={
+                'needle_haystack': {
+                    'subset_list': ['english'],
+                    'extra_params': {
+                        'show_score': True,
+                    }
+                },
                 'competition_math': {
                     'subset_list': ['Level 4']
                 },
@@ -479,8 +494,8 @@ class TestRun(unittest.TestCase):
                 }
             },
             eval_batch_size=10,
-            limit=
-            judge_strategy=JudgeStrategy.
+            limit=3,
+            judge_strategy=JudgeStrategy.LLM,
             judge_worker_num=5,
             judge_model_args={
                 'model_id': 'qwen2.5-72b-instruct',
@@ -499,9 +514,9 @@ class TestRun(unittest.TestCase):
             },
             timeout=60000,
             stream=True,
-            analysis_report=True,
+            # analysis_report=True,
             # debug=True,
-            # use_cache='outputs/
+            # use_cache='outputs/20250616_161931'
         )
 
         run_task(task_cfg=task_cfg)
```
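
The `bfcl_v3` entries in the test above line up with the new BFCL adapters added in this release (`evalscope/benchmarks/bfcl/bfcl_adapter.py` and `evalscope/models/adapters/bfcl_adapter.py`). Below is a minimal sketch of enabling the benchmark through `TaskConfig`: the dataset name, the `parallel` subset, and the `is_fc_model` key are taken from the test diff, while the model id, batch size, and limit are illustrative placeholders rather than values from the package.

```python
# Sketch only: 'bfcl_v3', the 'parallel' subset and 'is_fc_model' come from the
# test changes above; the model id and numeric values are placeholder assumptions.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen3-1.7B',           # placeholder chat model
    datasets=['bfcl_v3'],              # benchmark added in 0.16.2
    dataset_args={
        'bfcl_v3': {
            'subset_list': ['parallel'],
            'extra_params': {
                'is_fc_model': False,  # left commented out in the test; shown here for illustration
            },
        },
    },
    eval_batch_size=10,
    limit=5,
)

run_task(task_cfg=task_cfg)
```

The test keeps `is_fc_model` commented out, so the adapter's default applies there; the flag presumably marks whether the evaluated model supports native function calling.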
tests/rag/test_clip_benchmark.py
CHANGED
```diff
@@ -39,7 +39,11 @@ class TestCLIPBenchmark(unittest.TestCase):
                     'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
                 }
             ],
-            'dataset_name': [
+            'dataset_name': [
+                'muge',
+                'mnist',
+                'flickr8k'
+            ],
             'split': 'test',
             'batch_size': 128,
             'num_workers': 1,
```
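
For orientation, the fragment above sits inside a RAGEval CLIP benchmark config. The sketch below shows the surrounding structure this change implies: only the keys visible in the diff (`model_name`, `dataset_name`, `split`, `batch_size`, `num_workers`) come from the package; the outer `eval_backend`/`eval_config`/`eval` wrapper and the `models` key name are assumptions based on the RAGEval convention used in the MTEB test below.

```python
# Sketch under assumptions: the outer wrapper and the 'models' key name are not
# shown in the diff and follow the RAGEval config convention; only the inner
# values come from the test changes above.
from evalscope.run import run_task

task_cfg = {
    'eval_backend': 'RAGEval',
    'eval_config': {
        'tool': 'clip_benchmark',
        'eval': {
            'models': [
                {'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px'},
            ],
            'dataset_name': ['muge', 'mnist', 'flickr8k'],  # the 0.16.2 test now covers three datasets
            'split': 'test',
            'batch_size': 128,
            'num_workers': 1,
        },
    },
}

run_task(task_cfg)
```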
tests/rag/test_mteb.py
CHANGED
```diff
@@ -121,10 +121,54 @@ class TestMTEB(unittest.TestCase):
                     },
                 ],
                 'eval': {
-                    'tasks': [
+                    'tasks': [
+                        'MedicalRetrieval',
+                        'T2Retrieval'
+                    ],
                     'verbosity': 2,
                     'overwrite_results': True,
-
+                    'limits': 10,
+                    'top_k': 10,
+                },
+            },
+        }
+
+        run_task(task_cfg)
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_two_stage_api(self):
+        task_cfg = {
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'MTEB',
+                'model': [
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                ],
+                'eval': {
+                    'tasks': [
+                        'MedicalRetrieval',
+                        # 'T2Retrieval'
+                    ],
+                    'verbosity': 2,
+                    'overwrite_results': True,
+                    'limits': 10,
                     'top_k': 10,
                 },
             },
```