evalscope-1.0.0-py3-none-any.whl → evalscope-1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
tests/benchmark/test_eval.py
CHANGED
@@ -4,17 +4,15 @@ from dotenv import dotenv_values
 env = dotenv_values('.env')
 
 import unittest
-from unittest import TestCase
 
-from evalscope.config import TaskConfig
 from evalscope.constants import EvalType, JudgeStrategy, OutputType
-from evalscope.run import run_task
 from evalscope.utils.logger import get_logger
+from tests.common import TestBenchmark
 
 logger = get_logger()
 
 
-class TestBenchmark(TestCase):
+class TestNativeBenchmark(TestBenchmark):
     """Benchmark evaluation test cases."""
 
     def setUp(self):
@@ -35,38 +33,18 @@ class TestBenchmark(TestCase):
             'judge_strategy': JudgeStrategy.AUTO,
             'judge_worker_num': 5,
             'judge_model_args': {
-                'model_id': '
+                'model_id': 'qwen3-235b-a22b',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'generation_config': {
                     'temperature': 0.0,
                     'max_tokens': 4096,
+                    'extra_body': {'enable_thinking': False}
                 }
             },
             'debug': True,
         }
 
-    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-        """Helper method to run test for a specific dataset."""
-        config = self.base_config.copy()
-        config['datasets'] = [dataset_name]
-
-        if use_mock:
-            config['eval_type'] = EvalType.MOCK_LLM
-
-        # Apply configuration overrides
-        config.update(config_overrides)
-
-        if dataset_args:
-            config['dataset_args'] = {dataset_name: dataset_args}
-
-        task_cfg = TaskConfig(**config)
-        run_task(task_cfg=task_cfg)
-
-    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-        """Helper method to test dataset loading."""
-
-        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
 
     # Math & Reasoning datasets
     def test_gsm8k(self):
@@ -84,10 +62,18 @@ class TestBenchmark(TestCase):
         """Test MMLU reasoning dataset."""
         dataset_args = {
             'few_shot_num': 0,
-
+            'subset_list': ['abstract_algebra', 'computer_security']
         }
         self._run_dataset_test('mmlu', use_mock=True, dataset_args=dataset_args)
 
+    def test_mmlu_reasoning(self):
+        """Test MMLU reasoning dataset."""
+        dataset_args = {
+            'few_shot_num': 0,
+            'subset_list': ['abstract_algebra', 'computer_security']
+        }
+        self._run_dataset_test('mmlu', dataset_args=dataset_args, model='qwen3-0.6b', stream=True)
+
     def test_mmlu_pro(self):
         """Test MMLU-Pro reasoning dataset."""
         dataset_args = {
@@ -116,7 +102,11 @@ class TestBenchmark(TestCase):
     def test_math_500(self):
         """Test MATH 500 dataset."""
         # self._run_dataset_load_test('math_500')
-
+        dataset_args = {
+            'subset_list': ['Level 1', 'Level 2'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('math_500', dataset_args=dataset_args)
 
     def test_aime24(self):
         """Test AIME 2024 dataset."""
@@ -222,6 +212,7 @@ class TestBenchmark(TestCase):
     def test_bbh(self):
         dataset_args = {
             'subset_list': ['temporal_sequences', 'navigate'],
+            'few_shot_num': 0,
         }
         self._run_dataset_test('bbh', dataset_args=dataset_args)
 
@@ -336,20 +327,21 @@ class TestBenchmark(TestCase):
     def test_humaneval(self):
         """Test HumanEval dataset."""
         dataset_args = {
-            'metric_list': ['Pass@1'
+            'metric_list': ['Pass@1']
         }
-        self._run_dataset_test('humaneval', dataset_args
+        self._run_dataset_test('humaneval', dataset_args)
 
     def test_live_code_bench(self):
         """Test LiveCodeBench dataset."""
         dataset_args = {
-            'subset_list': ['
+            'subset_list': ['v5'],
+            'review_timeout': 6,
             'extra_params': {
                 'start_date': '2024-08-01',
                 'end_date': '2025-02-28'
             },
         }
-        self._run_dataset_test('live_code_bench', dataset_args,
+        self._run_dataset_test('live_code_bench', dataset_args, limit=20, use_cache='outputs/20250918_200232', rerun_review=True)
 
     def test_tool_bench(self):
         """Test ToolBench dataset."""
@@ -358,27 +350,78 @@ class TestBenchmark(TestCase):
     def test_bfcl(self):
         """Test BFCL dataset."""
         dataset_args = {
-            'subset_list': [
+            'subset_list': [
+                # 'simple',
+                # 'live_multiple',
+                # 'multi_turn_base',
+                'multi_turn_miss_func'
+            ],
             'extra_params': {
                 'is_fc_model': True,
                 'underscore_to_dot': True
             }
         }
-        self._run_dataset_test('bfcl_v3', dataset_args)
+        self._run_dataset_test('bfcl_v3', dataset_args, model='qwen-plus', limit=30, eval_batch_size=5)
 
     def test_tau_bench(self):
         dataset_args = {
+            'subset_list': [
+                'airline',
+                'retail'
+            ],
             'extra_params': {
                 'user_model': 'qwen-plus',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'generation_config': {
-                    'temperature': 0.
-                    '
+                    'temperature': 0.0,
+                    'max_tokens': 12000,
+                    'stream': True
                 }
             }
         }
-        self._run_dataset_test('tau_bench', dataset_args, limit=
+        self._run_dataset_test('tau_bench', dataset_args, limit=5, model='qwq-plus', stream=True)
+
+    def test_r1_collection(self):
+        dataset_args = {
+            'dataset_id': 'evalscope/R1-Distill-Math-Test-v2'
+        }
+        self._run_dataset_test('data_collection', dataset_args)
+
+    def test_qwen3_collection(self):
+        dataset_args = {
+            'dataset_id': 'evalscope/Qwen3-Test-Collection'
+        }
+        self._run_dataset_test('data_collection', dataset_args)
+
+    def test_multi_if(self):
+        dataset_args = {
+            'subset_list': ['English', 'Chinese'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('multi_if', dataset_args, limit=5)
+
+    def test_healthbench(self):
+        dataset_args = {
+            'subset_list': ['health_data_tasks'],
+            'extra_params': {
+                'version': 'Hard'
+            }
+        }
+        self._run_dataset_test('health_bench', dataset_args, limit=5)
+
+
+    def test_amc(self):
+        dataset_args = {
+            'subset_list': ['amc22'],
+        }
+        self._run_dataset_test('amc', dataset_args)
+
+    def test_minerva_math(self):
+        dataset_args = {
+            'subset_list': ['default'],
+        }
+        self._run_dataset_test('minerva_math', dataset_args)
 
 if __name__ == '__main__':
     # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
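The `_run_dataset_test` / `_run_dataset_load_test` helpers removed above now live in the new shared base class imported from `tests/common.py` (+73 lines in the file list; its full contents are not shown in this diff). A minimal sketch of what that base class plausibly looks like, reconstructed from the deleted helpers and from the keyword arguments the updated tests pass (`model`, `stream`, `use_cache`, `rerun_review` are assumed to flow through `**config_overrides`):

# Hypothetical sketch of tests/common.py -- reconstructed, not the verbatim file.
from unittest import TestCase

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task


class TestBenchmark(TestCase):
    """Shared helpers for benchmark test cases; subclasses define base_config in setUp."""

    base_config: dict = {}

    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
        """Run an evaluation for one dataset, layering overrides onto base_config."""
        config = self.base_config.copy()
        config['datasets'] = [dataset_name]

        if use_mock:
            config['eval_type'] = EvalType.MOCK_LLM

        # Per-test overrides such as model=..., limit=..., stream=..., use_cache=...
        config.update(config_overrides)

        if dataset_args:
            config['dataset_args'] = {dataset_name: dataset_args}

        run_task(task_cfg=TaskConfig(**config))

    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
        """Smoke-test dataset loading against a mock model without a sample limit."""
        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)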
tests/benchmark/test_image_edit.py
ADDED

@@ -0,0 +1,65 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+
+from evalscope.constants import EvalType, JudgeStrategy, ModelTask
+from evalscope.utils.logger import get_logger
+from tests.common import TestBenchmark
+
+logger = get_logger()
+
+
+class TestImageEditBenchmark(TestBenchmark):
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'Qwen/Qwen-Image-Edit',
+            'model_args': {
+                'precision': 'bfloat16',
+                'device_map': 'cuda:2'
+            },
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'model_task': ModelTask.IMAGE_GENERATION,
+            'eval_type': EvalType.IMAGE_EDITING,
+            'eval_batch_size': 1,
+            'limit': 5,
+            'generation_config': {
+                'true_cfg_scale': 4.0,
+                'num_inference_steps': 50,
+                'negative_prompt': ' ',
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-vl-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def test_gedit(self):
+        """Test GEdit dataset."""
+        dataset_args = {
+            'extra_params': {
+                'language': 'cn',
+            }
+        }
+        self._run_dataset_test('gedit', dataset_args=dataset_args, use_cache='outputs/20250829_150058')
+
+    def test_gedit_local(self):
+        dataset_args = {
+            'extra_params': {
+                'language': 'cn',
+                'local_file': 'outputs/example_edit.jsonl',
+            }
+        }
+        self._run_dataset_test('gedit', dataset_args=dataset_args, model=None, model_id='offline_model')
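Outside the test harness, the same new image-editing path can be driven directly with `TaskConfig`/`run_task`. A minimal sketch using only keys that appear in the test above; the device string is a placeholder and released defaults may differ:

# Minimal sketch of a standalone GEdit image-edit evaluation; keys taken from
# tests/benchmark/test_image_edit.py above. The device string is a placeholder.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType, ModelTask
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen-Image-Edit',
    model_args={'precision': 'bfloat16', 'device_map': 'cuda:0'},
    model_task=ModelTask.IMAGE_GENERATION,
    eval_type=EvalType.IMAGE_EDITING,
    datasets=['gedit'],
    dataset_args={'gedit': {'extra_params': {'language': 'cn'}}},
    generation_config={
        'true_cfg_scale': 4.0,
        'num_inference_steps': 50,
        'negative_prompt': ' ',
    },
    eval_batch_size=1,
    limit=5,
)
run_task(task_cfg=task_cfg)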
tests/benchmark/test_sandbox.py
ADDED

@@ -0,0 +1,81 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.utils.logger import get_logger
+from tests.common import TestBenchmark
+
+logger = get_logger()
+
+
+class TestCodeBenchmark(TestBenchmark):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'qwen-plus',
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'use_sandbox': True,
+            'sandbox_type': 'docker',
+            'debug': True,
+        }
+
+    def test_humaneval(self):
+        """Test Humaneval dataset."""
+        self._run_dataset_test('humaneval', limit=5)
+
+    def test_humaneval_remote_sandbox(self):
+        """Test Humaneval dataset with remote sandbox manager."""
+        sandbox_manager_config = {'base_url': 'http://localhost:8000'}
+        self._run_dataset_test('humaneval', limit=5, sandbox_manager_config=sandbox_manager_config)
+
+    def test_live_code_bench(self):
+        """Test Live Code Bench dataset."""
+        dataset_args = {
+            'subset_list': ['v5'],
+            'review_timeout': 6,
+            'extra_params': {
+                'start_date': '2024-08-01',
+                'end_date': '2025-02-28'
+            },
+        }
+        self._run_dataset_test('live_code_bench', limit=5, dataset_args=dataset_args, use_cache='outputs/20250918_200232', rerun_review=True)
+
+    def test_live_code_bench_remote_sandbox(self):
+        """Test Live Code Bench dataset."""
+        dataset_args = {
+            'subset_list': ['v5'],
+            'review_timeout': 6,
+            'extra_params': {
+                'start_date': '2024-08-01',
+                'end_date': '2025-02-28'
+            },
+        }
+        sandbox_manager_config = {'base_url': 'http://localhost:8000'}
+        self._run_dataset_test('live_code_bench', limit=20, dataset_args=dataset_args, sandbox_manager_config=sandbox_manager_config, use_cache='outputs/20250918_200232_2', rerun_review=True)
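The `use_sandbox` / `sandbox_type` / `sandbox_manager_config` keys exercised here are new in this release (see `evalscope/api/mixin/sandbox_mixin.py`, +204 lines, in the file list). A minimal sketch of a sandboxed HumanEval run outside the test harness; the API key and manager URL are placeholders:

# Minimal sketch of a sandboxed code evaluation; flag names taken from
# tests/benchmark/test_sandbox.py above. API key and manager URL are placeholders.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen-plus',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key='YOUR_API_KEY',
    eval_type=EvalType.SERVICE,
    datasets=['humaneval'],
    use_sandbox=True,        # run generated code inside an isolated sandbox
    sandbox_type='docker',   # local Docker backend
    # Alternatively, delegate execution to a remote sandbox manager:
    # sandbox_manager_config={'base_url': 'http://localhost:8000'},
    limit=5,
)
run_task(task_cfg=task_cfg)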
tests/benchmark/test_vlm.py
ADDED

@@ -0,0 +1,137 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.utils.logger import get_logger
+from tests.common import TestBenchmark
+
+logger = get_logger()
+
+
+class TestVLMBenchmark(TestBenchmark):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'qwen-vl-plus',
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 2048,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def test_mmmu(self):
+        dataset_args = {
+            'subset_list': [
+                'Accounting',
+                'Agriculture',
+                # 'Architecture_and_Engineering'
+            ]
+        }
+        self._run_dataset_test('mmmu', dataset_args=dataset_args)
+
+    def test_math_vista(self):
+        dataset_args = {
+            'subset_list': ['default']
+        }
+        self._run_dataset_test('math_vista', dataset_args=dataset_args)
+
+    def test_mmmu_pro(self):
+        dataset_args = {
+            'subset_list': [
+                'Accounting',
+                # 'Agriculture',
+            ],
+            'extra_params': {
+                'dataset_format': 'standard (4 options)',  # 'standard (4 options)', 'standard (10 options)', 'vision'
+            },
+        }
+        self._run_dataset_test('mmmu_pro', dataset_args=dataset_args, limit=10)
+
+    def test_qwen3_vl_collection(self):
+        dataset_args = {
+            'dataset_id': 'outputs/qwen3_vl_test.jsonl',
+            'shuffle': True,
+        }
+        self._run_dataset_test('data_collection', dataset_args, limit=100)
+
+    def test_real_world_qa(self):
+        dataset_args = {
+            'subset_list': ['default']
+        }
+        self._run_dataset_test('real_world_qa', dataset_args=dataset_args, limit=10)
+
+    def test_ai2d(self):
+        dataset_args = {
+            'subset_list': ['default']
+        }
+        self._run_dataset_test('ai2d', dataset_args=dataset_args)
+
+    def test_cc_bench(self):
+        dataset_args = {
+            'subset_list': ['cc']
+        }
+        self._run_dataset_test('cc_bench', dataset_args=dataset_args)
+
+    def test_mm_bench(self):
+        dataset_args = {
+            'subset_list': ['cn', 'en']
+        }
+        self._run_dataset_test('mm_bench', dataset_args=dataset_args)
+
+    def test_mm_star(self):
+        dataset_args = {
+            # 'subset_list': ['val']
+        }
+        self._run_dataset_test('mm_star', dataset_args=dataset_args)
+
+    def test_omni_bench(self):
+        dataset_args = {
+            'extra_params': {
+                'use_image': True,  # Whether to use image input, if False, use text alternative image content.
+                'use_audio': True,  # Whether to use audio input, if False, use text alternative audio content.
+            }
+        }
+        self._run_dataset_test('omni_bench', dataset_args=dataset_args, model='qwen-omni-turbo')
+
+    def test_olympiad_bench(self):
+        dataset_args = {
+            'subset_list': [
+                # 'OE_MM_maths_en_COMP',
+                # 'OE_MM_maths_zh_CEE',
+                # 'OE_MM_maths_zh_COMP',
+                # 'OE_MM_physics_en_COMP',
+                # 'OE_MM_physics_zh_CEE',
+                # 'OE_TO_maths_en_COMP',
+                # 'OE_TO_maths_zh_CEE',
+                # 'OE_TO_maths_zh_COMP',
+                # 'OE_TO_physics_en_COMP',
+                # 'OE_TO_physics_zh_CEE',
+                'TP_TO_maths_zh_CEE',
+            ]
+        }
+        self._run_dataset_test('olympiad_bench', dataset_args=dataset_args)
tests/cli/test_all.py
CHANGED
@@ -17,44 +17,44 @@ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 logger = get_logger()
 
 datasets=[
-    [… 38 removed lines: previous dataset list, content not preserved in this diff view …]
+    'iquiz',
+    'ifeval',
+    'mmlu',
+    'mmlu_pro',
+    'musr',
+    'process_bench',
+    'race',
+    'trivia_qa',
+    'cmmlu',
+    'humaneval',
+    'gsm8k',
+    'bbh',
+    'competition_math',
+    'math_500',
+    'aime24',
+    'gpqa_diamond',
+    'arc',
+    'ceval',
+    'hellaswag',
+    'general_mcq',
+    'general_qa',
+    'super_gpqa',
+    # 'live_code_bench',
+    'mmlu_redux',
+    'simple_qa',
+    'chinese_simpleqa',
+    'alpaca_eval',
+    'arena_hard',
+    'maritime_bench',
+    'drop',
+    'winogrande',
+    'tool_bench',
+    'frames',
+    'docmath',
+    'needle_haystack',
+    'bfcl_v3',
+    'hle',
+    'tau_bench',
 ]
 
 # Reverse the datasets list to ensure the order is from most recent to oldest
@@ -150,7 +150,6 @@ dataset_args={
 }
 
 class TestRun(unittest.TestCase):
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_benchmarks(self):
         from evalscope.config import TaskConfig
 
@@ -180,19 +179,60 @@ class TestRun(unittest.TestCase):
 
         run_task(task_cfg=task_cfg)
 
+    def test_vlm_benchmark(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-vl-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'mmmu',
+                # 'math_vista',
+            ],
+            dataset_args={
+                'mmmu': {
+                    'subset_list': ['Accounting']
+                },
+                'math_vista': {
+                    'subset_list': ['default']
+                }
+            },
+            eval_batch_size=1,
+            limit=1,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+                'image_height': 512,
+                'image_width': 512,
+                'image_num': 2,
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.AUTO,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+            }
+        )
+
+        run_task(task_cfg=task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_ci_lite(self):
         from evalscope.config import TaskConfig
 
+        api_key = env.get('DASHSCOPE_API_KEY')
+
         task_cfg = TaskConfig(
             model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=
-            eval_type=EvalType.SERVICE,
+            api_key=api_key,
+            eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
             datasets=[
                 'general_mcq',
-                'general_qa',
                 'iquiz',
             ],
             dataset_args={
tests/cli/test_collection.py
CHANGED
@@ -52,16 +52,19 @@ class TestCollection(unittest.TestCase):
             api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=['data_collection'],
-            dataset_args={
-                '
-
-
+            dataset_args={
+                'data_collection': {
+                    # 'local_path': 'outputs/test_mix.jsonl'
+                    'local_path': 'outputs/mixed_data_test.jsonl',
+                    'shuffle': True,
+                }
+            },
             eval_batch_size=5,
             generation_config = {
                 'max_tokens': 10000,
                 'temperature': 0.0,
             },
-            limit=
+            limit=10,
             # use_cache='outputs/20250822_161804'
         )
         run_task(task_cfg=task_cfg)