evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +10 -45
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/metrics/metric.py +51 -0
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +52 -2
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/METADATA +6 -32
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/RECORD +51 -54
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/blink}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
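The adapter additions above (blink, chartqa, docvqa, infovqa, ocr_bench, ocr_bench_v2) arrive in the same release that removes the in-repo test suite shown below. As a reference point only, here is a minimal sketch of how such benchmarks would presumably be invoked, reusing the TaskConfig/run_task service-evaluation pattern from the deleted tests; the dataset names are assumed from the new adapter module paths and are not confirmed by this diff.

# Sketch only: the dataset names ('chartqa', 'docvqa', 'ocr_bench') are assumed from the
# new adapter module paths; the actual registered benchmark names may differ.
from evalscope import TaskConfig, run_task
from evalscope.constants import EvalType

task_cfg = TaskConfig(
    model='qwen-vl-plus',  # any OpenAI-compatible VLM endpoint
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key='YOUR_API_KEY',  # placeholder
    eval_type=EvalType.SERVICE,
    datasets=['chartqa', 'docvqa', 'ocr_bench'],  # assumed registry names for the new adapters
    limit=5,  # small smoke-test sample
    generation_config={'max_tokens': 2048, 'temperature': 0.0},
)

run_task(task_cfg=task_cfg)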
tests/benchmark/test_t2i.py
DELETED
@@ -1,142 +0,0 @@
from dotenv import dotenv_values

env = dotenv_values('.env')

import os
import unittest

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy, ModelTask, OutputType
from evalscope.run import run_task
from evalscope.utils.logger import get_logger
from tests.utils import test_level_list

os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

logger = get_logger()


class TestRun(unittest.TestCase):
    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_general(self):
        from evalscope.config import TaskConfig

        task_cfg = TaskConfig(
            datasets=[
                'general_t2i'
            ],
            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
            dataset_args={
                'general_t2i': {
                    'metric_list': [
                        'PickScore',
                        # 'CLIPScore',
                        # 'HPSv2Score',
                        # 'HPSv2.1Score',
                        # 'BLIPv2Score',
                        # 'ImageRewardScore',
                        # 'VQAScore',
                        # 'FGA_BLIP2Score',
                        # 'MPS'
                    ],
                    'dataset_id': 'custom_eval/multimodal/t2i/example.jsonl',
                }
            }
        )

        run_task(task_cfg=task_cfg)

    def test_run_local_evalmuse(self):
        from evalscope import TaskConfig, run_task

        task_cfg = TaskConfig(
            model_id='T2I-Model',  # for display only; no actual model ID is needed at runtime
            model_task=ModelTask.IMAGE_GENERATION,
            datasets=[
                'evalmuse',  # use the EvalMuse benchmark
            ],
            dataset_args={
                'evalmuse': {
                    'dataset_id': 'data/example.jsonl',  # path to the constructed jsonl file
                }
            },
        )

        run_task(task_cfg=task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_benchmark(self):

        task_cfg = TaskConfig(
            model='stabilityai/stable-diffusion-xl-base-1.0',  # model on modelscope
            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
            model_args={
                'use_safetensors': True,
                'variant': 'fp16',
                'torch_dtype': 'torch.float16',
            },
            datasets=[
                # 'tifa160',
                # 'genai_bench',
                'evalmuse',
                # 'hpdv2',
            ],
            dataset_args={
                'tifa160': {
                    'metric_list': [
                        # 'PickScore',
                        # 'CLIPScore',
                        # 'HPSv2Score',
                        # 'BLIPv2Score',
                        # 'ImageRewardScore',
                        # 'VQAScore',
                        'FGA_BLIP2Score',
                    ]
                }
            },
            limit=5,
            generation_config={
                'num_inference_steps': 50,
                'guidance_scale': 7.5
            },
            # use_cache='outputs/20250427_134122',
        )

        run_task(task_cfg=task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_benchmark_flux(self):

        task_cfg = TaskConfig(
            model='black-forest-labs/FLUX.1-dev',  # model on modelscope
            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
            model_args={
                'torch_dtype': 'torch.float16',
            },
            datasets=[
                # 'tifa160',
                # 'genai_bench',
                'evalmuse',
                # 'hpdv2',
            ],
            dataset_args={
                'tifa160': {
                    'metric_list': [
                        'PickScore',
                        # 'CLIPScore',
                        # 'HPSv2Score',
                        # 'BLIPv2Score',
                        # 'ImageRewardScore',
                        # 'VQAScore',
                        # 'FGA_BLIP2Score',
                    ]
                }
            },
            generation_config={
                'num_inference_steps': 50,
                'guidance_scale': 3.5
            },
            use_cache='outputs/20250520_112314'
        )

        run_task(task_cfg=task_cfg)
tests/benchmark/test_vlm.py
DELETED
@@ -1,137 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from dotenv import dotenv_values

env = dotenv_values('.env')

import unittest

from evalscope.constants import EvalType, JudgeStrategy, OutputType
from evalscope.utils.logger import get_logger
from tests.common import TestBenchmark

logger = get_logger()


class TestVLMBenchmark(TestBenchmark):
    """Benchmark evaluation test cases."""

    def setUp(self):
        """Setup common test configuration."""
        self.base_config = {
            'model': 'qwen-vl-plus',
            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
            'api_key': env.get('DASHSCOPE_API_KEY'),
            'eval_type': EvalType.SERVICE,
            'eval_batch_size': 5,
            'limit': 5,
            'generation_config': {
                'max_tokens': 2048,
                'temperature': 0.0,
                'seed': 42,
                'parallel_tool_calls': True
            },
            'judge_strategy': JudgeStrategy.AUTO,
            'judge_worker_num': 5,
            'judge_model_args': {
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
                'generation_config': {
                    'temperature': 0.0,
                    'max_tokens': 4096,
                }
            },
            'debug': True,
        }

    def test_mmmu(self):
        dataset_args = {
            'subset_list': [
                'Accounting',
                'Agriculture',
                # 'Architecture_and_Engineering'
            ]
        }
        self._run_dataset_test('mmmu', dataset_args=dataset_args)

    def test_math_vista(self):
        dataset_args = {
            'subset_list': ['default']
        }
        self._run_dataset_test('math_vista', dataset_args=dataset_args)

    def test_mmmu_pro(self):
        dataset_args = {
            'subset_list': [
                'Accounting',
                # 'Agriculture',
            ],
            'extra_params': {
                'dataset_format': 'standard (4 options)',  # 'standard (4 options)', 'standard (10 options)', 'vision'
            },
        }
        self._run_dataset_test('mmmu_pro', dataset_args=dataset_args, limit=10)

    def test_qwen3_vl_collection(self):
        dataset_args = {
            'dataset_id': 'outputs/qwen3_vl_test.jsonl',
            'shuffle': True,
        }
        self._run_dataset_test('data_collection', dataset_args, limit=100)

    def test_real_world_qa(self):
        dataset_args = {
            'subset_list': ['default']
        }
        self._run_dataset_test('real_world_qa', dataset_args=dataset_args, limit=10)

    def test_ai2d(self):
        dataset_args = {
            'subset_list': ['default']
        }
        self._run_dataset_test('ai2d', dataset_args=dataset_args)

    def test_cc_bench(self):
        dataset_args = {
            'subset_list': ['cc']
        }
        self._run_dataset_test('cc_bench', dataset_args=dataset_args)

    def test_mm_bench(self):
        dataset_args = {
            'subset_list': ['cn', 'en']
        }
        self._run_dataset_test('mm_bench', dataset_args=dataset_args)

    def test_mm_star(self):
        dataset_args = {
            # 'subset_list': ['val']
        }
        self._run_dataset_test('mm_star', dataset_args=dataset_args)

    def test_omni_bench(self):
        dataset_args = {
            'extra_params': {
                'use_image': True,  # Whether to use image input; if False, use a text alternative to the image content.
                'use_audio': True,  # Whether to use audio input; if False, use a text alternative to the audio content.
            }
        }
        self._run_dataset_test('omni_bench', dataset_args=dataset_args, model='qwen-omni-turbo')

    def test_olympiad_bench(self):
        dataset_args = {
            'subset_list': [
                # 'OE_MM_maths_en_COMP',
                # 'OE_MM_maths_zh_CEE',
                # 'OE_MM_maths_zh_COMP',
                # 'OE_MM_physics_en_COMP',
                # 'OE_MM_physics_zh_CEE',
                # 'OE_TO_maths_en_COMP',
                # 'OE_TO_maths_zh_CEE',
                # 'OE_TO_maths_zh_COMP',
                # 'OE_TO_physics_en_COMP',
                # 'OE_TO_physics_zh_CEE',
                'TP_TO_maths_zh_CEE',
            ]
        }
        self._run_dataset_test('olympiad_bench', dataset_args=dataset_args)
tests/cli/__init__.py
DELETED
@@ -1 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
tests/cli/test_all.py
DELETED
@@ -1,269 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from dotenv import dotenv_values

env = dotenv_values('.env')

import os
import unittest

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy, OutputType
from evalscope.run import run_task
from evalscope.utils.logger import get_logger
from tests.utils import test_level_list

os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

logger = get_logger()

datasets = [
    'iquiz',
    'ifeval',
    'mmlu',
    'mmlu_pro',
    'musr',
    'process_bench',
    'race',
    'trivia_qa',
    'cmmlu',
    'humaneval',
    'gsm8k',
    'bbh',
    'competition_math',
    'math_500',
    'aime24',
    'gpqa_diamond',
    'arc',
    'ceval',
    'hellaswag',
    'general_mcq',
    'general_qa',
    'super_gpqa',
    # 'live_code_bench',
    'mmlu_redux',
    'simple_qa',
    'chinese_simpleqa',
    'alpaca_eval',
    'arena_hard',
    'maritime_bench',
    'drop',
    'winogrande',
    'tool_bench',
    'frames',
    'docmath',
    'needle_haystack',
    'bfcl_v3',
    'hle',
    'tau_bench',
]

# Reverse the datasets list to ensure the order is from most recent to oldest
datasets.reverse()

dataset_args = {
    'mmlu': {
        'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
        'few_shot_num': 0
    },
    'mmlu_pro': {
        'subset_list': ['math', 'health'],
        'few_shot_num': 4
    },
    'ceval': {
        'subset_list': [
            'computer_network', 'operating_system', 'computer_architecture'
        ],
        'few_shot_num': 0
    },
    'cmmlu': {
        'subset_list': ['elementary_chinese'],
        'few_shot_num': 0
    },
    'bbh': {
        'subset_list': ['word_sorting', 'movie_recommendation'],
    },
    'gpqa_diamond': {
        'few_shot_num': 0,
    },
    'humaneval': {
        'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
    },
    'competition_math': {
        'subset_list': ['Level 1']
    },
    'math_500': {
        'subset_list': ['Level 1']
    },
    'process_bench': {
        'subset_list': ['gsm8k'],
    },
    'musr': {
        'subset_list': ['murder_mysteries']
    },
    'general_mcq': {
        'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
        'subset_list': [
            'example'  # evaluation dataset name, i.e. the * in the *_dev.csv above
        ],
    },
    'general_qa': {
        'local_path': 'custom_eval/text/qa',  # path to the custom dataset
        'subset_list': [
            'example',  # evaluation dataset name, i.e. the * in the *_dev.csv above
            # 'test'
        ]
    },
    'super_gpqa': {
        'subset_list': ['Philosophy', 'Education'],
        'few_shot_num': 0
    },
    'live_code_bench': {
        'subset_list': ['v4_v5'],
        'extra_params': {
            'start_date': '2024-12-01',
            'end_date': '2025-01-01'
        },
    },
    'chinese_simpleqa': {
        'subset_list': ['中华文化']
    },
    'mmlu_redux': {
        'subset_list': ['abstract_algebra']
    },
    'docmath': {
        'subset_list': ['simpshort_testmini']
    },
    'bfcl_v3': {
        'subset_list': ['simple', 'multiple']
    },
    'hle': {
        'subset_list': ['Math', 'Other'],
    },
    'tau_bench': {
        'extra_params': {
            'user_model': 'qwen-plus',
            'api_key': env.get('DASHSCOPE_API_KEY'),
            'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        },
        'subset_list': ['airline'],
    },
}

class TestRun(unittest.TestCase):
    def test_benchmarks(self):
        from evalscope.config import TaskConfig

        task_cfg = TaskConfig(
            model='qwen-plus',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=datasets,
            dataset_args=dataset_args,
            eval_batch_size=1,
            limit=1,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            judge_worker_num=5,
            judge_strategy=JudgeStrategy.AUTO,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
            }
        )

        run_task(task_cfg=task_cfg)

    def test_vlm_benchmark(self):
        from evalscope.config import TaskConfig

        task_cfg = TaskConfig(
            model='qwen-vl-plus',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=[
                'mmmu',
                # 'math_vista',
            ],
            dataset_args={
                'mmmu': {
                    'subset_list': ['Accounting']
                },
                'math_vista': {
                    'subset_list': ['default']
                }
            },
            eval_batch_size=1,
            limit=1,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
                'image_height': 512,
                'image_width': 512,
                'image_num': 2,
            },
            judge_worker_num=5,
            judge_strategy=JudgeStrategy.AUTO,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
            }
        )

        run_task(task_cfg=task_cfg)

    def test_ci_lite(self):
        from evalscope.config import TaskConfig

        api_key = env.get('DASHSCOPE_API_KEY')

        task_cfg = TaskConfig(
            model='qwen-plus',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=api_key,
            eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
            datasets=[
                'general_mcq',
                'iquiz',
            ],
            dataset_args={
                'general_mcq': {
                    'local_path': 'custom_eval/text/mcq',
                    'subset_list': [
                        'example'
                    ],
                },
                'general_qa': {
                    'local_path': 'custom_eval/text/qa',
                    'subset_list': [
                        'example'
                    ]
                }
            },
            eval_batch_size=1,
            limit=1,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            judge_worker_num=1,
            judge_strategy=JudgeStrategy.AUTO,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
            }
        )

        run_task(task_cfg=task_cfg)
tests/cli/test_collection.py
DELETED
@@ -1,99 +0,0 @@
from dotenv import dotenv_values

env = dotenv_values('.env')
import json
import os
import unittest

from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.utils.io_utils import dump_jsonl_data
from tests.utils import test_level_list


class TestCollection(unittest.TestCase):
    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_create_collection(self):
        schema = CollectionSchema(name='math&reasoning', datasets=[
            CollectionSchema(name='math', datasets=[
                CollectionSchema(name='generation', datasets=[
                    DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
                ]),
                CollectionSchema(name='multiple_choice', datasets=[
                    DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
                    DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
                ]),
            ]),
            CollectionSchema(name='reasoning', datasets=[
                DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
                DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
                DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
            ]),
        ])
        print(schema.to_dict())
        print(schema.flatten())
        schema.dump_json('outputs/schema_test.json')


    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_generate_data(self):
        schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
        print(schema.to_dict())
        mixed_data = WeightedSampler(schema).sample(100)
        dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_evaluate_collection(self):
        from evalscope import TaskConfig, run_task

        task_cfg = TaskConfig(
            model='qwen-plus',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=['data_collection'],
            dataset_args={
                'data_collection': {
                    # 'local_path': 'outputs/test_mix.jsonl'
                    'local_path': 'outputs/mixed_data_test.jsonl',
                    'shuffle': True,
                }
            },
            eval_batch_size=5,
            generation_config={
                'max_tokens': 10000,
                'temperature': 0.0,
            },
            limit=10,
            # use_cache='outputs/20250822_161804'
        )
        run_task(task_cfg=task_cfg)


    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_evaluate_collection_with_judge(self):
        from evalscope import TaskConfig, run_task

        task_cfg = TaskConfig(
            model='qwen2.5-7b-instruct',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=os.getenv('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=['data_collection'],
            dataset_args={'data_collection': {
                'local_path': 'outputs/mixed_data_test.jsonl'
                # 'local_path': 'outputs/weighted_mixed_data.jsonl'
            }},
            limit=5,
            judge_strategy=JudgeStrategy.AUTO,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': os.getenv('DASHSCOPE_API_KEY'),
            },
            analysis_report=True,
            ignore_errors=True,
            # use_cache='outputs/20250522_204520'
        )
        res = run_task(task_cfg=task_cfg)
        print(res)