evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +35 -0
- evalscope/api/benchmark/meta.py +6 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/state.py +12 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +47 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +0 -1
- evalscope/api/model/generate_config.py +1 -3
- evalscope/api/model/model.py +4 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +2 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
- evalscope/benchmarks/bfcl/generation.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +72 -13
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +6 -4
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +20 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +7 -4
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/utils/benchmark_util.py +8 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/generator.py +8 -87
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +42 -1
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
- tests/benchmark/test_eval.py +30 -31
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
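
Among the notable additions in 1.0.1 are new vision-language benchmark adapters (mmmu, mmmu_pro, math_vista) and an image-edit benchmark (gedit). As a minimal sketch of how these can be driven, mirroring the test_vlm_benchmark test added in tests/cli/test_all.py below (the model name, endpoint, and environment variable are illustrative values taken from that test):

import os

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

# Evaluate the newly added 'mmmu' vision-language benchmark against an
# OpenAI-compatible endpoint; limit=1 keeps the smoke run cheap.
task_cfg = TaskConfig(
    model='qwen-vl-plus',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=os.environ.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['mmmu'],
    dataset_args={'mmmu': {'subset_list': ['Accounting']}},
    limit=1,
)
run_task(task_cfg=task_cfg)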

tests/cli/test_all.py
CHANGED

@@ -17,44 +17,44 @@ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 logger = get_logger()

 datasets=[
-    [38 removed dataset entries; original content not captured in the extraction]
+    'iquiz',
+    'ifeval',
+    'mmlu',
+    'mmlu_pro',
+    'musr',
+    'process_bench',
+    'race',
+    'trivia_qa',
+    'cmmlu',
+    'humaneval',
+    'gsm8k',
+    'bbh',
+    'competition_math',
+    'math_500',
+    'aime24',
+    'gpqa_diamond',
+    'arc',
+    'ceval',
+    'hellaswag',
+    'general_mcq',
+    'general_qa',
+    'super_gpqa',
+    # 'live_code_bench',
+    'mmlu_redux',
+    'simple_qa',
+    'chinese_simpleqa',
+    'alpaca_eval',
+    'arena_hard',
+    'maritime_bench',
+    'drop',
+    'winogrande',
+    'tool_bench',
+    'frames',
+    'docmath',
+    'needle_haystack',
+    'bfcl_v3',
+    'hle',
+    'tau_bench',
 ]

 # Reverse the datasets list to ensure the order is from most recent to oldest

@@ -150,7 +150,6 @@ dataset_args={
 }

 class TestRun(unittest.TestCase):
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_benchmarks(self):
         from evalscope.config import TaskConfig

@@ -180,19 +179,60 @@ class TestRun(unittest.TestCase):

         run_task(task_cfg=task_cfg)

+    def test_vlm_benchmark(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-vl-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'mmmu',
+                # 'math_vista',
+            ],
+            dataset_args={
+                'mmmu': {
+                    'subset_list': ['Accounting']
+                },
+                'math_vista': {
+                    'subset_list': ['default']
+                }
+            },
+            eval_batch_size=1,
+            limit=1,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+                'image_height': 512,
+                'image_width': 512,
+                'image_num': 2,
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.AUTO,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+            }
+        )
+
+        run_task(task_cfg=task_cfg)

-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_ci_lite(self):
         from evalscope.config import TaskConfig

+        api_key = env.get('DASHSCOPE_API_KEY')
+
         task_cfg = TaskConfig(
             model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=
-            eval_type=EvalType.SERVICE,
+            api_key=api_key,
+            eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
             datasets=[
                 'general_mcq',
-                'general_qa',
                 'iquiz',
             ],
             dataset_args={

tests/cli/test_collection.py
CHANGED

@@ -52,16 +52,19 @@ class TestCollection(unittest.TestCase):
             api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=['data_collection'],
-            dataset_args={
-                '
-
-
+            dataset_args={
+                'data_collection': {
+                    # 'local_path': 'outputs/test_mix.jsonl'
+                    'local_path': 'outputs/mixed_data_test.jsonl',
+                    'shuffle': True,
+                }
+            },
             eval_batch_size=5,
             generation_config = {
                 'max_tokens': 10000,
                 'temperature': 0.0,
             },
-            limit=
+            limit=10,
             # use_cache='outputs/20250822_161804'
         )
         run_task(task_cfg=task_cfg)

tests/cli/test_reasoning.py
ADDED

@@ -0,0 +1,81 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+from unittest import TestCase
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.run import run_task
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TestReasoning(TestCase):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'Qwen3-0.6B',
+            'api_url': 'http://0.0.0.0:8801/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True,
+                'extra_body': {'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+        """Helper method to run test for a specific dataset."""
+        config = self.base_config.copy()
+        config['datasets'] = [dataset_name]
+
+        if use_mock:
+            config['eval_type'] = EvalType.MOCK_LLM
+
+        # Apply configuration overrides
+        config.update(config_overrides)
+
+        if dataset_args:
+            config['dataset_args'] = {dataset_name: dataset_args}
+
+        task_cfg = TaskConfig(**config)
+        run_task(task_cfg=task_cfg)
+
+    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+        """Helper method to test dataset loading."""
+
+        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
+
+    # Math & Reasoning datasets
+    def test_gsm8k(self):
+        """Test GSM8K math reasoning dataset."""
+        self._run_dataset_test('gsm8k')
+
+
+if __name__ == '__main__':
+    # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
+    # Run all tests: python -m unittest test_eval.TestBenchmark
+    unittest.main()
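
The base config above turns off Qwen3's thinking mode through generation_config['extra_body']. As a rough sketch of what that setting amounts to when issued directly through an OpenAI-compatible client (assuming the openai Python package and a vLLM-style server that honors chat_template_kwargs):

from openai import OpenAI

client = OpenAI(base_url='http://0.0.0.0:8801/v1', api_key='EMPTY')
resp = client.chat.completions.create(
    model='Qwen3-0.6B',
    messages=[{'role': 'user', 'content': 'What is 12 * 7?'}],
    # Forwarded verbatim in the request body; disables the thinking
    # phase before the model answers.
    extra_body={'chat_template_kwargs': {'enable_thinking': False}},
)
print(resp.choices[0].message.content)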

tests/common.py
ADDED

@@ -0,0 +1,73 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+from unittest import TestCase
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy
+from evalscope.run import run_task
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TestBenchmark(TestCase):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'qwen-plus',
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+        """Helper method to run test for a specific dataset."""
+        config = self.base_config.copy()
+        config['datasets'] = [dataset_name]
+
+        if not env.get('DASHSCOPE_API_KEY'):
+            use_mock = True
+            logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
+
+        if use_mock:
+            config['eval_type'] = EvalType.MOCK_LLM
+
+        # Apply configuration overrides
+        config.update(config_overrides)
+
+        if dataset_args:
+            config['dataset_args'] = {dataset_name: dataset_args}
+
+        task_cfg = TaskConfig(**config)
+        run_task(task_cfg=task_cfg)
+
+    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+        """Helper method to test dataset loading."""
+
+        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
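
tests/common.py centralizes the credential check: _run_dataset_test downgrades eval_type to EvalType.MOCK_LLM and logs a warning whenever DASHSCOPE_API_KEY is absent from .env, so dataset loading can still be exercised offline. A hypothetical consumer of the shared helper (class name, dataset, and subset are illustrative) might look like:

import unittest

from tests.common import TestBenchmark


class TestCEval(TestBenchmark):

    def test_ceval(self):
        # Runs against the live service when DASHSCOPE_API_KEY is set,
        # otherwise falls back to EvalType.MOCK_LLM automatically.
        self._run_dataset_test('ceval', dataset_args={'subset_list': ['computer_network']})


if __name__ == '__main__':
    unittest.main()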

tests/perf/test_perf.py
CHANGED

@@ -1,9 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from dotenv import dotenv_values

 env = dotenv_values('.env')
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest

 from evalscope.perf.main import run_perf_benchmark

@@ -123,6 +121,10 @@ class TestPerf(unittest.TestCase):

     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_perf_multi_parallel(self):
+        if not env.get('DASHSCOPE_API_KEY'):
+            self.skipTest('DASHSCOPE_API_KEY is not set.')
+            return
+
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=[1, 2],

tests/rag/test_clip_benchmark.py
CHANGED
[hunk content (+0 -3) not captured in the extraction]

evalscope/api/mixin/dataset_mixin.py
DELETED

@@ -1,105 +0,0 @@
-from abc import ABC
-from collections import defaultdict
-from typing import Any, Callable, Dict
-
-from evalscope.api.dataset import Dataset, DatasetDict, RemoteDataLoader
-
-
-class DatasetLoaderMixin:
-    """
-    Mixin class providing dataset loading functionality for benchmarks.
-
-    This mixin provides common dataset loading methods that can be shared
-    across different data adapters, including support for:
-    - Loading multiple subsets
-    - Few-shot dataset loading
-    - Remote dataset loading with configuration
-    """
-
-    def load_subsets(self, load_func: Callable[[str], Dataset]) -> DatasetDict:
-        """
-        Load multiple subsets of the dataset using the provided loading function.
-
-        This method handles two loading strategies:
-        1. Reformat mode: Load only the default subset and reformat it
-        2. Multi-subset mode: Load all subsets specified in subset_list
-
-        Args:
-            load_func (Callable[[str], Dataset]): Function to load individual subsets
-
-        Returns:
-            DatasetDict: Dictionary containing all loaded subsets
-        """
-        if self.reformat_subset:
-            # Load only the default subset
-            subset_data = load_func(self.default_subset)
-            # Reformat the subset to create multiple subsets based on sample keys
-            # NOTE: subset_list and limit is applied here if specified
-            dataset_dict = DatasetDict.from_dataset(dataset=subset_data, subset_list=self.subset_list, limit=self.limit)
-        else:
-            # Load all specified subsets into separate entries
-            subset_dict = defaultdict()
-            for subset in self.subset_list:
-                subset_data = load_func(subset)
-                subset_dict[subset] = subset_data
-            dataset_dict = DatasetDict(subset_dict)
-        return dataset_dict
-
-    def load_subset(self, subset: str) -> Dataset:
-        """
-        Load a specific subset of the dataset for evaluation.
-
-        This method configures and executes the data loading for a single subset,
-        handling both split-as-subset and traditional subset configurations.
-
-        Args:
-            subset (str): The subset identifier to load
-
-        Returns:
-            Dataset: The loaded dataset subset with processed samples
-        """
-        # Determine the split and subset names based on configuration
-        split = subset if self.split_as_subset else self.eval_split
-        subset_name = self.default_subset if self.split_as_subset else subset
-
-        # Create and configure the remote data loader
-        loader = RemoteDataLoader(
-            data_id_or_path=self.dataset_id,
-            split=split,
-            subset=subset_name,
-            sample_fields=self.record_to_sample,  # Custom sample conversion function
-            limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
-            repeats=self._task_config.repeats,  # Number of repetitions for each sample
-            data_source=self._task_config.dataset_hub,  # Data source configuration
-        )
-        return loader.load()
-
-    def load_fewshot_subset(self, subset: str) -> Dataset:
-        """
-        Load a subset specifically for few-shot examples.
-
-        This method loads training data to be used as demonstrations in few-shot prompting.
-        It typically loads from the training split with limited samples and optional shuffling.
-
-        Args:
-            subset (str): The subset identifier to load few-shot examples from
-
-        Returns:
-            Dataset: The loaded few-shot dataset with demonstration examples
-        """
-        # Use training split for few-shot examples
-        split = subset if self.split_as_subset else self.train_split
-        subset_name = self.default_subset if self.split_as_subset else subset
-
-        # Create loader specifically configured for few-shot sampling
-        loader = RemoteDataLoader(
-            data_id_or_path=self.dataset_id,
-            split=split,
-            subset=subset_name,
-            sample_fields=self.record_to_sample,
-            limit=self.few_shot_num
-            if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
-            shuffle=self.few_shot_random,  # Randomize selection if enabled
-            data_source=self._task_config.dataset_hub,
-        )
-        return loader.load()

evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py
DELETED

@@ -1,44 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-from collections import defaultdict
-from typing import List, Optional, Union
-
-from evalscope.utils.io_utils import jsonl_to_list
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class GeneralI2IAdapter:
-
-    def __init__(self, **kwargs):
-
-        super().__init__(**kwargs)
-
-    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
-        dataset_name_or_path = dataset_name_or_path or self.dataset_id
-        subset_list = subset_list or self.subset_list
-
-        data_file_dict = defaultdict(str)
-        data_item_dict = defaultdict(list)
-
-        # get data file path and subset name
-        if os.path.isdir(dataset_name_or_path):
-            for subset_name in subset_list:
-                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-        elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-            data_file_dict[cur_subset_name] = dataset_name_or_path
-        else:
-            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-        # load data from local disk
-        try:
-            for subset_name, file_path in data_file_dict.items():
-                data_item_dict[subset_name] = jsonl_to_list(file_path)
-        except Exception as e:
-            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-        return data_dict

tests/aigc/__init__.py
DELETED

@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.