evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the evalscope package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic; see the registry page for more details.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
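Most of the new benchmark adapters listed above are driven through the same TaskConfig / run_task entry point that the added test files below exercise. As a minimal sketch (the 'gsm8k' dataset name is taken from the new tests; the registered names of the other new adapters such as HealthBench or MMMU are not confirmed by this diff):

# Minimal sketch based on the usage shown in the added test files below.
# EvalType.MOCK_LLM runs without a real model endpoint; swap in
# EvalType.SERVICE plus api_url/api_key to evaluate a served model.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen-plus',            # model name as used in the added tests
    datasets=['gsm8k'],           # dataset name taken from tests/cli/test_reasoning.py
    eval_type=EvalType.MOCK_LLM,  # mock evaluation, no API key required
    limit=5,                      # evaluate only a handful of samples
)
run_task(task_cfg=task_cfg)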
tests/cli/test_reasoning.py
ADDED

@@ -0,0 +1,81 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from dotenv import dotenv_values

env = dotenv_values('.env')

import unittest
from unittest import TestCase

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy, OutputType
from evalscope.run import run_task
from evalscope.utils.logger import get_logger

logger = get_logger()


class TestReasoning(TestCase):
    """Benchmark evaluation test cases."""

    def setUp(self):
        """Setup common test configuration."""
        self.base_config = {
            'model': 'Qwen3-0.6B',
            'api_url': 'http://0.0.0.0:8801/v1',
            'api_key': env.get('DASHSCOPE_API_KEY'),
            'eval_type': EvalType.SERVICE,
            'eval_batch_size': 5,
            'limit': 5,
            'generation_config': {
                'max_tokens': 4096,
                'temperature': 0.0,
                'seed': 42,
                'parallel_tool_calls': True,
                'extra_body': {'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
            },
            'judge_strategy': JudgeStrategy.AUTO,
            'judge_worker_num': 5,
            'judge_model_args': {
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
                'generation_config': {
                    'temperature': 0.0,
                    'max_tokens': 4096,
                }
            },
            'debug': True,
        }

    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
        """Helper method to run test for a specific dataset."""
        config = self.base_config.copy()
        config['datasets'] = [dataset_name]

        if use_mock:
            config['eval_type'] = EvalType.MOCK_LLM

        # Apply configuration overrides
        config.update(config_overrides)

        if dataset_args:
            config['dataset_args'] = {dataset_name: dataset_args}

        task_cfg = TaskConfig(**config)
        run_task(task_cfg=task_cfg)

    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
        """Helper method to test dataset loading."""

        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)

    # Math & Reasoning datasets
    def test_gsm8k(self):
        """Test GSM8K math reasoning dataset."""
        self._run_dataset_test('gsm8k')


if __name__ == '__main__':
    # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
    # Run all tests: python -m unittest test_eval.TestBenchmark
    unittest.main()
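The _run_dataset_test helper accepts per-dataset arguments and config overrides, so further reasoning benchmarks can be added as one-line test methods. A hypothetical sketch (the 'math_500' dataset name comes from the file listing above, while the 'few_shot_num' dataset argument is an assumption, not confirmed by this diff):

    # Hypothetical additional test method following the pattern above;
    # 'few_shot_num' as a dataset argument is assumed rather than shown here.
    def test_math_500(self):
        """Test the MATH-500 dataset with a zero-shot configuration."""
        self._run_dataset_test('math_500', dataset_args={'few_shot_num': 0}, limit=10)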
tests/common.py
ADDED

@@ -0,0 +1,73 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from dotenv import dotenv_values

env = dotenv_values('.env')

import unittest
from unittest import TestCase

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.run import run_task
from evalscope.utils.logger import get_logger

logger = get_logger()


class TestBenchmark(TestCase):
    """Benchmark evaluation test cases."""

    def setUp(self):
        """Setup common test configuration."""
        self.base_config = {
            'model': 'qwen-plus',
            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
            'api_key': env.get('DASHSCOPE_API_KEY'),
            'eval_type': EvalType.SERVICE,
            'eval_batch_size': 5,
            'limit': 5,
            'generation_config': {
                'max_tokens': 4096,
                'temperature': 0.0,
                'seed': 42,
                'parallel_tool_calls': True
            },
            'judge_strategy': JudgeStrategy.AUTO,
            'judge_worker_num': 5,
            'judge_model_args': {
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
                'generation_config': {
                    'temperature': 0.0,
                    'max_tokens': 4096,
                }
            },
            'debug': True,
        }

    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
        """Helper method to run test for a specific dataset."""
        config = self.base_config.copy()
        config['datasets'] = [dataset_name]

        if not env.get('DASHSCOPE_API_KEY'):
            use_mock = True
            logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')

        if use_mock:
            config['eval_type'] = EvalType.MOCK_LLM

        # Apply configuration overrides
        config.update(config_overrides)

        if dataset_args:
            config['dataset_args'] = {dataset_name: dataset_args}

        task_cfg = TaskConfig(**config)
        run_task(task_cfg=task_cfg)

    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
        """Helper method to test dataset loading."""

        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
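Presumably the other added test modules (tests/benchmark/test_eval.py, tests/benchmark/test_vlm.py, and so on) build on this shared base rather than repeating the configuration. A hypothetical sketch of such a subclass (the import path and the 'amc' dataset name are assumptions based on the file listing, not code shown in this diff):

# Hypothetical subclass reusing the shared helpers from tests/common.py;
# whether the other added test modules import it exactly this way is not shown here.
from tests.common import TestBenchmark


class TestMathBenchmarks(TestBenchmark):

    def test_amc(self):
        """Run the newly added AMC adapter in mock mode (dataset name assumed)."""
        self._run_dataset_test('amc', use_mock=True)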
tests/perf/test_perf.py
CHANGED

@@ -1,9 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from dotenv import dotenv_values
 
 env = dotenv_values('.env')
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest
 
 from evalscope.perf.main import run_perf_benchmark
@@ -18,7 +16,7 @@ class TestPerf(unittest.TestCase):
     def tearDown(self) -> None:
         pass
 
-
+
     def test_run_perf(self):
         task_cfg = {
             'url': 'http://127.0.0.1:8001/v1/chat/completions',
@@ -32,7 +30,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_stream(self):
         task_cfg = {
             'url': 'http://127.0.0.1:8801/v1/chat/completions',
@@ -46,7 +44,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_speed_benchmark(self):
         task_cfg = {
             'url': 'http://127.0.0.1:8001/v1/completions',
@@ -60,7 +58,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_local(self):
         task_cfg = {
             'parallel': 1,
@@ -72,7 +70,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_local_stream(self):
         task_cfg = {
             'parallel': 1,
@@ -85,7 +83,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_local_speed_benchmark(self):
         task_cfg = {
             'parallel': 1,
@@ -98,7 +96,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_local_random(self):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
@@ -121,13 +119,45 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)
 
-
+    def test_run_completion_endpoint(self):
+        if not env.get('DASHSCOPE_API_KEY'):
+            self.skipTest('DASHSCOPE_API_KEY is not set.')
+            return
+
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=[1, 2],
+            number=[2, 4],
+            model='qwen2.5-coder-7b-instruct',
+            url='https://dashscope.aliyuncs.com/compatible-mode/v1/completions',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            api='openai',
+            dataset='random',
+            min_tokens=100,
+            max_tokens=100,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            stream=False,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            extra_args={'ignore_eos': True}
+        )
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)
+
+
     def test_run_perf_multi_parallel(self):
+        if not env.get('DASHSCOPE_API_KEY'):
+            self.skipTest('DASHSCOPE_API_KEY is not set.')
+            return
+
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=[1, 2],
             number=[2, 4],
-            model='
+            model='qwen-plus',
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
             api='openai',
@@ -145,7 +175,7 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)
 
-
+
     def test_run_perf_random_vl(self):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
@@ -155,7 +185,7 @@ class TestPerf(unittest.TestCase):
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
             api='openai',
-            dataset='
+            dataset='random_vl',
             min_tokens=100,
             max_tokens=100,
             prefix_length=0,
@@ -164,7 +194,7 @@ class TestPerf(unittest.TestCase):
             image_height=512,
             image_width=512,
             image_num=2,
-            tokenizer_path='Qwen/Qwen2.5-
+            tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',
             seed=None,
             extra_args={'ignore_eos': True}
         )
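The new test_run_completion_endpoint, like the existing multi-parallel test, passes list values for parallel and number, so a single run sweeps several concurrency levels and returns aggregate plus percentile results. A trimmed sketch against a local OpenAI-compatible endpoint (the URL and model name below are placeholders, not values from this diff):

# Sketch of a multi-concurrency perf sweep mirroring the tests above;
# endpoint URL and model name are placeholders.
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

task_cfg = Arguments(
    parallel=[1, 2, 4],        # concurrency levels to sweep
    number=[2, 4, 8],          # requests to send at each level
    model='my-model',
    url='http://127.0.0.1:8001/v1/chat/completions',
    api='openai',
    dataset='random',
    min_tokens=100,
    max_tokens=100,
    prefix_length=0,
    min_prompt_length=1024,
    max_prompt_length=1024,
    tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
    extra_args={'ignore_eos': True},
)
metrics_result, percentile_result = run_perf_benchmark(task_cfg)
print(metrics_result)
print(percentile_result)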
tests/rag/test_clip_benchmark.py
CHANGED

evalscope/api/mixin/dataset_mixin.py
DELETED

@@ -1,105 +0,0 @@
from abc import ABC
from collections import defaultdict
from typing import Any, Callable, Dict

from evalscope.api.dataset import Dataset, DatasetDict, RemoteDataLoader


class DatasetLoaderMixin:
    """
    Mixin class providing dataset loading functionality for benchmarks.

    This mixin provides common dataset loading methods that can be shared
    across different data adapters, including support for:
    - Loading multiple subsets
    - Few-shot dataset loading
    - Remote dataset loading with configuration
    """

    def load_subsets(self, load_func: Callable[[str], Dataset]) -> DatasetDict:
        """
        Load multiple subsets of the dataset using the provided loading function.

        This method handles two loading strategies:
        1. Reformat mode: Load only the default subset and reformat it
        2. Multi-subset mode: Load all subsets specified in subset_list

        Args:
            load_func (Callable[[str], Dataset]): Function to load individual subsets

        Returns:
            DatasetDict: Dictionary containing all loaded subsets
        """
        if self.reformat_subset:
            # Load only the default subset
            subset_data = load_func(self.default_subset)
            # Reformat the subset to create multiple subsets based on sample keys
            # NOTE: subset_list and limit is applied here if specified
            dataset_dict = DatasetDict.from_dataset(dataset=subset_data, subset_list=self.subset_list, limit=self.limit)
        else:
            # Load all specified subsets into separate entries
            subset_dict = defaultdict()
            for subset in self.subset_list:
                subset_data = load_func(subset)
                subset_dict[subset] = subset_data
            dataset_dict = DatasetDict(subset_dict)
        return dataset_dict

    def load_subset(self, subset: str) -> Dataset:
        """
        Load a specific subset of the dataset for evaluation.

        This method configures and executes the data loading for a single subset,
        handling both split-as-subset and traditional subset configurations.

        Args:
            subset (str): The subset identifier to load

        Returns:
            Dataset: The loaded dataset subset with processed samples
        """
        # Determine the split and subset names based on configuration
        split = subset if self.split_as_subset else self.eval_split
        subset_name = self.default_subset if self.split_as_subset else subset

        # Create and configure the remote data loader
        loader = RemoteDataLoader(
            data_id_or_path=self.dataset_id,
            split=split,
            subset=subset_name,
            sample_fields=self.record_to_sample,  # Custom sample conversion function
            limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
            repeats=self._task_config.repeats,  # Number of repetitions for each sample
            data_source=self._task_config.dataset_hub,  # Data source configuration
        )
        return loader.load()

    def load_fewshot_subset(self, subset: str) -> Dataset:
        """
        Load a subset specifically for few-shot examples.

        This method loads training data to be used as demonstrations in few-shot prompting.
        It typically loads from the training split with limited samples and optional shuffling.

        Args:
            subset (str): The subset identifier to load few-shot examples from

        Returns:
            Dataset: The loaded few-shot dataset with demonstration examples
        """
        # Use training split for few-shot examples
        split = subset if self.split_as_subset else self.train_split
        subset_name = self.default_subset if self.split_as_subset else subset

        # Create loader specifically configured for few-shot sampling
        loader = RemoteDataLoader(
            data_id_or_path=self.dataset_id,
            split=split,
            subset=subset_name,
            sample_fields=self.record_to_sample,
            limit=self.few_shot_num
            if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
            shuffle=self.few_shot_random,  # Randomize selection if enabled
            data_source=self._task_config.dataset_hub,
        )
        return loader.load()
evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py
DELETED

@@ -1,44 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path
from collections import defaultdict
from typing import List, Optional, Union

from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()


class GeneralI2IAdapter:

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
        dataset_name_or_path = dataset_name_or_path or self.dataset_id
        subset_list = subset_list or self.subset_list

        data_file_dict = defaultdict(str)
        data_item_dict = defaultdict(list)

        # get data file path and subset name
        if os.path.isdir(dataset_name_or_path):
            for subset_name in subset_list:
                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
        elif os.path.isfile(dataset_name_or_path):
            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
            data_file_dict[cur_subset_name] = dataset_name_or_path
        else:
            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')

        # load data from local disk
        try:
            for subset_name, file_path in data_file_dict.items():
                data_item_dict[subset_name] = jsonl_to_list(file_path)
        except Exception as e:
            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}

        return data_dict
tests/aigc/__init__.py
DELETED

@@ -1 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.