evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +78 -2
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +45 -20
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -385
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -80
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -178
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
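To confirm which side of this comparison a given environment actually has installed, the package metadata can be queried from the standard library. The snippet below is an illustrative aside, not part of the released diff:

from importlib.metadata import PackageNotFoundError, version

# Print the locally installed evalscope version so it can be matched
# against the 1.0.1 -> 1.1.0 comparison listed above.
try:
    print('evalscope', version('evalscope'))
except PackageNotFoundError:
    print('evalscope is not installed in this environment')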
tests/cli/test_reasoning.py
DELETED
@@ -1,81 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-
-env = dotenv_values('.env')
-
-import unittest
-from unittest import TestCase
-
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, OutputType
-from evalscope.run import run_task
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class TestReasoning(TestCase):
-    """Benchmark evaluation test cases."""
-
-    def setUp(self):
-        """Setup common test configuration."""
-        self.base_config = {
-            'model': 'Qwen3-0.6B',
-            'api_url': 'http://0.0.0.0:8801/v1',
-            'api_key': env.get('DASHSCOPE_API_KEY'),
-            'eval_type': EvalType.SERVICE,
-            'eval_batch_size': 5,
-            'limit': 5,
-            'generation_config': {
-                'max_tokens': 4096,
-                'temperature': 0.0,
-                'seed': 42,
-                'parallel_tool_calls': True,
-                'extra_body': {'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
-            },
-            'judge_strategy': JudgeStrategy.AUTO,
-            'judge_worker_num': 5,
-            'judge_model_args': {
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 4096,
-                }
-            },
-            'debug': True,
-        }
-
-    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-        """Helper method to run test for a specific dataset."""
-        config = self.base_config.copy()
-        config['datasets'] = [dataset_name]
-
-        if use_mock:
-            config['eval_type'] = EvalType.MOCK_LLM
-
-        # Apply config overrides
-        config.update(config_overrides)
-
-        if dataset_args:
-            config['dataset_args'] = {dataset_name: dataset_args}
-
-        task_cfg = TaskConfig(**config)
-        run_task(task_cfg=task_cfg)
-
-    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-        """Helper method to test dataset loading."""
-
-        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
-
-    # Math & Reasoning datasets
-    def test_gsm8k(self):
-        """Test GSM8K math reasoning dataset."""
-        self._run_dataset_test('gsm8k')
-
-
-if __name__ == '__main__':
-    # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
-    # Run all tests: python -m unittest test_eval.TestBenchmark
-    unittest.main()
tests/common.py
DELETED
@@ -1,73 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-
-env = dotenv_values('.env')
-
-import unittest
-from unittest import TestCase
-
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy
-from evalscope.run import run_task
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class TestBenchmark(TestCase):
-    """Benchmark evaluation test cases."""
-
-    def setUp(self):
-        """Setup common test configuration."""
-        self.base_config = {
-            'model': 'qwen-plus',
-            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-            'api_key': env.get('DASHSCOPE_API_KEY'),
-            'eval_type': EvalType.SERVICE,
-            'eval_batch_size': 5,
-            'limit': 5,
-            'generation_config': {
-                'max_tokens': 4096,
-                'temperature': 0.0,
-                'seed': 42,
-                'parallel_tool_calls': True
-            },
-            'judge_strategy': JudgeStrategy.AUTO,
-            'judge_worker_num': 5,
-            'judge_model_args': {
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 4096,
-                }
-            },
-            'debug': True,
-        }
-
-    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-        """Helper method to run test for a specific dataset."""
-        config = self.base_config.copy()
-        config['datasets'] = [dataset_name]
-
-        if not env.get('DASHSCOPE_API_KEY'):
-            use_mock = True
-            logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
-
-        if use_mock:
-            config['eval_type'] = EvalType.MOCK_LLM
-
-        # Apply config overrides
-        config.update(config_overrides)
-
-        if dataset_args:
-            config['dataset_args'] = {dataset_name: dataset_args}
-
-        task_cfg = TaskConfig(**config)
-        run_task(task_cfg=task_cfg)
-
-    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-        """Helper method to test dataset loading."""
-
-        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
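The TestBenchmark class above appears designed as a reusable base for benchmark suites. A minimal usage sketch, assuming the helper class as defined in the deleted tests/common.py (so applicable only to the 1.0.1 source tree), might look like this; the subclass name and limit override are illustrative, not taken from the diff:

import unittest

from tests.common import TestBenchmark  # helper class shown in the deleted file above


class TestMath(TestBenchmark):

    def test_gsm8k(self):
        # Run the GSM8K benchmark through the shared helper, overriding the sample limit.
        self._run_dataset_test('gsm8k', limit=5)


if __name__ == '__main__':
    unittest.main()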
tests/perf/__init__.py
DELETED
@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
tests/perf/test_perf.py
DELETED
@@ -1,178 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-
-env = dotenv_values('.env')
-import unittest
-
-from evalscope.perf.main import run_perf_benchmark
-from tests.utils import test_level_list
-
-
-class TestPerf(unittest.TestCase):
-
-    def setUp(self) -> None:
-        pass
-
-    def tearDown(self) -> None:
-        pass
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_perf(self):
-        task_cfg = {
-            'url': 'http://127.0.0.1:8001/v1/chat/completions',
-            'parallel': 1,
-            'model': 'qwen2.5',
-            'number': 15,
-            'api': 'openai',
-            'dataset': 'openqa',
-            # 'stream': True,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_perf_stream(self):
-        task_cfg = {
-            'url': 'http://127.0.0.1:8801/v1/chat/completions',
-            'parallel': 1,
-            'model': 'Qwen2.5-0.5B-Instruct',
-            'number': 15,
-            'api': 'openai',
-            'dataset': 'openqa',
-            'stream': True,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_perf_speed_benchmark(self):
-        task_cfg = {
-            'url': 'http://127.0.0.1:8001/v1/completions',
-            'parallel': 1,
-            'model': 'qwen2.5',
-            'api': 'openai',
-            'dataset': 'speed_benchmark',
-            'min_tokens': 2048,
-            'max_tokens': 2048,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_perf_local(self):
-        task_cfg = {
-            'parallel': 1,
-            'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-            'number': 5,
-            'api': 'local',
-            'dataset': 'openqa',
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_perf_local_stream(self):
-        task_cfg = {
-            'parallel': 1,
-            'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-            'number': 5,
-            'api': 'local',
-            'dataset': 'openqa',
-            'stream': True,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_perf_local_speed_benchmark(self):
-        task_cfg = {
-            'parallel': 1,
-            'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-            'api': 'local_vllm',
-            'dataset': 'speed_benchmark',
-            'min_tokens': 2048,
-            'max_tokens': 2048,
-            'debug': True,
-        }
-        run_perf_benchmark(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_perf_local_random(self):
-        from evalscope.perf.arguments import Arguments
-        task_cfg = Arguments(
-            parallel=20,
-            model='Qwen3-1.7B',
-            url='http://127.0.0.1:8801/v1/completions',
-            api='openai',
-            dataset='random',
-            min_tokens=1024,
-            max_tokens=1024,
-            prefix_length=0,
-            min_prompt_length=1024,
-            max_prompt_length=1024,
-            number=20,
-            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-            seed=None,
-            extra_args={'ignore_eos': True}
-        )
-        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-        print(metrics_result)
-        print(percentile_result)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_perf_multi_parallel(self):
-        if not env.get('DASHSCOPE_API_KEY'):
-            self.skipTest('DASHSCOPE_API_KEY is not set.')
-            return
-
-        from evalscope.perf.arguments import Arguments
-        task_cfg = Arguments(
-            parallel=[1, 2],
-            number=[2, 4],
-            model='qwen2.5-7b-instruct',
-            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            api='openai',
-            dataset='random',
-            min_tokens=100,
-            max_tokens=100,
-            prefix_length=0,
-            min_prompt_length=1024,
-            max_prompt_length=1024,
-            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-            seed=None,
-            extra_args={'ignore_eos': True}
-        )
-        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-        print(metrics_result)
-        print(percentile_result)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_perf_random_vl(self):
-        from evalscope.perf.arguments import Arguments
-        task_cfg = Arguments(
-            parallel=[1, 2],
-            number=[2, 4],
-            model='qwen-vl-max',
-            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            api='openai',
-            dataset='kontext_bench',
-            min_tokens=100,
-            max_tokens=100,
-            prefix_length=0,
-            min_prompt_length=100,
-            max_prompt_length=100,
-            image_height=512,
-            image_width=512,
-            image_num=2,
-            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-            seed=None,
-            extra_args={'ignore_eos': True}
-        )
-        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-        print(metrics_result)
-        print(percentile_result)
-
-if __name__ == '__main__':
-    unittest.main(buffer=False)
tests/rag/test_clip_benchmark.py
DELETED
@@ -1,87 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os
-import subprocess
-import unittest
-
-from evalscope.run import run_task
-from evalscope.utils.import_utils import is_module_installed
-from evalscope.utils.logger import get_logger
-from tests.utils import test_level_list
-
-logger = get_logger()
-
-
-class TestCLIPBenchmark(unittest.TestCase):
-
-    def setUp(self) -> None:
-        self._check_env('webdataset')
-
-    def tearDown(self) -> None:
-        pass
-
-    @staticmethod
-    def _check_env(module_name: str):
-        if is_module_installed(module_name):
-            logger.info(f'{module_name} is installed.')
-        else:
-            raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_task(self):
-        task_cfg = {
-            'eval_backend': 'RAGEval',
-            'eval_config': {
-                'tool': 'clip_benchmark',
-                'eval': {
-                    'models': [
-                        {
-                            'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
-                        }
-                    ],
-                    'dataset_name': [
-                        'muge',
-                        'mnist',
-                        'flickr8k'
-                    ],
-                    'split': 'test',
-                    'batch_size': 128,
-                    'num_workers': 1,
-                    'verbose': True,
-                    'skip_existing': False,
-                    'cache_dir': 'cache',
-                    'limit': 1000,
-                },
-            },
-        }
-
-        run_task(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_custom(self):
-        task_cfg = {
-            'eval_backend': 'RAGEval',
-            'eval_config': {
-                'tool': 'clip_benchmark',
-                'eval': {
-                    'models': [
-                        {
-                            'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
-                        }
-                    ],
-                    'dataset_name': ['custom'],
-                    'data_dir': 'custom_eval/multimodal/text-image-retrieval',
-                    'split': 'test',
-                    'batch_size': 128,
-                    'num_workers': 1,
-                    'verbose': True,
-                    'skip_existing': False,
-                    'limit': 1000,
-                },
-            },
-        }
-
-        run_task(task_cfg)
-
-
-if __name__ == '__main__':
-    unittest.main(buffer=False)
tests/rag/test_mteb.py
DELETED
@@ -1,213 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import unittest
-from dotenv import dotenv_values
-
-from tests.utils import test_level_list
-
-env = dotenv_values('.env')
-from evalscope.run import run_task
-from evalscope.utils.import_utils import is_module_installed
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class TestMTEB(unittest.TestCase):
-
-    def setUp(self) -> None:
-        self._check_env('mteb')
-
-    def tearDown(self) -> None:
-        pass
-
-    @staticmethod
-    def _check_env(module_name: str):
-        if is_module_installed(module_name):
-            logger.info(f'{module_name} is installed.')
-        else:
-            raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_one_stage_mteb(self):
-        task_cfg = {
-            'eval_backend': 'RAGEval',
-            'eval_config': {
-                'tool': 'MTEB',
-                'model': [
-                    {
-                        'model_name_or_path': 'AI-ModelScope/m3e-base',
-                        'pooling_mode': None,  # load from model config
-                        'max_seq_length': 512,
-                        'prompt': '',
-                        'model_kwargs': {'torch_dtype': 'auto'},
-                        'encode_kwargs': {
-                            'batch_size': 128,
-                        },
-                    }
-                ],
-                'eval': {
-                    'tasks': [
-                        'TNews',
-                        'CLSClusteringS2S',
-                        'T2Reranking',
-                        'T2Retrieval',
-                        'ATEC',
-                    ],
-                    'verbosity': 2,
-                    'overwrite_results': True,
-                    'limits': 500,
-                },
-            },
-        }
-
-        run_task(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_one_stage_api(self):
-        from evalscope import TaskConfig
-        task_cfg = TaskConfig(
-            eval_backend='RAGEval',
-            eval_config={
-                'tool': 'MTEB',
-                'model': [
-                    {
-                        'model_name': 'text-embedding-v3',
-                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
-                        'dimensions': 1024,
-                        'encode_kwargs': {
-                            'batch_size': 10,
-                        },
-                    }
-                ],
-                'eval': {
-                    'tasks': [
-                        'T2Retrieval',
-                    ],
-                    'verbosity': 2,
-                    'overwrite_results': True,
-                    'limits': 10,
-                },
-            },
-        )
-
-        run_task(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_two_stage_mteb(self):
-        task_cfg = {
-            'eval_backend': 'RAGEval',
-            'eval_config': {
-                'tool': 'MTEB',
-                'model': [
-                    {
-                        'model_name_or_path': 'AI-ModelScope/m3e-base',
-                        'is_cross_encoder': False,
-                        'max_seq_length': 512,
-                        'prompt': '',
-                        'model_kwargs': {'torch_dtype': 'auto'},
-                        'encode_kwargs': {
-                            'batch_size': 64,
-                        },
-                    },
-                    {
-                        'model_name_or_path': 'BAAI/bge-reranker-v2-m3',
-                        'is_cross_encoder': True,
-                        'max_seq_length': 512,
-                        'prompt': '为这个问题生成一个检索用的表示',
-                        'model_kwargs': {'torch_dtype': 'auto'},
-                        'encode_kwargs': {
-                            'batch_size': 32,
-                        },
-                    },
-                ],
-                'eval': {
-                    'tasks': [
-                        'MedicalRetrieval',
-                        'T2Retrieval'
-                    ],
-                    'verbosity': 2,
-                    'overwrite_results': True,
-                    'limits': 10,
-                    'top_k': 10,
-                },
-            },
-        }
-
-        run_task(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_two_stage_api(self):
-        task_cfg = {
-            'eval_backend': 'RAGEval',
-            'eval_config': {
-                'tool': 'MTEB',
-                'model': [
-                    {
-                        'model_name': 'text-embedding-v3',
-                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
-                        'dimensions': 1024,
-                        'encode_kwargs': {
-                            'batch_size': 10,
-                        },
-                    },
-                    {
-                        'model_name': 'text-embedding-v3',
-                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
-                        'dimensions': 1024,
-                        'encode_kwargs': {
-                            'batch_size': 10,
-                        },
-                    },
-                ],
-                'eval': {
-                    'tasks': [
-                        'MedicalRetrieval',
-                        # 'T2Retrieval'
-                    ],
-                    'verbosity': 2,
-                    'overwrite_results': True,
-                    'limits': 10,
-                    'top_k': 10,
-                },
-            },
-        }
-
-        run_task(task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_custom(self):
-        task_cfg = {
-            'eval_backend': 'RAGEval',
-            'eval_config': {
-                'tool': 'MTEB',
-                'model': [
-                    {
-                        'model_name_or_path': 'AI-ModelScope/m3e-base',
-                        'pooling_mode': None,  # load from model config
-                        'max_seq_length': 512,
-                        'prompt': '',
-                        'model_kwargs': {'torch_dtype': 'auto'},
-                        'encode_kwargs': {
-                            'batch_size': 128,
-                        },
-                    }
-                ],
-                'eval': {
-                    'tasks': ['CustomRetrieval'],
-                    'dataset_path': 'custom_eval/text/retrieval',
-                    'verbosity': 2,
-                    'overwrite_results': True,
-                    'limits': 500,
-                },
-            },
-        }
-
-        run_task(task_cfg)
-
-
-if __name__ == '__main__':
-    unittest.main(buffer=False)