evalscope 0.6.1__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/backend/opencompass/tasks/eval_api.py +2 -1
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
- evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
- evalscope/benchmarks/ceval/samples.jsonl +1 -0
- evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
- evalscope/benchmarks/mmlu/samples.jsonl +5 -0
- evalscope/benchmarks/race/samples.jsonl +5 -0
- evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
- evalscope/cli/start_perf.py +8 -11
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
- evalscope/metrics/rouge_metric.py +30 -15
- evalscope/perf/arguments.py +179 -0
- evalscope/perf/benchmark.py +245 -0
- evalscope/perf/http_client.py +127 -711
- evalscope/perf/main.py +35 -0
- evalscope/perf/plugin/__init__.py +2 -0
- evalscope/perf/plugin/api/__init__.py +3 -0
- evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
- evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
- evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
- evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
- evalscope/perf/plugin/datasets/__init__.py +6 -0
- evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
- evalscope/perf/plugin/datasets/custom.py +21 -0
- evalscope/perf/plugin/datasets/flickr8k.py +51 -0
- evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
- evalscope/perf/plugin/datasets/longalpaca.py +28 -0
- evalscope/perf/plugin/datasets/openqa.py +38 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
- evalscope/perf/plugin/registry.py +54 -0
- evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
- evalscope/perf/utils/benchmark_util.py +135 -0
- evalscope/perf/utils/chat_service.py +252 -0
- evalscope/perf/utils/db_util.py +200 -0
- evalscope/perf/utils/handler.py +46 -0
- evalscope/perf/utils/local_server.py +139 -0
- evalscope/registry/config/cfg_arena.yaml +77 -0
- evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
- evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
- evalscope/registry/config/cfg_single.yaml +78 -0
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
- evalscope/registry/data/qa_browser/battle.jsonl +634 -0
- evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
- evalscope/registry/data/question.jsonl +80 -0
- evalscope/third_party/longbench_write/README.md +118 -0
- evalscope/third_party/longbench_write/default_task.json +27 -0
- evalscope/third_party/longbench_write/default_task.yaml +24 -0
- evalscope/third_party/toolbench_static/README.md +118 -0
- evalscope/third_party/toolbench_static/config_default.json +15 -0
- evalscope/third_party/toolbench_static/config_default.yaml +12 -0
- evalscope/third_party/toolbench_static/requirements.txt +2 -0
- evalscope/utils/logger.py +18 -20
- evalscope/utils/utils.py +41 -42
- evalscope/version.py +2 -2
- evalscope-0.7.1.dist-info/LICENSE +203 -0
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/METADATA +93 -35
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/RECORD +101 -31
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/WHEEL +1 -1
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/top_level.txt +1 -0
- tests/cli/__init__.py +1 -0
- tests/cli/test_run.py +76 -0
- tests/perf/__init__.py +1 -0
- tests/perf/test_perf.py +96 -0
- tests/rag/test_clip_benchmark.py +85 -0
- tests/rag/test_mteb.py +136 -0
- tests/rag/test_ragas.py +120 -0
- tests/swift/__init__.py +1 -0
- tests/swift/test_run_swift_eval.py +146 -0
- tests/swift/test_run_swift_vlm_eval.py +128 -0
- tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
- tests/test_run_all.py +12 -0
- tests/vlm/__init__.py +1 -0
- tests/vlm/test_vlmeval.py +59 -0
- evalscope/perf/_logging.py +0 -32
- evalscope/perf/datasets/longalpaca_12k.py +0 -20
- evalscope/perf/datasets/openqa.py +0 -22
- evalscope/perf/plugin_registry.py +0 -35
- evalscope/perf/query_parameters.py +0 -42
- evalscope/perf/server_sent_event.py +0 -43
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
- /evalscope/perf/{datasets → utils}/__init__.py +0 -0
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/entry_points.txt +0 -0
- {evalscope/preprocess → tests}/__init__.py +0 -0
- {evalscope/preprocess/tokenizers → tests/rag}/__init__.py +0 -0
tests/rag/test_mteb.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
import unittest
|
|
5
|
+
from evalscope.utils import test_level_list, is_module_installed
|
|
6
|
+
from evalscope.utils.logger import get_logger
|
|
7
|
+
from evalscope.run import run_task
|
|
8
|
+
|
|
9
|
+
logger = get_logger()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TestMTEB(unittest.TestCase):
    """Run MTEB evaluations through ``run_task`` with the ``RAGEval`` backend."""

    def setUp(self) -> None:
        # Every test in this class requires the `mteb` package.
        self._check_env('mteb')

    def tearDown(self) -> None:
        pass

    @staticmethod
    def _check_env(module_name: str):
        """Raise ``ModuleNotFoundError`` unless ``module_name`` is installed.

        Args:
            module_name: Name handed to ``is_module_installed``.

        Raises:
            ModuleNotFoundError: If the module is reported as not installed.
        """
        if not is_module_installed(module_name):
            raise ModuleNotFoundError(f'run: pip install {module_name}')
        logger.info(f'{module_name} is installed.')

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_one_stage_mteb(self):
        # Single model entry evaluated on several task names.
        embedding_model = {
            'model_name_or_path': 'AI-ModelScope/m3e-base',
            'pooling_mode': None,  # load from model config
            'max_seq_length': 512,
            'prompt': '',
            'model_kwargs': {'torch_dtype': 'auto'},
            'encode_kwargs': {
                'batch_size': 128,
            },
        }
        eval_args = {
            'tasks': [
                'TNews',
                'CLSClusteringS2S',
                'T2Reranking',
                'T2Retrieval',
                'ATEC',
            ],
            'verbosity': 2,
            'output_folder': 'outputs',
            'overwrite_results': True,
            'limits': 500,
        }
        task_cfg = {
            'eval_backend': 'RAGEval',
            'eval_config': {
                'tool': 'MTEB',
                'model': [embedding_model],
                'eval': eval_args,
            },
        }

        run_task(task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_two_stage_mteb(self):
        # Two model entries: one with is_cross_encoder=False, one with is_cross_encoder=True.
        first_stage_model = {
            'model_name_or_path': 'AI-ModelScope/m3e-base',
            'is_cross_encoder': False,
            'max_seq_length': 512,
            'prompt': '',
            'model_kwargs': {'torch_dtype': 'auto'},
            'encode_kwargs': {
                'batch_size': 64,
            },
        }
        second_stage_model = {
            'model_name_or_path': 'OpenBMB/MiniCPM-Reranker',
            'is_cross_encoder': True,
            'max_seq_length': 512,
            'prompt': '为这个问题生成一个检索用的表示',
            'model_kwargs': {'torch_dtype': 'auto'},
            'encode_kwargs': {
                'batch_size': 32,
            },
        }
        eval_args = {
            'tasks': ['MedicalRetrieval', 'T2Retrieval'],
            'verbosity': 2,
            'output_folder': 'outputs',
            'overwrite_results': True,
            'limits': 10,
        }
        task_cfg = {
            'eval_backend': 'RAGEval',
            'eval_config': {
                'tool': 'MTEB',
                'model': [first_stage_model, second_stage_model],
                'eval': eval_args,
            },
        }

        run_task(task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_custom(self):
        # Same model entry as the one-stage test, but with a custom task and dataset path.
        embedding_model = {
            'model_name_or_path': 'AI-ModelScope/m3e-base',
            'pooling_mode': None,  # load from model config
            'max_seq_length': 512,
            'prompt': '',
            'model_kwargs': {'torch_dtype': 'auto'},
            'encode_kwargs': {
                'batch_size': 128,
            },
        }
        eval_args = {
            'tasks': ['CustomRetrieval'],
            'dataset_path': 'custom_eval/text/retrieval',
            'verbosity': 2,
            'output_folder': 'outputs',
            'overwrite_results': True,
            'limits': 500,
        }
        task_cfg = {
            'eval_backend': 'RAGEval',
            'eval_config': {
                'tool': 'MTEB',
                'model': [embedding_model],
                'eval': eval_args,
            },
        }

        run_task(task_cfg)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == '__main__':
|
|
136
|
+
unittest.main(buffer=False)
|
tests/rag/test_ragas.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
import os
|
|
3
|
+
import unittest
|
|
4
|
+
from evalscope.utils import test_level_list, is_module_installed
|
|
5
|
+
from evalscope.utils.logger import get_logger
|
|
6
|
+
from evalscope.run import run_task
|
|
7
|
+
|
|
8
|
+
logger = get_logger()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TestRAGAS(unittest.TestCase):
    """Run RAGAS test-set generation and evaluation through ``run_task``."""

    def setUp(self) -> None:
        # Every test in this class requires the `ragas` package.
        self._check_env('ragas')

    def tearDown(self) -> None:
        pass

    @staticmethod
    def _check_env(module_name: str):
        """Raise ``ModuleNotFoundError`` unless ``module_name`` is installed.

        Args:
            module_name: Name handed to ``is_module_installed``.

        Raises:
            ModuleNotFoundError: If the module is reported as not installed.
        """
        if not is_module_installed(module_name):
            raise ModuleNotFoundError(f'run: pip install {module_name}')
        logger.info(f'{module_name} is installed.')

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_generate_dataset(self):
        # Configuration for the `testset_generation` section only (no `eval` section).
        generation_args = {
            'docs': ['README_zh.md'],
            'test_size': 5,
            'output_file': 'outputs/testset.json',
            'distribution': {
                'simple': 0.5,
                'multi_context': 0.4,
                'reasoning': 0.1,
            },
            'generator_llm': {
                'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
                'template_type': 'qwen',
            },
            'embeddings': {
                'model_name_or_path': 'AI-ModelScope/m3e-base',
            },
            'language': 'chinese',
        }
        task_cfg = {
            'eval_backend': 'RAGEval',
            'eval_config': {
                'tool': 'RAGAS',
                'testset_generation': generation_args,
            },
        }

        logger.info(f'>> Start to run task: {task_cfg}')

        run_task(task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_rag_eval(self):
        # The critic LLM is given by model path plus template type.
        eval_args = {
            'testset_file': 'outputs/testset_chinese_with_answer.json',
            'critic_llm': {
                'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
                'template_type': 'qwen',
            },
            'embeddings': {
                'model_name_or_path': 'AI-ModelScope/m3e-base',
            },
            'metrics': [
                'Faithfulness',
                'AnswerRelevancy',
                'ContextPrecision',
                'AnswerCorrectness',
            ],
        }
        task_cfg = {
            'eval_backend': 'RAGEval',
            'eval_config': {
                'tool': 'RAGAS',
                'eval': eval_args,
            },
        }

        logger.info(f'>> Start to run task: {task_cfg}')

        run_task(task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_rag_eval_api(self):
        # The critic LLM is given by model name, API base URL and API key.
        eval_args = {
            'testset_file': 'outputs/testset.json',
            'critic_llm': {
                'model_name': 'gpt-4o-mini',  # custom chat model name
                'api_base': 'http://127.0.0.1:8088/v1',  # custom base URL
                'api_key': 'xxxx',  # your API key
            },
            'embeddings': {
                'model_name_or_path': 'AI-ModelScope/m3e-base',
            },
            'metrics': [
                'Faithfulness',
                'AnswerRelevancy',
                'ContextPrecision',
                'AnswerCorrectness',
                'MultiModalFaithfulness',
                'MultiModalRelevance',
            ],
        }
        task_cfg = {
            'eval_backend': 'RAGEval',
            'eval_config': {
                'tool': 'RAGAS',
                'eval': eval_args,
            },
        }

        logger.info(f'>> Start to run task: {task_cfg}')

        run_task(task_cfg)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
if __name__ == '__main__':
|
|
120
|
+
unittest.main(buffer=False)
|
tests/swift/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import subprocess
|
|
5
|
+
import time
|
|
6
|
+
import unittest
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
from evalscope.backend.opencompass import OpenCompassBackendManager
|
|
12
|
+
from evalscope.run import run_task
|
|
13
|
+
from evalscope.summarizer import Summarizer
|
|
14
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
15
|
+
from evalscope.utils.logger import get_logger
|
|
16
|
+
|
|
17
|
+
logger = get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
|
|
20
|
+
DEFAULT_BASE_MODEL_URL = 'http://127.0.0.1:8001/v1/completions'
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TestRunSwiftEval(unittest.TestCase):
    """Evaluate a locally launched ``swift deploy`` service with the OpenCompass backend."""

    def setUp(self) -> None:
        """Install missing packages, launch ``swift deploy`` and list the backend datasets."""
        logger.info('Init env for swift-eval UTs ...\n')

        self.model_name = 'qwen2_5-0_5b-instruct'
        # unittest assertions are used instead of bare `assert`, which is stripped under `python -O`.
        self.assertTrue(is_module_installed('evalscope'), 'Please install `evalscope` from pypi or source code.')

        # Argument lists (no shell) keep `ms-swift[llm]` from being treated as a shell glob pattern.
        if not is_module_installed('opencompass'):
            logger.warning('Note: installing ms-opencompass ...')
            subprocess.run(['pip3', 'install', 'ms-opencompass', '-U'], check=True)

        if not is_module_installed('swift'):
            logger.warning('Note: installing ms-swift ...')
            subprocess.run(['pip3', 'install', 'ms-swift[llm]'], check=True)

        # NOTE(review): this message is emitted unconditionally; vllm availability is never checked here.
        logger.warning('vllm not installed, use native swift deploy service instead.')

        logger.info('\nStarting run swift deploy ...')
        # Launch without a shell so that terminate() signals the deploy process itself,
        # not an intermediate shell.
        self.process_swift_deploy = subprocess.Popen(
            ['swift', 'deploy', '--model_type', self.model_name, '--infer_backend', 'pt'], text=True)
        # Cleanups run even when setUp fails below (tearDown would be skipped in that case),
        # so the service is never leaked.
        self.addCleanup(self._stop_process, self.process_swift_deploy)

        self.all_datasets = OpenCompassBackendManager.list_datasets()
        self.assertGreater(
            len(self.all_datasets), 0, f'Failed to list datasets from OpenCompass backend: {self.all_datasets}')

    def tearDown(self) -> None:
        """Stop the swift deploy model service."""
        logger.warning('\nStopping swift deploy ...')
        self._stop_process(self.process_swift_deploy)
        logger.info('Process swift-deploy terminated successfully.')

    @staticmethod
    def _stop_process(process, timeout: int = 60):
        """Terminate ``process`` and wait for it; kill it if it does not exit within ``timeout`` seconds.

        Safe to call more than once for the same process.
        """
        if process.poll() is None:
            process.terminate()
        try:
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

    @staticmethod
    def find_and_kill_pid(pids: list):
        """Run ``kill`` for every pid in ``pids`` and log the outcome of each call."""
        if not pids:
            logger.info('No pids found.')
            return

        for pid in pids:
            result = subprocess.run(['kill', str(pid)])
            if result.returncode == 0:
                logger.warning(f'Killed process {pid}.')
            else:
                logger.error(f'Failed to kill process {pid} (exit code {result.returncode}).')

    @staticmethod
    def find_and_kill_service(service_name):
        """Kill every process whose ``ps -ef`` line contains ``service_name``.

        Best effort: any exception is logged and swallowed. Lines containing ``grep``
        and the current process are never selected.
        """
        try:
            # find pid
            result = subprocess.run(['ps', '-ef'], stdout=subprocess.PIPE, text=True)

            own_pid = str(os.getpid())
            pids = []
            for line in result.stdout.splitlines():
                if service_name not in line or 'grep' in line:
                    continue
                parts = line.split()
                # The pid is read from the second column; skip malformed lines and our own
                # process (the test runner's command line may contain the service name too).
                if len(parts) > 1 and parts[1] != own_pid:
                    pids.append(parts[1])

            if not pids:
                logger.info(f'No process found for {service_name}.')
                return

            for pid in pids:
                subprocess.run(['kill', pid])
                logger.warning(f'Killed process {pid} for service {service_name}.')
        except Exception as e:
            logger.error(f'An error occurred: {e}')

    @staticmethod
    def check_service_status(url: str, data: dict, retries: int = 30, delay: int = 10):
        """POST ``data`` as JSON to ``url`` until it answers with HTTP 200.

        Args:
            url: Endpoint to probe.
            data: JSON-serialisable request payload.
            retries: Maximum number of attempts.
            delay: Seconds to sleep between two attempts.

        Returns:
            True as soon as a response with status code 200 is received, False when
            all attempts are used up.
        """
        for attempt in range(1, retries + 1):
            try:
                logger.info(f'Attempt {attempt}: Checking service at {url} ...')
                response = requests.post(
                    url, data=json.dumps(data), headers={'Content-Type': 'application/json'}, timeout=30)
                if response.status_code == 200:
                    logger.info(f'Service at {url} is available !\n\n')
                    return True
                logger.info(f'Service at {url} returned status code {response.status_code}.')
            except requests.exceptions.RequestException as e:
                logger.info(f'Attempt {attempt}: An error occurred: {e}')

            # No point in waiting once the last attempt has failed.
            if attempt < retries:
                time.sleep(delay)

        logger.info(f'Service at {url} is not available after {retries} retries.')
        return False

    @unittest.skipUnless(1 in test_level_list(), 'skip test in current test level')
    def test_run_task(self):
        # Prepare the config
        task_cfg = dict(
            eval_backend='OpenCompass',
            eval_config={
                'datasets': ['cmb', 'bbh', 'ceval', 'ARC_e', 'gsm8k'],
                'models': [
                    {
                        'path': self.model_name,
                        'openai_api_base': DEFAULT_CHAT_MODEL_URL,
                        'batch_size': 8
                    },
                ],
                'work_dir': 'outputs/llama3_eval_result',
                'reuse': None,  # string, `latest` or timestamp, e.g. `20230516_144254`, default to None
                'limit': 5,  # string or int or float, e.g. `[2:5]`, 5, 5.0, default to None, it means run all examples
            },
        )

        # Check the service status
        data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
        self.assertTrue(
            self.check_service_status(DEFAULT_CHAT_MODEL_URL, data=data),
            f'Failed to check service status: {DEFAULT_CHAT_MODEL_URL}')

        # Submit the task
        logger.info(f'Start to run UT with cfg: {task_cfg}')
        run_task(task_cfg=task_cfg)

        # Get the final report with summarizer
        report_list = Summarizer.get_report_from_cfg(task_cfg)
        logger.info(f'>>The report list:\n{report_list}')

        self.assertGreater(len(report_list), 0, f'Failed to get report list: {report_list}')
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
if __name__ == '__main__':
|
|
146
|
+
unittest.main()
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
import time
|
|
6
|
+
import unittest
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
|
|
12
|
+
from evalscope.run import run_task
|
|
13
|
+
from evalscope.summarizer import Summarizer
|
|
14
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
15
|
+
from evalscope.utils.logger import get_logger
|
|
16
|
+
|
|
17
|
+
logger = get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
|
|
20
|
+
DEFAULT_API_KEY = 'EMPTY'
|
|
21
|
+
DEFAULT_MODEL_NAME = 'CustomAPIModel'
|
|
22
|
+
DEFAULT_WORK_DIR = 'outputs/qwen-vl-chat'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TestRunSwiftVLMEval(unittest.TestCase):
    """Evaluate a locally launched ``swift deploy`` service with the VLMEvalKit backend."""

    def setUp(self) -> None:
        """Install missing packages, clear the work dir, launch ``swift deploy`` and list datasets."""
        logger.info('Init env for swift-eval UTs ...\n')
        # The message now names the package that is actually checked (it used to say `llmuses`).
        # unittest assertions are used instead of bare `assert`, which is stripped under `python -O`.
        self.assertTrue(is_module_installed('evalscope'), 'Please install `evalscope` from pypi or source code.')

        if not is_module_installed('vlmeval'):
            logger.warning('Note: installing ms-vlmeval ...')
            subprocess.run(['pip3', 'install', 'ms-vlmeval', '-U'], check=True)

        if not is_module_installed('swift'):
            logger.warning('Note: installing ms-swift ...')
            subprocess.run(['pip3', 'install', 'ms-swift', '-U'], check=True)

        if os.path.exists(DEFAULT_WORK_DIR):
            shutil.rmtree(DEFAULT_WORK_DIR)
            logger.info(f'Removed work dir: {os.path.abspath(DEFAULT_WORK_DIR)} \n')

        logger.info('\nStarting run swift deploy ...')
        self.model_name = 'qwen-vl-chat'
        # Launch without a shell so that terminate() signals the deploy process itself,
        # not an intermediate shell.
        self.process_swift_deploy = subprocess.Popen(
            ['swift', 'deploy', '--model_type', self.model_name, '--infer_backend', 'pt'], text=True)
        # Cleanups run even when setUp fails below (tearDown would be skipped in that case),
        # so the service is never leaked.
        self.addCleanup(self._stop_process, self.process_swift_deploy)

        self.all_datasets = VLMEvalKitBackendManager.list_supported_datasets()
        self.assertGreater(
            len(self.all_datasets), 0, f'Failed to list datasets from VLMEvalKit backend: {self.all_datasets}')

    def tearDown(self) -> None:
        """Stop the swift deploy model service."""
        logger.warning('Stopping swift deploy ...')
        self._stop_process(self.process_swift_deploy)
        logger.info('Process swift-deploy terminated successfully.')

    @staticmethod
    def _stop_process(process, timeout: int = 60):
        """Terminate ``process`` and wait for it; kill it if it does not exit within ``timeout`` seconds.

        Safe to call more than once for the same process.
        """
        if process.poll() is None:
            process.terminate()
        try:
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

    @staticmethod
    def _check_env(module_name: str):
        """Raise ``ModuleNotFoundError`` unless ``module_name`` is installed."""
        if is_module_installed(module_name):
            logger.info(f'{module_name} is installed.')
        else:
            raise ModuleNotFoundError(f'run: pip install {module_name}')

    @staticmethod
    def check_service_status(url: str, data: dict, retries: int = 20, delay: int = 10):
        """POST ``data`` as JSON to ``url`` until it answers with HTTP 200.

        Args:
            url: Endpoint to probe.
            data: JSON-serialisable request payload.
            retries: Maximum number of attempts.
            delay: Seconds to sleep between two attempts.

        Returns:
            True as soon as a response with status code 200 is received, False when
            all attempts are used up.
        """
        for attempt in range(1, retries + 1):
            try:
                logger.info(f'Attempt {attempt}: Checking service at {url} ...')
                response = requests.post(
                    url, data=json.dumps(data), headers={'Content-Type': 'application/json'}, timeout=30)
                if response.status_code == 200:
                    logger.info(f'Service at {url} is available !\n\n')
                    return True
                logger.info(f'Service at {url} returned status code {response.status_code}.')
            except requests.exceptions.RequestException as e:
                logger.info(f'Attempt {attempt}: An error occurred: {e}')

            # No point in waiting once the last attempt has failed.
            if attempt < retries:
                time.sleep(delay)

        logger.info(f'Service at {url} is not available after {retries} retries.')
        return False

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_api(self):
        api_base = DEFAULT_CHAT_MODEL_URL
        task_cfg = {
            'eval_backend': 'VLMEvalKit',
            'eval_config': {
                'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
                'limit': 30,
                'mode': 'all',
                'model': [{
                    'api_base': api_base,  # swift deploy model api
                    'key': DEFAULT_API_KEY,
                    'name': DEFAULT_MODEL_NAME,  # must be CustomAPIModel for swift
                    'temperature': 0.0,
                    'type': self.model_name,  # swift model type
                }],
                'nproc': 1,
                'reuse': True,
                'work_dir': DEFAULT_WORK_DIR
            }
        }

        # Check the service status
        data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
        self.assertTrue(self.check_service_status(api_base, data=data), f'Failed to check service status: {api_base}')

        logger.info(f'>> Start to run task: {task_cfg}')

        run_task(task_cfg)

        logger.info('>> Start to get the report with summarizer ...')
        report_list = Summarizer.get_report_from_cfg(task_cfg)
        logger.info(f'\n>> The report list: {report_list}')

        self.assertGreater(len(report_list), 0, f'Failed to get report list: {report_list}')
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
if __name__ == '__main__':
|
|
128
|
+
unittest.main(buffer=False)
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
import time
|
|
6
|
+
import unittest
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
|
|
12
|
+
from evalscope.run import run_task
|
|
13
|
+
from evalscope.summarizer import Summarizer
|
|
14
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
15
|
+
from evalscope.utils.logger import get_logger
|
|
16
|
+
|
|
17
|
+
logger = get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
|
|
20
|
+
DEFAULT_JUDGE_MODEL_URL = 'http://127.0.0.1:8866/v1/chat/completions'
|
|
21
|
+
|
|
22
|
+
DEFAULT_API_KEY = 'EMPTY'
|
|
23
|
+
DEFAULT_MODEL_NAME = 'CustomAPIModel'
|
|
24
|
+
DEFAULT_WORK_DIR = 'outputs'
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TestRunSwiftVLMEval(unittest.TestCase):
    """Evaluate a ``swift deploy`` service with VLMEvalKit, using a second deployed model as judge."""

    def setUp(self) -> None:
        """Install missing packages, clear the work dir, launch both services and list datasets."""
        logger.info('Init env for swift-eval UTs ...\n')
        # This file imports from `evalscope`; the previous check looked for the stale package
        # name `llmuses`. unittest assertions are used instead of bare `assert`, which is
        # stripped under `python -O`.
        self.assertTrue(is_module_installed('evalscope'), 'Please install `evalscope` from pypi or source code.')

        # Only install what is missing instead of running `pip install -U` on every setUp.
        if not is_module_installed('vlmeval'):
            logger.warning('Note: installing ms-vlmeval ...')
            subprocess.run(['pip3', 'install', 'ms-vlmeval', '-U'], check=True)

        if not is_module_installed('swift'):
            logger.warning('Note: installing ms-swift ...')
            subprocess.run(['pip3', 'install', 'ms-swift', '-U'], check=True)

        if os.path.exists(DEFAULT_WORK_DIR):
            shutil.rmtree(DEFAULT_WORK_DIR)
            logger.info(f'Removed work dir: {os.path.abspath(DEFAULT_WORK_DIR)} \n')

        logger.info('\nStarting run swift deploy ...')
        self.model_name = 'qwen-vl-chat'
        # stdout/stderr are inherited rather than piped: nothing ever reads the pipes, so a
        # chatty server could block once a pipe buffer fills up. No shell is used so that
        # terminate() signals the deploy process itself.
        self.process_swift_deploy = subprocess.Popen(
            ['swift', 'deploy', '--model_type', self.model_name], text=True)
        # Cleanups run even when setUp fails further down (tearDown would be skipped then).
        self.addCleanup(self._stop_process, self.process_swift_deploy)

        logger.info('\nStarting run swift deploy judge ...')
        self.judge_model_name = 'qwen2-7b-instruct'
        self.process_swift_deploy_judge = subprocess.Popen(
            ['swift', 'deploy', '--model_type', self.judge_model_name, '--port', '8866'], text=True)
        self.addCleanup(self._stop_process, self.process_swift_deploy_judge)

        self.all_datasets = VLMEvalKitBackendManager.list_supported_datasets()
        self.assertGreater(
            len(self.all_datasets), 0, f'Failed to list datasets from VLMEvalKit backend: {self.all_datasets}')

    def tearDown(self) -> None:
        """Stop both swift deploy services."""
        logger.warning('\nStopping swift deploy ...')
        try:
            self._stop_process(self.process_swift_deploy)
        finally:
            # Always attempt to stop the judge, even if stopping the first service raised.
            self._stop_process(self.process_swift_deploy_judge)

        logger.info('Process swift-deploy terminated successfully.')

    @staticmethod
    def _stop_process(process, timeout: int = 60):
        """Terminate ``process`` and wait for it; kill it if it does not exit within ``timeout`` seconds.

        Safe to call more than once for the same process.
        """
        if process.poll() is None:
            process.terminate()
        try:
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

    @staticmethod
    def _check_env(module_name: str):
        """Raise ``ModuleNotFoundError`` unless ``module_name`` is installed."""
        if is_module_installed(module_name):
            logger.info(f'{module_name} is installed.')
        else:
            raise ModuleNotFoundError(f'run: pip install {module_name}')

    @staticmethod
    def check_service_status(url: str, data: dict, retries: int = 20, delay: int = 10):
        """POST ``data`` as JSON to ``url`` until it answers with HTTP 200.

        Args:
            url: Endpoint to probe.
            data: JSON-serialisable request payload.
            retries: Maximum number of attempts.
            delay: Seconds to sleep between two attempts.

        Returns:
            True as soon as a response with status code 200 is received, False when
            all attempts are used up.
        """
        for attempt in range(1, retries + 1):
            try:
                logger.info(f'Attempt {attempt}: Checking service at {url} ...')
                response = requests.post(
                    url, data=json.dumps(data), headers={'Content-Type': 'application/json'}, timeout=30)
                if response.status_code == 200:
                    logger.info(f'Service at {url} is available !\n\n')
                    return True
                logger.info(f'Service at {url} returned status code {response.status_code}.')
            except requests.exceptions.RequestException as e:
                logger.info(f'Attempt {attempt}: An error occurred: {e}')

            # No point in waiting once the last attempt has failed.
            if attempt < retries:
                time.sleep(delay)

        logger.info(f'Service at {url} is not available after {retries} retries.')
        return False

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_api_with_judge(self):
        # Check the judge service status
        data = {'model': self.judge_model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
        self.assertTrue(
            self.check_service_status(DEFAULT_JUDGE_MODEL_URL, data=data),
            f'Failed to check judge service status: {DEFAULT_JUDGE_MODEL_URL}')

        api_base = DEFAULT_CHAT_MODEL_URL
        task_cfg = {
            'eval_backend': 'VLMEvalKit',
            'eval_config': {
                'LOCAL_LLM': self.judge_model_name,  # judge model id
                'OPENAI_API_BASE': DEFAULT_JUDGE_MODEL_URL,  # judge model api
                'OPENAI_API_KEY': DEFAULT_API_KEY,
                'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
                'limit': 20,
                'mode': 'all',
                'model': [{
                    'api_base': api_base,  # swift deploy model api
                    'key': DEFAULT_API_KEY,
                    'name': DEFAULT_MODEL_NAME,  # must be CustomAPIModel for swift
                    'temperature': 0.0,
                    'type': self.model_name,  # swift model type
                }],
                'nproc': 1,
                'reuse': True,
                'work_dir': DEFAULT_WORK_DIR
            }
        }

        # Check the service status
        data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
        self.assertTrue(self.check_service_status(api_base, data=data), f'Failed to check service status: {api_base}')

        logger.info(f'>> Start to run task: {task_cfg}')

        run_task(task_cfg)

        logger.info('>> Start to get the report with summarizer ...')
        report_list = Summarizer.get_report_from_cfg(task_cfg)
        logger.info(f'\n>> The report list: {report_list}')

        self.assertGreater(len(report_list), 0, f'Failed to get report list: {report_list}')
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
if __name__ == '__main__':
|
|
157
|
+
unittest.main(buffer=False)
|