evalscope 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  2. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  3. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  4. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  22. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  23. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  24. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  25. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  26. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  27. evalscope/benchmarks/race/samples.jsonl +5 -0
  28. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  29. evalscope/cli/start_perf.py +8 -11
  30. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  31. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  32. evalscope/metrics/rouge_metric.py +30 -15
  33. evalscope/perf/arguments.py +179 -0
  34. evalscope/perf/benchmark.py +245 -0
  35. evalscope/perf/http_client.py +127 -711
  36. evalscope/perf/main.py +35 -0
  37. evalscope/perf/plugin/__init__.py +2 -0
  38. evalscope/perf/plugin/api/__init__.py +3 -0
  39. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  40. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  41. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  42. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  43. evalscope/perf/plugin/datasets/__init__.py +6 -0
  44. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  45. evalscope/perf/plugin/datasets/custom.py +21 -0
  46. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  47. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  48. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  49. evalscope/perf/plugin/datasets/openqa.py +38 -0
  50. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  51. evalscope/perf/plugin/registry.py +54 -0
  52. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  53. evalscope/perf/utils/benchmark_util.py +135 -0
  54. evalscope/perf/utils/chat_service.py +252 -0
  55. evalscope/perf/utils/db_util.py +200 -0
  56. evalscope/perf/utils/handler.py +46 -0
  57. evalscope/perf/utils/local_server.py +139 -0
  58. evalscope/registry/config/cfg_arena.yaml +77 -0
  59. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  60. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  61. evalscope/registry/config/cfg_single.yaml +78 -0
  62. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  63. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  64. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  65. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  66. evalscope/registry/data/question.jsonl +80 -0
  67. evalscope/third_party/longbench_write/README.md +118 -0
  68. evalscope/third_party/longbench_write/default_task.json +27 -0
  69. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  70. evalscope/third_party/toolbench_static/README.md +118 -0
  71. evalscope/third_party/toolbench_static/config_default.json +15 -0
  72. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  73. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  74. evalscope/utils/logger.py +18 -20
  75. evalscope/utils/utils.py +41 -42
  76. evalscope/version.py +2 -2
  77. evalscope-0.7.0.dist-info/LICENSE +203 -0
  78. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/METADATA +91 -33
  79. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/RECORD +99 -29
  80. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
  81. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
  82. tests/cli/__init__.py +1 -0
  83. tests/cli/test_run.py +76 -0
  84. tests/perf/__init__.py +1 -0
  85. tests/perf/test_perf.py +96 -0
  86. tests/rag/test_clip_benchmark.py +85 -0
  87. tests/rag/test_mteb.py +136 -0
  88. tests/rag/test_ragas.py +120 -0
  89. tests/swift/__init__.py +1 -0
  90. tests/swift/test_run_swift_eval.py +146 -0
  91. tests/swift/test_run_swift_vlm_eval.py +128 -0
  92. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  93. tests/test_run_all.py +12 -0
  94. tests/vlm/__init__.py +1 -0
  95. tests/vlm/test_vlmeval.py +59 -0
  96. evalscope/perf/_logging.py +0 -32
  97. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  98. evalscope/perf/datasets/openqa.py +0 -22
  99. evalscope/perf/plugin_registry.py +0 -35
  100. evalscope/perf/query_parameters.py +0 -42
  101. evalscope/perf/server_sent_event.py +0 -43
  102. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  103. /evalscope/perf/{datasets → utils}/__init__.py +0 -0
  104. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
  105. {evalscope/preprocess → tests}/__init__.py +0 -0
  106. {evalscope/preprocess/tokenizers → tests/rag}/__init__.py +0 -0
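The bulk of this release is a reorganization of evalscope/perf: the old monolithic http_client.py loses roughly 700 lines, and its responsibilities move into arguments.py, benchmark.py, main.py, and the new plugin/ and utils/ subpackages, while dataset_plugin_base.py, plugin_registry.py, and query_parameters.py are renamed or dropped (compare the deleted datasets/longalpaca_12k.py at the bottom of this diff). For orientation only, a custom dataset plugin written against the old layout would now target the relocated modules roughly as sketched below. The module paths come from the file list above; the decorator, base-class, helper, and argument names (register_dataset, DatasetPluginBase, dataset_json_list, Arguments) are assumed to carry over from 0.6.1 and are not confirmed by this diff.

# Hypothetical custom dataset plugin under the 0.7.0 layout (names unverified).
from typing import Dict, Iterator, List

from evalscope.perf.arguments import Arguments  # assumed successor of query_parameters.QueryParameters
from evalscope.perf.plugin.datasets.base import DatasetPluginBase  # was evalscope/perf/dataset_plugin_base.py
from evalscope.perf.plugin.registry import register_dataset  # was evalscope/perf/plugin_registry.py


@register_dataset('my_jsonl')
class MyJsonlDatasetPlugin(DatasetPluginBase):
    """Yield one chat message list per prompt read from a JSON-list dataset file."""

    def __init__(self, query_parameters: Arguments):
        super().__init__(query_parameters)

    def build_messages(self) -> Iterator[List[Dict]]:
        # dataset_json_list() and the prompt-length fields mirror the 0.6.1 base-class API.
        for item in self.dataset_json_list(self.query_parameters.dataset_path):
            prompt = item['instruction'].strip()
            if self.query_parameters.min_prompt_length < len(prompt) < self.query_parameters.max_prompt_length:
                yield [{'role': 'user', 'content': prompt}]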
tests/rag/test_ragas.py ADDED
@@ -0,0 +1,120 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ import unittest
+ from evalscope.utils import test_level_list, is_module_installed
+ from evalscope.utils.logger import get_logger
+ from evalscope.run import run_task
+
+ logger = get_logger()
+
+
+ class TestRAGAS(unittest.TestCase):
+
+     def setUp(self) -> None:
+         self._check_env('ragas')
+
+     def tearDown(self) -> None:
+         pass
+
+     @staticmethod
+     def _check_env(module_name: str):
+         if is_module_installed(module_name):
+             logger.info(f'{module_name} is installed.')
+         else:
+             raise ModuleNotFoundError(f'run: pip install {module_name}')
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_generate_dataset(self):
+         task_cfg = {
+             'eval_backend': 'RAGEval',
+             'eval_config': {
+                 'tool': 'RAGAS',
+                 'testset_generation': {
+                     'docs': ['README_zh.md'],
+                     'test_size': 5,
+                     'output_file': 'outputs/testset.json',
+                     'distribution': {
+                         'simple': 0.5,
+                         'multi_context': 0.4,
+                         'reasoning': 0.1,
+                     },
+                     'generator_llm': {
+                         'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
+                         'template_type': 'qwen',
+                     },
+                     'embeddings': {
+                         'model_name_or_path': 'AI-ModelScope/m3e-base',
+                     },
+                     'language': 'chinese',
+                 },
+             },
+         }
+
+         logger.info(f'>> Start to run task: {task_cfg}')
+
+         run_task(task_cfg)
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_rag_eval(self):
+         task_cfg = {
+             'eval_backend': 'RAGEval',
+             'eval_config': {
+                 'tool': 'RAGAS',
+                 'eval': {
+                     'testset_file': 'outputs/testset_chinese_with_answer.json',
+                     'critic_llm': {
+                         'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
+                         'template_type': 'qwen',
+                     },
+                     'embeddings': {
+                         'model_name_or_path': 'AI-ModelScope/m3e-base',
+                     },
+                     'metrics': [
+                         'Faithfulness',
+                         'AnswerRelevancy',
+                         'ContextPrecision',
+                         'AnswerCorrectness',
+                     ],
+                 },
+             },
+         }
+
+         logger.info(f'>> Start to run task: {task_cfg}')
+
+         run_task(task_cfg)
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_rag_eval_api(self):
+         task_cfg = {
+             'eval_backend': 'RAGEval',
+             'eval_config': {
+                 'tool': 'RAGAS',
+                 'eval': {
+                     'testset_file': 'outputs/testset.json',
+                     'critic_llm': {
+                         'model_name': 'gpt-4o-mini',  # custom chat model name
+                         'api_base': 'http://127.0.0.1:8088/v1',  # custom base URL
+                         'api_key': 'xxxx',  # your API key
+                     },
+                     'embeddings': {
+                         'model_name_or_path': 'AI-ModelScope/m3e-base',
+                     },
+                     'metrics': [
+                         'Faithfulness',
+                         'AnswerRelevancy',
+                         'ContextPrecision',
+                         'AnswerCorrectness',
+                         'MultiModalFaithfulness',
+                         'MultiModalRelevance',
+                     ],
+                 },
+             },
+         }
+
+         logger.info(f'>> Start to run task: {task_cfg}')
+
+         run_task(task_cfg)
+
+
+ if __name__ == '__main__':
+     unittest.main(buffer=False)
tests/swift/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
tests/swift/test_run_swift_eval.py ADDED
@@ -0,0 +1,146 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import os
+ import subprocess
+ import time
+ import unittest
+
+ import json
+ import requests
+
+ from evalscope.backend.opencompass import OpenCompassBackendManager
+ from evalscope.run import run_task
+ from evalscope.summarizer import Summarizer
+ from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+ DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
+ DEFAULT_BASE_MODEL_URL = 'http://127.0.0.1:8001/v1/completions'
+
+
+ class TestRunSwiftEval(unittest.TestCase):
+
+     def setUp(self) -> None:
+         logger.info('Init env for swift-eval UTs ...\n')
+
+         self.model_name = 'qwen2_5-0_5b-instruct'
+         assert is_module_installed('evalscope'), 'Please install `evalscope` from pypi or source code.'
+
+         if not is_module_installed('opencompass'):
+             logger.warning('Note: installing ms-opencompass ...')
+             subprocess.run('pip3 install ms-opencompass -U', shell=True, check=True)
+
+         if not is_module_installed('swift'):
+             logger.warning('Note: installing ms-swift ...')
+             subprocess.run('pip3 install ms-swift[llm]', shell=True, check=True)
+
+         logger.warning('vllm not installed, using native swift deploy service instead.')
+
+         logger.info('\nStarting swift deploy ...')
+         self.process_swift_deploy = subprocess.Popen(
+             f'swift deploy --model_type {self.model_name} --infer_backend pt', text=True, shell=True)
+         if self.process_swift_deploy.stderr:
+             logger.info(f'swift deploy log info: {self.process_swift_deploy.stderr}')
+
+         self.all_datasets = OpenCompassBackendManager.list_datasets()
+         assert len(self.all_datasets) > 0, f'Failed to list datasets from OpenCompass backend: {self.all_datasets}'
+
+     def tearDown(self) -> None:
+         # Stop the swift deploy model service
+         logger.warning('\nStopping swift deploy ...')
+         self.process_swift_deploy.terminate()
+         self.process_swift_deploy.wait()
+         logger.info('Process swift-deploy terminated successfully.')
+
+     @staticmethod
+     def find_and_kill_pid(pids: list):
+         if len(pids) > 0:
+             for pid in pids:
+                 subprocess.run(['kill', str(pid)])
+                 logger.warning(f'Killed process {pid}.')
+         else:
+             logger.info('No pids found.')
+
+     @staticmethod
+     def find_and_kill_service(service_name):
+         try:
+             # find pid
+             result = subprocess.run(['ps', '-ef'], stdout=subprocess.PIPE, text=True)
+
+             lines = result.stdout.splitlines()
+             pids = []
+             for line in lines:
+                 if service_name in line and 'grep' not in line:
+                     parts = line.split()
+                     pid = parts[1]
+                     pids.append(pid)
+
+             if not pids:
+                 logger.info(f'No process found for {service_name}.')
+             else:
+                 for pid in pids:
+                     subprocess.run(['kill', pid])
+                     logger.warning(f'Killed process {pid} for service {service_name}.')
+         except Exception as e:
+             logger.error(f'An error occurred: {e}')
+
+     @staticmethod
+     def check_service_status(url: str, data: dict, retries: int = 30, delay: int = 10):
+         for i in range(retries):
+             try:
+                 logger.info(f'Attempt {i + 1}: Checking service at {url} ...')
+                 response = requests.post(
+                     url, data=json.dumps(data), headers={'Content-Type': 'application/json'}, timeout=30)
+                 if response.status_code == 200:
+                     logger.info(f'Service at {url} is available !\n\n')
+                     return True
+                 else:
+                     logger.info(f'Service at {url} returned status code {response.status_code}.')
+             except requests.exceptions.RequestException as e:
+                 logger.info(f'Attempt {i + 1}: An error occurred: {e}')
+
+             time.sleep(delay)
+
+         logger.info(f'Service at {url} is not available after {retries} retries.')
+         return False
+
+     @unittest.skipUnless(1 in test_level_list(), 'skip test in current test level')
+     def test_run_task(self):
+         # Prepare the config
+         task_cfg = dict(
+             eval_backend='OpenCompass',
+             eval_config={
+                 'datasets': ['cmb', 'bbh', 'ceval', 'ARC_e', 'gsm8k'],
+                 'models': [
+                     {
+                         'path': self.model_name,
+                         'openai_api_base': DEFAULT_CHAT_MODEL_URL,
+                         'batch_size': 8
+                     },
+                 ],
+                 'work_dir': 'outputs/llama3_eval_result',
+                 'reuse': None,  # string, `latest` or a timestamp, e.g. `20230516_144254`; defaults to None
+                 'limit': 5,  # string, int or float, e.g. `[2:5]`, 5, 5.0; defaults to None (run all examples)
+             },
+         )
+
+         # Check the service status
+         data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
+         assert self.check_service_status(
+             DEFAULT_CHAT_MODEL_URL, data=data), f'Failed to check service status: {DEFAULT_CHAT_MODEL_URL}'
+
+         # Submit the task
+         logger.info(f'Start to run UT with cfg: {task_cfg}')
+         run_task(task_cfg=task_cfg)
+
+         # Get the final report with summarizer
+         report_list = Summarizer.get_report_from_cfg(task_cfg)
+         logger.info(f'>>The report list:\n{report_list}')
+
+         assert len(report_list) > 0, f'Failed to get report list: {report_list}'
+
+
+ if __name__ == '__main__':
+     unittest.main()
tests/swift/test_run_swift_vlm_eval.py ADDED
@@ -0,0 +1,128 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ import shutil
+ import subprocess
+ import time
+ import unittest
+
+ import json
+ import requests
+
+ from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
+ from evalscope.run import run_task
+ from evalscope.summarizer import Summarizer
+ from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+ DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
+ DEFAULT_API_KEY = 'EMPTY'
+ DEFAULT_MODEL_NAME = 'CustomAPIModel'
+ DEFAULT_WORK_DIR = 'outputs/qwen-vl-chat'
+
+
+ class TestRunSwiftVLMEval(unittest.TestCase):
+
+     def setUp(self) -> None:
+         logger.info('Init env for swift-eval UTs ...\n')
+         assert is_module_installed('evalscope'), 'Please install `evalscope` from pypi or source code.'
+
+         if not is_module_installed('vlmeval'):
+             logger.warning('Note: installing ms-vlmeval ...')
+             subprocess.run('pip3 install ms-vlmeval -U', shell=True, check=True)
+
+         if not is_module_installed('swift'):
+             logger.warning('Note: installing ms-swift ...')
+             subprocess.run('pip3 install ms-swift -U', shell=True, check=True)
+
+         if os.path.exists(DEFAULT_WORK_DIR):
+             shutil.rmtree(DEFAULT_WORK_DIR)
+             logger.info(f'Removed work dir: {os.path.abspath(DEFAULT_WORK_DIR)} \n')
+
+         logger.info('\nStarting swift deploy ...')
+         self.model_name = 'qwen-vl-chat'
+         self.process_swift_deploy = subprocess.Popen(
+             f'swift deploy --model_type {self.model_name} --infer_backend pt', text=True, shell=True)
+
+         self.all_datasets = VLMEvalKitBackendManager.list_supported_datasets()
+         assert len(self.all_datasets) > 0, f'Failed to list datasets from VLMEvalKit backend: {self.all_datasets}'
+
+     def tearDown(self) -> None:
+         # Stop the swift deploy model service
+         logger.warning('Stopping swift deploy ...')
+         self.process_swift_deploy.terminate()
+         self.process_swift_deploy.wait()
+         logger.info('Process swift-deploy terminated successfully.')
+
+     @staticmethod
+     def _check_env(module_name: str):
+         if is_module_installed(module_name):
+             logger.info(f'{module_name} is installed.')
+         else:
+             raise ModuleNotFoundError(f'run: pip install {module_name}')
+
+     @staticmethod
+     def check_service_status(url: str, data: dict, retries: int = 20, delay: int = 10):
+         for i in range(retries):
+             try:
+                 logger.info(f'Attempt {i + 1}: Checking service at {url} ...')
+                 response = requests.post(
+                     url, data=json.dumps(data), headers={'Content-Type': 'application/json'}, timeout=30)
+                 if response.status_code == 200:
+                     logger.info(f'Service at {url} is available !\n\n')
+                     return True
+                 else:
+                     logger.info(f'Service at {url} returned status code {response.status_code}.')
+             except requests.exceptions.RequestException as e:
+                 logger.info(f'Attempt {i + 1}: An error occurred: {e}')
+
+             time.sleep(delay)
+
+         logger.info(f'Service at {url} is not available after {retries} retries.')
+         return False
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_api(self):
+         api_base = DEFAULT_CHAT_MODEL_URL
+         task_cfg = {
+             'eval_backend': 'VLMEvalKit',
+             'eval_config': {
+                 'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
+                 'limit': 30,
+                 'mode': 'all',
+                 'model': [{
+                     'api_base': api_base,  # swift deploy model api
+                     'key': DEFAULT_API_KEY,
+                     'name': DEFAULT_MODEL_NAME,  # must be CustomAPIModel for swift
+                     'temperature': 0.0,
+                     'type': self.model_name  # swift model type
+                 }],
+                 'nproc': 1,
+                 'reuse': True,
+                 'work_dir': DEFAULT_WORK_DIR
+             }
+         }
+
+         # Check the service status
+         data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
+         assert self.check_service_status(api_base, data=data), f'Failed to check service status: {api_base}'
+
+         logger.info(f'>> Start to run task: {task_cfg}')
+
+         run_task(task_cfg)
+
+         logger.info('>> Start to get the report with summarizer ...')
+         report_list = Summarizer.get_report_from_cfg(task_cfg)
+         logger.info(f'\n>> The report list: {report_list}')
+
+         assert len(report_list) > 0, f'Failed to get report list: {report_list}'
+
+
+ if __name__ == '__main__':
+     unittest.main(buffer=False)
tests/swift/test_run_swift_vlm_jugde_eval.py ADDED
@@ -0,0 +1,157 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ import shutil
+ import subprocess
+ import time
+ import unittest
+
+ import json
+ import requests
+
+ from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
+ from evalscope.run import run_task
+ from evalscope.summarizer import Summarizer
+ from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+ DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
+ DEFAULT_JUDGE_MODEL_URL = 'http://127.0.0.1:8866/v1/chat/completions'
+
+ DEFAULT_API_KEY = 'EMPTY'
+ DEFAULT_MODEL_NAME = 'CustomAPIModel'
+ DEFAULT_WORK_DIR = 'outputs'
+
+
+ class TestRunSwiftVLMEval(unittest.TestCase):
+
+     def setUp(self) -> None:
+         logger.info('Init env for swift-eval UTs ...\n')
+         assert is_module_installed('evalscope'), 'Please install `evalscope` from pypi or source code.'
+
+         logger.warning('Note: installing ms-vlmeval ...')
+         subprocess.run('pip3 install ms-vlmeval -U', shell=True, check=True)
+
+         logger.warning('Note: installing ms-swift ...')
+         subprocess.run('pip3 install ms-swift -U', shell=True, check=True)
+
+         if os.path.exists(DEFAULT_WORK_DIR):
+             shutil.rmtree(DEFAULT_WORK_DIR)
+             logger.info(f'Removed work dir: {os.path.abspath(DEFAULT_WORK_DIR)} \n')
+
+         logger.info('\nStarting swift deploy ...')
+         self.model_name = 'qwen-vl-chat'
+         self.process_swift_deploy = subprocess.Popen(
+             f'swift deploy --model_type {self.model_name}',
+             text=True,
+             shell=True,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE)
+
+         logger.info('\nStarting swift deploy for the judge model ...')
+         self.judge_model_name = 'qwen2-7b-instruct'
+         self.process_swift_deploy_judge = subprocess.Popen(
+             f'swift deploy --model_type {self.judge_model_name} --port 8866',
+             text=True,
+             shell=True,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE)
+
+         self.all_datasets = VLMEvalKitBackendManager.list_supported_datasets()
+         assert len(self.all_datasets) > 0, f'Failed to list datasets from VLMEvalKit backend: {self.all_datasets}'
+
+     def tearDown(self) -> None:
+         # Stop the swift deploy model services
+         logger.warning('\nStopping swift deploy ...')
+         self.process_swift_deploy.terminate()
+         self.process_swift_deploy.wait()
+
+         self.process_swift_deploy_judge.terminate()
+         self.process_swift_deploy_judge.wait()
+
+         logger.info('Process swift-deploy terminated successfully.')
+
+     @staticmethod
+     def _check_env(module_name: str):
+         if is_module_installed(module_name):
+             logger.info(f'{module_name} is installed.')
+         else:
+             raise ModuleNotFoundError(f'run: pip install {module_name}')
+
+     @staticmethod
+     def check_service_status(url: str, data: dict, retries: int = 20, delay: int = 10):
+         for i in range(retries):
+             try:
+                 logger.info(f'Attempt {i + 1}: Checking service at {url} ...')
+                 response = requests.post(
+                     url, data=json.dumps(data), headers={'Content-Type': 'application/json'}, timeout=30)
+                 if response.status_code == 200:
+                     logger.info(f'Service at {url} is available !\n\n')
+                     return True
+                 else:
+                     logger.info(f'Service at {url} returned status code {response.status_code}.')
+             except requests.exceptions.RequestException as e:
+                 logger.info(f'Attempt {i + 1}: An error occurred: {e}')
+
+             time.sleep(delay)
+
+         logger.info(f'Service at {url} is not available after {retries} retries.')
+         return False
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_api_with_judge(self):
+         # Check the judge service status
+         data = {'model': self.judge_model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
+         assert self.check_service_status(DEFAULT_JUDGE_MODEL_URL, data=data), \
+             f'Failed to check judge service status: {DEFAULT_JUDGE_MODEL_URL}'
+
+         api_base = DEFAULT_CHAT_MODEL_URL
+         task_cfg = {
+             'eval_backend': 'VLMEvalKit',
+             'eval_config': {
+                 'LOCAL_LLM': self.judge_model_name,  # judge model id
+                 'OPENAI_API_BASE': DEFAULT_JUDGE_MODEL_URL,  # judge model api
+                 'OPENAI_API_KEY': DEFAULT_API_KEY,
+                 'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
+                 'limit': 20,
+                 'mode': 'all',
+                 'model': [{
+                     'api_base': api_base,  # swift deploy model api
+                     'key': DEFAULT_API_KEY,
+                     'name': DEFAULT_MODEL_NAME,  # must be CustomAPIModel for swift
+                     'temperature': 0.0,
+                     'type': self.model_name  # swift model type
+                 }],
+                 'nproc': 1,
+                 'reuse': True,
+                 'work_dir': DEFAULT_WORK_DIR
+             }
+         }
+
+         # Check the service status
+         data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
+         assert self.check_service_status(api_base, data=data), f'Failed to check service status: {api_base}'
+
+         logger.info(f'>> Start to run task: {task_cfg}')
+
+         run_task(task_cfg)
+
+         logger.info('>> Start to get the report with summarizer ...')
+         report_list = Summarizer.get_report_from_cfg(task_cfg)
+         logger.info(f'\n>> The report list: {report_list}')
+
+         assert len(report_list) > 0, f'Failed to get report list: {report_list}'
+
+
+ if __name__ == '__main__':
+     unittest.main(buffer=False)
tests/test_run_all.py ADDED
@@ -0,0 +1,12 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import subprocess
+
+ if __name__ == '__main__':
+     cmd = 'TEST_LEVEL_LIST=0,1 python3 -m unittest discover tests'
+     run_res = subprocess.run(cmd, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+     if run_res.returncode == 0:
+         print(f'>>test_run_all stdout: {run_res.stdout}')
+     else:
+         print(f'>>test_run_all stderr: {run_res.stderr}')
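tests/test_run_all.py shows the convention the new tests rely on: the skipUnless(... in test_level_list(), ...) guards are driven by the TEST_LEVEL_LIST environment variable, and the runner exports 0,1 before discovering the whole tests package. A single module can be exercised the same way; the snippet below is a minimal sketch reusing that convention (only the variable name and the test paths come from this diff, the rest is plain unittest usage).

# Run just the new RAGAS tests at level 0, mirroring tests/test_run_all.py.
import subprocess

subprocess.run('TEST_LEVEL_LIST=0 python3 -m unittest tests.rag.test_ragas -v', shell=True, check=True)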
tests/vlm/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
tests/vlm/test_vlmeval.py ADDED
@@ -0,0 +1,59 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import subprocess
+ import unittest
+
+ from evalscope.run import run_task
+ from evalscope.summarizer import Summarizer
+ from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class TestVLMEval(unittest.TestCase):
+
+     def setUp(self) -> None:
+         self._check_env('vlmeval')
+
+     def tearDown(self) -> None:
+         pass
+
+     @staticmethod
+     def _check_env(module_name: str):
+         if is_module_installed(module_name):
+             logger.info(f'{module_name} is installed.')
+         else:
+             raise ModuleNotFoundError(f'run: pip install {module_name}')
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_vlm_eval_local(self):
+         task_cfg = {
+             'eval_backend': 'VLMEvalKit',
+             'eval_config': {
+                 'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
+                 'limit': 20,
+                 'mode': 'all',
+                 'model': [{
+                     'name': 'qwen-vl-chat',  # model name for VLMEval config
+                     'model_path': '../models/Qwen-VL-Chat'
+                 }],
+                 'nproc': 1,
+                 'reuse': True,
+                 'work_dir': 'outputs'
+             }
+         }
+
+         logger.info(f'>> Start to run task: {task_cfg}')
+
+         run_task(task_cfg)
+
+         logger.info('>> Start to get the report with summarizer ...')
+         report_list = Summarizer.get_report_from_cfg(task_cfg)
+         logger.info(f'\n>>The report list: {report_list}')
+
+         assert len(report_list) > 0, f'Failed to get report list: {report_list}'
+
+
+ if __name__ == '__main__':
+     unittest.main(buffer=False)
evalscope/perf/_logging.py DELETED
@@ -1,32 +0,0 @@
- import logging
- import os
-
-
- logger = logging.getLogger('perf')
-
-
- def enable_logging():
-     level = os.environ.get('LOGGING_LEVEL', 'info')
-     if level is not None:  # set logging level.
-         if level not in ['info', 'debug']:
-             # set logging level env, but invalid value, use default.
-             level = 'info'
-         if level == 'info':
-             logger.setLevel(logging.INFO)
-         else:
-             logger.setLevel(logging.DEBUG)
-     # set default logging handler
-     console_handler = logging.StreamHandler()
-     formatter = logging.Formatter(
-         '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(levelname)s - %(message)s'  # noqa E501
-     )
-     # formatter = logging.Formatter(
-     #     '%(asctime)s - %(name)s - %(levelname)s - %(message)s'  # noqa E501
-     # )
-     console_handler.setFormatter(formatter)
-     logger.addHandler(console_handler)
-
-
- # in release disable dashscope log
- # you can enable dashscope log for debugger.
- enable_logging()
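With _logging.py gone, there is no perf-specific logger module left; the added tests above consistently pull their logger from evalscope.utils.logger instead, and perf code presumably does the same. A minimal sketch of the replacement pattern (the get_logger call with a name argument is taken from the tests in this diff; any further configuration options are not shown here):

from evalscope.utils.logger import get_logger

# Replaces the hand-configured module-level 'perf' logger from _logging.py.
logger = get_logger(__name__)
logger.info('perf benchmark starting ...')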
evalscope/perf/datasets/longalpaca_12k.py DELETED
@@ -1,20 +0,0 @@
- import sys
- from typing import Any, Dict, Iterator, List
- from evalscope.perf.dataset_plugin_base import DatasetPluginBase
-
- from evalscope.perf.plugin_registry import register_dataset
- from evalscope.perf.query_parameters import QueryParameters
-
- @register_dataset('longalpaca')
- class LongAlpacaDatasetPlugin(DatasetPluginBase):
-     """Read data from file which is list of requests.
-     Sample: https://huggingface.co/datasets/Yukang/LongAlpaca-12k
-     """
-     def __init__(self, query_parameters: QueryParameters):
-         super().__init__(query_parameters)
-
-     def build_messages(self) -> Iterator[List[Dict]]:
-         for item in self.dataset_json_list(self.query_parameters.dataset_path):
-             prompt = item['instruction'].strip()
-             if len(prompt) > self.query_parameters.min_prompt_length and len(prompt) < self.query_parameters.max_prompt_length:
-                 yield [{'role': 'user', 'content': prompt}]