evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (176) hide show
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
tests/rag/test_clip_benchmark.py DELETED
@@ -1,87 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
- import os
3
- import subprocess
4
- import unittest
5
-
6
- from evalscope.run import run_task
7
- from evalscope.utils.import_utils import is_module_installed
8
- from evalscope.utils.logger import get_logger
9
- from tests.utils import test_level_list
10
-
11
- logger = get_logger()
12
-
13
-
14
- class TestCLIPBenchmark(unittest.TestCase):
15
-
16
- def setUp(self) -> None:
17
- self._check_env('webdataset')
18
-
19
- def tearDown(self) -> None:
20
- pass
21
-
22
- @staticmethod
23
- def _check_env(module_name: str):
24
- if is_module_installed(module_name):
25
- logger.info(f'{module_name} is installed.')
26
- else:
27
- raise ModuleNotFoundError(f'run: pip install {module_name}')
28
-
29
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
30
- def test_run_task(self):
31
- task_cfg = {
32
- 'eval_backend': 'RAGEval',
33
- 'eval_config': {
34
- 'tool': 'clip_benchmark',
35
- 'eval': {
36
- 'models': [
37
- {
38
- 'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
39
- }
40
- ],
41
- 'dataset_name': [
42
- 'muge',
43
- 'mnist',
44
- 'flickr8k'
45
- ],
46
- 'split': 'test',
47
- 'batch_size': 128,
48
- 'num_workers': 1,
49
- 'verbose': True,
50
- 'skip_existing': False,
51
- 'cache_dir': 'cache',
52
- 'limit': 1000,
53
- },
54
- },
55
- }
56
-
57
- run_task(task_cfg)
58
-
59
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
60
- def test_run_custom(self):
61
- task_cfg = {
62
- 'eval_backend': 'RAGEval',
63
- 'eval_config': {
64
- 'tool': 'clip_benchmark',
65
- 'eval': {
66
- 'models': [
67
- {
68
- 'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
69
- }
70
- ],
71
- 'dataset_name': ['custom'],
72
- 'data_dir': 'custom_eval/multimodal/text-image-retrieval',
73
- 'split': 'test',
74
- 'batch_size': 128,
75
- 'num_workers': 1,
76
- 'verbose': True,
77
- 'skip_existing': False,
78
- 'limit': 1000,
79
- },
80
- },
81
- }
82
-
83
- run_task(task_cfg)
84
-
85
-
86
- if __name__ == '__main__':
87
- unittest.main(buffer=False)
tests/rag/test_mteb.py DELETED
@@ -1,213 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
-
3
- import unittest
4
- from dotenv import dotenv_values
5
-
6
- from tests.utils import test_level_list
7
-
8
- env = dotenv_values('.env')
9
- from evalscope.run import run_task
10
- from evalscope.utils.import_utils import is_module_installed
11
- from evalscope.utils.logger import get_logger
12
-
13
- logger = get_logger()
14
-
15
-
16
- class TestMTEB(unittest.TestCase):
17
-
18
- def setUp(self) -> None:
19
- self._check_env('mteb')
20
-
21
- def tearDown(self) -> None:
22
- pass
23
-
24
- @staticmethod
25
- def _check_env(module_name: str):
26
- if is_module_installed(module_name):
27
- logger.info(f'{module_name} is installed.')
28
- else:
29
- raise ModuleNotFoundError(f'run: pip install {module_name}')
30
-
31
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
32
- def test_run_one_stage_mteb(self):
33
- task_cfg = {
34
- 'eval_backend': 'RAGEval',
35
- 'eval_config': {
36
- 'tool': 'MTEB',
37
- 'model': [
38
- {
39
- 'model_name_or_path': 'AI-ModelScope/m3e-base',
40
- 'pooling_mode': None, # load from model config
41
- 'max_seq_length': 512,
42
- 'prompt': '',
43
- 'model_kwargs': {'torch_dtype': 'auto'},
44
- 'encode_kwargs': {
45
- 'batch_size': 128,
46
- },
47
- }
48
- ],
49
- 'eval': {
50
- 'tasks': [
51
- 'TNews',
52
- 'CLSClusteringS2S',
53
- 'T2Reranking',
54
- 'T2Retrieval',
55
- 'ATEC',
56
- ],
57
- 'verbosity': 2,
58
- 'overwrite_results': True,
59
- 'limits': 500,
60
- },
61
- },
62
- }
63
-
64
- run_task(task_cfg)
65
-
66
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
67
- def test_run_one_stage_api(self):
68
- from evalscope import TaskConfig
69
- task_cfg = TaskConfig(
70
- eval_backend='RAGEval',
71
- eval_config={
72
- 'tool': 'MTEB',
73
- 'model': [
74
- {
75
- 'model_name': 'text-embedding-v3',
76
- 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
77
- 'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
78
- 'dimensions': 1024,
79
- 'encode_kwargs': {
80
- 'batch_size': 10,
81
- },
82
- }
83
- ],
84
- 'eval': {
85
- 'tasks': [
86
- 'T2Retrieval',
87
- ],
88
- 'verbosity': 2,
89
- 'overwrite_results': True,
90
- 'limits': 10,
91
- },
92
- },
93
- )
94
-
95
- run_task(task_cfg)
96
-
97
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
98
- def test_run_two_stage_mteb(self):
99
- task_cfg = {
100
- 'eval_backend': 'RAGEval',
101
- 'eval_config': {
102
- 'tool': 'MTEB',
103
- 'model': [
104
- {
105
- 'model_name_or_path': 'AI-ModelScope/m3e-base',
106
- 'is_cross_encoder': False,
107
- 'max_seq_length': 512,
108
- 'prompt': '',
109
- 'model_kwargs': {'torch_dtype': 'auto'},
110
- 'encode_kwargs': {
111
- 'batch_size': 64,
112
- },
113
- },
114
- {
115
- 'model_name_or_path': 'BAAI/bge-reranker-v2-m3',
116
- 'is_cross_encoder': True,
117
- 'max_seq_length': 512,
118
- 'prompt': '为这个问题生成一个检索用的表示',
119
- 'model_kwargs': {'torch_dtype': 'auto'},
120
- 'encode_kwargs': {
121
- 'batch_size': 32,
122
- },
123
- },
124
- ],
125
- 'eval': {
126
- 'tasks': [
127
- 'MedicalRetrieval',
128
- 'T2Retrieval'
129
- ],
130
- 'verbosity': 2,
131
- 'overwrite_results': True,
132
- 'limits': 10,
133
- 'top_k': 10,
134
- },
135
- },
136
- }
137
-
138
- run_task(task_cfg)
139
-
140
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
141
- def test_run_two_stage_api(self):
142
- task_cfg = {
143
- 'eval_backend': 'RAGEval',
144
- 'eval_config': {
145
- 'tool': 'MTEB',
146
- 'model': [
147
- {
148
- 'model_name': 'text-embedding-v3',
149
- 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
150
- 'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
151
- 'dimensions': 1024,
152
- 'encode_kwargs': {
153
- 'batch_size': 10,
154
- },
155
- },
156
- {
157
- 'model_name': 'text-embedding-v3',
158
- 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
159
- 'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
160
- 'dimensions': 1024,
161
- 'encode_kwargs': {
162
- 'batch_size': 10,
163
- },
164
- },
165
- ],
166
- 'eval': {
167
- 'tasks': [
168
- 'MedicalRetrieval',
169
- # 'T2Retrieval'
170
- ],
171
- 'verbosity': 2,
172
- 'overwrite_results': True,
173
- 'limits': 10,
174
- 'top_k': 10,
175
- },
176
- },
177
- }
178
-
179
- run_task(task_cfg)
180
-
181
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
182
- def test_run_custom(self):
183
- task_cfg = {
184
- 'eval_backend': 'RAGEval',
185
- 'eval_config': {
186
- 'tool': 'MTEB',
187
- 'model': [
188
- {
189
- 'model_name_or_path': 'AI-ModelScope/m3e-base',
190
- 'pooling_mode': None, # load from model config
191
- 'max_seq_length': 512,
192
- 'prompt': '',
193
- 'model_kwargs': {'torch_dtype': 'auto'},
194
- 'encode_kwargs': {
195
- 'batch_size': 128,
196
- },
197
- }
198
- ],
199
- 'eval': {
200
- 'tasks': ['CustomRetrieval'],
201
- 'dataset_path': 'custom_eval/text/retrieval',
202
- 'verbosity': 2,
203
- 'overwrite_results': True,
204
- 'limits': 500,
205
- },
206
- },
207
- }
208
-
209
- run_task(task_cfg)
210
-
211
-
212
- if __name__ == '__main__':
213
- unittest.main(buffer=False)
tests/rag/test_ragas.py DELETED
@@ -1,128 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
- import os
3
- from dotenv import dotenv_values
4
-
5
- from tests.utils import test_level_list
6
-
7
- env = dotenv_values('.env')
8
- import unittest
9
-
10
- from evalscope import TaskConfig, run_task
11
- from evalscope.utils.import_utils import is_module_installed
12
- from evalscope.utils.logger import get_logger
13
-
14
- logger = get_logger()
15
-
16
-
17
- class TestRAGAS(unittest.TestCase):
18
-
19
- def setUp(self) -> None:
20
- self._check_env('ragas')
21
-
22
- def tearDown(self) -> None:
23
- pass
24
-
25
- @staticmethod
26
- def _check_env(module_name: str):
27
- if is_module_installed(module_name):
28
- logger.info(f'{module_name} is installed.')
29
- else:
30
- raise ModuleNotFoundError(f'run: pip install {module_name}')
31
-
32
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
33
- def test_run_generate_dataset(self):
34
- task_cfg = {
35
- 'eval_backend': 'RAGEval',
36
- 'eval_config': {
37
- 'tool': 'RAGAS',
38
- 'testset_generation': {
39
- 'docs': ['README_zh.md'],
40
- 'test_size': 5,
41
- 'output_file': 'outputs/testset.json',
42
- 'generator_llm': {
43
- 'model_name': 'qwen-plus', # 自定义聊天模型名称
44
- 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1', # 自定义基础URL
45
- 'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'), # 自定义API密钥
46
- },
47
- 'embeddings': {
48
- 'model_name_or_path': 'AI-ModelScope/m3e-base',
49
- },
50
- 'language': 'chinese',
51
- },
52
- },
53
- }
54
-
55
- logger.info(f'>> Start to run task: {task_cfg}')
56
-
57
- run_task(task_cfg)
58
-
59
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
60
- def test_run_rag_eval(self):
61
- task_cfg = {
62
- 'eval_backend': 'RAGEval',
63
- 'eval_config': {
64
- 'tool': 'RAGAS',
65
- 'eval': {
66
- 'testset_file': 'outputs/testset_chinese_with_answer.json',
67
- 'critic_llm': {
68
- 'model_name_or_path': 'Qwen/Qwen2.5-7B-Instruct',
69
- },
70
- 'embeddings': {
71
- 'model_name_or_path': 'AI-ModelScope/m3e-base',
72
- },
73
- 'metrics': [
74
- 'Faithfulness',
75
- 'AnswerRelevancy',
76
- 'ContextPrecision',
77
- 'AnswerCorrectness',
78
- ],
79
- },
80
- },
81
- }
82
-
83
- logger.info(f'>> Start to run task: {task_cfg}')
84
-
85
- run_task(task_cfg)
86
-
87
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
88
- def test_run_rag_eval_api(self):
89
- from evalscope.backend.rag_eval.ragas.arguments import EvaluationArguments
90
- task_cfg = TaskConfig(
91
- eval_backend='RAGEval',
92
- eval_config=dict(
93
- tool='RAGAS',
94
- eval=EvaluationArguments(
95
- testset_file='outputs/testset_chinese_with_answer_small.json',
96
- critic_llm={
97
- 'model_name': 'qwen-plus', # 自定义聊天模型名称
98
- 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1', # 自定义基础URL
99
- 'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'), # 自定义API密钥
100
- },
101
- embeddings={
102
- 'model_name': 'text-embedding-v1',
103
- 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
104
- 'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
105
- 'dimensions': 1024,
106
- 'encode_kwargs': {
107
- 'batch_size': 10,
108
- },
109
- },
110
- metrics=[
111
- 'Faithfulness',
112
- 'AnswerRelevancy',
113
- 'ContextPrecision',
114
- 'AnswerCorrectness',
115
- # 'MultiModalFaithfulness',
116
- # 'MultiModalRelevance',
117
- ],
118
- ),
119
- ),
120
- )
121
-
122
- logger.info(f'>> Start to run task: {task_cfg}')
123
-
124
- run_task(task_cfg)
125
-
126
-
127
- if __name__ == '__main__':
128
- unittest.main(buffer=False)
tests/swift/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
tests/swift/test_run_swift_eval.py DELETED
@@ -1,146 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
-
3
- import json
4
- import os
5
- import requests
6
- import subprocess
7
- import time
8
- import unittest
9
-
10
- from evalscope.backend.opencompass import OpenCompassBackendManager
11
- from evalscope.run import run_task
12
- from evalscope.summarizer import Summarizer
13
- from evalscope.utils.import_utils import is_module_installed
14
- from evalscope.utils.logger import get_logger
15
- from tests.utils import test_level_list
16
-
17
- logger = get_logger(__name__)
18
-
19
- DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
20
- DEFAULT_BASE_MODEL_URL = 'http://127.0.0.1:8001/v1/completions'
21
-
22
-
23
- class TestRunSwiftEval(unittest.TestCase):
24
-
25
- def setUp(self) -> None:
26
- logger.info('Init env for swift-eval UTs ...\n')
27
-
28
- self.model_name = 'qwen2_5-0_5b-instruct'
29
- assert is_module_installed('evalscope'), 'Please install `evalscope` from pypi or source code.'
30
-
31
- if not is_module_installed('opencompass'):
32
- logger.warning('Note: installing ms-opencompass ...')
33
- subprocess.run('pip3 install ms-opencompass -U', shell=True, check=True)
34
-
35
- if not is_module_installed('swift'):
36
- logger.warning('Note: installing ms-swift ...')
37
- subprocess.run('pip3 install ms-swift[llm]', shell=True, check=True)
38
-
39
- logger.warning('vllm not installed, use native swift deploy service instead.')
40
-
41
- logger.info('\nStaring run swift deploy ...')
42
- self.process_swift_deploy = subprocess.Popen(
43
- f'swift deploy --model_type {self.model_name} --infer_backend pt', text=True, shell=True)
44
- if self.process_swift_deploy.stderr:
45
- logger.info(f'swift deploy log info: {self.process_swift_deploy.stderr}')
46
-
47
- self.all_datasets = OpenCompassBackendManager.list_datasets()
48
- assert len(self.all_datasets) > 0, f'Failed to list datasets from OpenCompass backend: {self.all_datasets}'
49
-
50
- def tearDown(self) -> None:
51
- # Stop the swift deploy model service
52
- logger.warning('\nStopping swift deploy ...')
53
- self.process_swift_deploy.terminate()
54
- self.process_swift_deploy.wait()
55
- logger.info('Process swift-deploy terminated successfully.')
56
-
57
- @staticmethod
58
- def find_and_kill_pid(pids: list):
59
- if len(pids) > 0:
60
- for pid in pids:
61
- subprocess.run(['kill', str(pid)])
62
- logger.warning(f'Killed process {pid}.')
63
- else:
64
- logger.info('No pids found.')
65
-
66
- @staticmethod
67
- def find_and_kill_service(service_name):
68
- try:
69
- # find pid
70
- result = subprocess.run(['ps', '-ef'], stdout=subprocess.PIPE, text=True)
71
-
72
- lines = result.stdout.splitlines()
73
- pids = []
74
- for line in lines:
75
- if service_name in line and 'grep' not in line:
76
- parts = line.split()
77
- pid = parts[1]
78
- pids.append(pid)
79
-
80
- if not pids:
81
- logger.info(f'No process found for {service_name}.')
82
- else:
83
- for pid in pids:
84
- subprocess.run(['kill', pid])
85
- logger.warning(f'Killed process {pid} for service {service_name}.')
86
- except Exception as e:
87
- logger.error(f'An error occurred: {e}')
88
-
89
- @staticmethod
90
- def check_service_status(url: str, data: dict, retries: int = 30, delay: int = 10):
91
- for i in range(retries):
92
- try:
93
- logger.info(f'Attempt {i + 1}: Checking service at {url} ...')
94
- response = requests.post(
95
- url, data=json.dumps(data), headers={'Content-Type': 'application/json'}, timeout=30)
96
- if response.status_code == 200:
97
- logger.info(f'Service at {url} is available !\n\n')
98
- return True
99
- else:
100
- logger.info(f'Service at {url} returned status code {response.status_code}.')
101
- except requests.exceptions.RequestException as e:
102
- logger.info(f'Attempt {i + 1}: An error occurred: {e}')
103
-
104
- time.sleep(delay)
105
-
106
- logger.info(f'Service at {url} is not available after {retries} retries.')
107
- return False
108
-
109
- @unittest.skipUnless(1 in test_level_list(), 'skip test in current test level')
110
- def test_run_task(self):
111
- # Prepare the config
112
- task_cfg = dict(
113
- eval_backend='OpenCompass',
114
- eval_config={
115
- 'datasets': ['cmb', 'bbh', 'ceval', 'ARC_e', 'gsm8k'],
116
- 'models': [
117
- {
118
- 'path': self.model_name,
119
- 'openai_api_base': DEFAULT_CHAT_MODEL_URL,
120
- 'batch_size': 8
121
- },
122
- ],
123
- 'work_dir': 'outputs/llama3_eval_result',
124
- 'reuse': None, # string, `latest` or timestamp, e.g. `20230516_144254`, default to None
125
- 'limit': 5, # string or int or float, e.g. `[2:5]`, 5, 5.0, default to None, it means run all examples
126
- },
127
- )
128
-
129
- # Check the service status
130
- data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
131
- assert self.check_service_status(
132
- DEFAULT_CHAT_MODEL_URL, data=data), f'Failed to check service status: {DEFAULT_CHAT_MODEL_URL}'
133
-
134
- # Submit the task
135
- logger.info(f'Start to run UT with cfg: {task_cfg}')
136
- run_task(task_cfg=task_cfg)
137
-
138
- # Get the final report with summarizer
139
- report_list = Summarizer.get_report_from_cfg(task_cfg)
140
- logger.info(f'>>The report list:\n{report_list}')
141
-
142
- assert len(report_list) > 0, f'Failed to get report list: {report_list}'
143
-
144
-
145
- if __name__ == '__main__':
146
- unittest.main()