evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
tests/cli/test_custom.py DELETED
@@ -1,268 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- from tests.utils import test_level_list
-
- env = dotenv_values('.env')
-
- import os
- import subprocess
- import unittest
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy, OutputType
- from evalscope.run import run_task
- from evalscope.utils.import_utils import is_module_installed
- from evalscope.utils.logger import get_logger
-
- os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-
- logger = get_logger()
-
-
- class TestRunCustom(unittest.TestCase):
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_custom_task(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='Qwen/Qwen3-0.6B',
-             datasets=[
-                 'general_mcq',
-                 'general_qa'
-             ],
-             dataset_args={
-                 'general_mcq': {
-                     'local_path': 'custom_eval/text/mcq', # path to the custom dataset
-                     'subset_list': [
-                         'example' # evaluation dataset name, the * in the *_dev.csv files mentioned above
-                     ],
-                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # question template
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa', # path to the custom dataset
-                     'subset_list': [
-                         'example' # evaluation dataset name, the * in the *_dev.csv files mentioned above
-                     ]
-                 }
-             },
-         )
-         res = run_task(task_cfg=task_cfg)
-         print(res)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_local_dataset(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key= env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 # 'mmlu',
-                 # 'race',
-                 'trivia_qa',
-                 # 'cmmlu',
-                 # 'humaneval',
-                 # 'gsm8k',
-                 # 'bbh',
-                 # 'competition_math',
-                 # 'arc',
-                 # 'ceval',
-             ],
-             dataset_args={
-                 'mmlu': {
-                     'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                     'few_shot_num': 0,
-                     'dataset_id': 'data/data/mmlu',
-                 },
-                 'ceval': {
-                     'subset_list': [
-                         'computer_network', 'operating_system', 'computer_architecture'
-                     ],
-                     'few_shot_num': 0,
-                     'dataset_id': 'data/data/ceval',
-                 },
-                 'cmmlu': {
-                     'subset_list': ['elementary_chinese'],
-                     'dataset_id': 'data/data/cmmlu',
-                     'few_shot_num': 0
-                 },
-                 'bbh': {
-                     'subset_list': ['word_sorting', 'movie_recommendation'],
-                 },
-                 'humaneval': {
-                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                 },
-                 'trivia_qa': {
-                     'dataset_id': 'data/data/trivia_qa',
-                 },
-             },
-             eval_batch_size=10,
-             limit=5,
-             debug=True,
-             stream=True,
-             generation_config={
-                 'temperature': 0,
-                 'n': 1,
-                 'max_tokens': 4096,
-             },
-             ignore_errors=False,
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_general_no_answer(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen2.5-7b-instruct',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key= env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 'general_qa',
-             ],
-             dataset_args={
-                 'general_qa': {
-                     'dataset_id': 'custom_eval/text/qa',
-                     'subset_list': [
-                         'arena',
-                         # 'example'
-                     ],
-                 }
-             },
-             eval_batch_size=10,
-             limit=10,
-             debug=True,
-             stream=True,
-             generation_config={
-                 'temperature': 0,
-                 'n': 1,
-                 'max_tokens': 4096,
-             },
-             ignore_errors=False,
-             judge_model_args={
-                 'model_id': 'qwen2.5-7b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096
-                 },
-                 'score_type': 'numeric',
-                 'prompt_template': """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
- Begin your evaluation by providing a short explanation. Be as objective as possible.
- After providing your explanation, you must rate the response on a scale of 0 (worst) to 100 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\"
-
- [Question]
- {question}
-
- [Response]
- {pred}
- """
-             },
-             judge_worker_num=5,
-             judge_strategy=JudgeStrategy.LLM,
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_general_with_answer(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key= env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 'general_qa',
-             ],
-             dataset_args={
-                 'general_qa': {
-                     'dataset_id': 'custom_eval/text/qa',
-                     'subset_list': [
-                         'example'
-                     ],
-                 }
-             },
-             eval_batch_size=10,
-             limit=10,
-             debug=True,
-             stream=True,
-             generation_config={
-                 'temperature': 0,
-                 'n': 1,
-                 'max_tokens': 4096,
-             },
-             ignore_errors=False,
-             judge_model_args={
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096
-                 },
-                 'score_type': 'pattern',
-             },
-             judge_worker_num=1,
-             judge_strategy=JudgeStrategy.LLM_RECALL,
-             use_cache='outputs/20250818_170420'
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_general_arena(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model_id='Arena',
-             datasets=[
-                 'general_arena',
-             ],
-             dataset_args={
-                 'general_arena': {
-                     'extra_params':{
-                         'models':[
-                             {
-                                 'name': 'qwen2.5-7b',
-                                 'report_path': 'outputs/20250819_165034/reports/qwen2.5-7b-instruct'
-                             },
-                             {
-                                 'name': 'qwen2.5-72b',
-                                 'report_path': 'outputs/20250819_164926/reports/qwen2.5-72b-instruct'
-                             }
-                         ],
-                         'baseline': 'qwen2.5-72b'
-                     }
-                 }
-             },
-             eval_batch_size=10,
-             limit=10,
-             debug=True,
-             stream=True,
-             ignore_errors=False,
-             judge_model_args={
-                 'model_id': 'qwen-plus',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 8000
-                 },
-             },
-             judge_worker_num=5,
-             # use_cache='outputs/20250819_173546'
-         )
-
-         run_task(task_cfg=task_cfg)
tests/cli/test_reasoning.py DELETED
@@ -1,81 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
-
- import unittest
- from unittest import TestCase
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy, OutputType
- from evalscope.run import run_task
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class TestReasoning(TestCase):
-     """Benchmark evaluation test cases."""
-
-     def setUp(self):
-         """Setup common test configuration."""
-         self.base_config = {
-             'model': 'Qwen3-0.6B',
-             'api_url': 'http://0.0.0.0:8801/v1',
-             'api_key': env.get('DASHSCOPE_API_KEY'),
-             'eval_type': EvalType.SERVICE,
-             'eval_batch_size': 5,
-             'limit': 5,
-             'generation_config': {
-                 'max_tokens': 4096,
-                 'temperature': 0.0,
-                 'seed': 42,
-                 'parallel_tool_calls': True,
-                 'extra_body':{'chat_template_kwargs': {'enable_thinking': False}} # disable thinking mode
-             },
-             'judge_strategy': JudgeStrategy.AUTO,
-             'judge_worker_num': 5,
-             'judge_model_args': {
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096,
-                 }
-             },
-             'debug': True,
-         }
-
-     def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-         """Helper method to run test for a specific dataset."""
-         config = self.base_config.copy()
-         config['datasets'] = [dataset_name]
-
-         if use_mock:
-             config['eval_type'] = EvalType.MOCK_LLM
-
-         # apply config overrides
-         config.update(config_overrides)
-
-         if dataset_args:
-             config['dataset_args'] = {dataset_name: dataset_args}
-
-         task_cfg = TaskConfig(**config)
-         run_task(task_cfg=task_cfg)
-
-     def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-         """Helper method to test dataset loading."""
-
-         self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
-
-     # Math & Reasoning datasets
-     def test_gsm8k(self):
-         """Test GSM8K math reasoning dataset."""
-         self._run_dataset_test('gsm8k')
-
-
- if __name__ == '__main__':
-     # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
-     # Run all tests: python -m unittest test_eval.TestBenchmark
-     unittest.main()
tests/common.py DELETED
@@ -1,73 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
-
- import unittest
- from unittest import TestCase
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy
- from evalscope.run import run_task
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class TestBenchmark(TestCase):
-     """Benchmark evaluation test cases."""
-
-     def setUp(self):
-         """Setup common test configuration."""
-         self.base_config = {
-             'model': 'qwen-plus',
-             'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-             'api_key': env.get('DASHSCOPE_API_KEY'),
-             'eval_type': EvalType.SERVICE,
-             'eval_batch_size': 5,
-             'limit': 5,
-             'generation_config': {
-                 'max_tokens': 4096,
-                 'temperature': 0.0,
-                 'seed': 42,
-                 'parallel_tool_calls': True
-             },
-             'judge_strategy': JudgeStrategy.AUTO,
-             'judge_worker_num': 5,
-             'judge_model_args': {
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096,
-                 }
-             },
-             'debug': True,
-         }
-
-     def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-         """Helper method to run test for a specific dataset."""
-         config = self.base_config.copy()
-         config['datasets'] = [dataset_name]
-
-         if not env.get('DASHSCOPE_API_KEY'):
-             use_mock = True
-             logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
-
-         if use_mock:
-             config['eval_type'] = EvalType.MOCK_LLM
-
-         # apply config overrides
-         config.update(config_overrides)
-
-         if dataset_args:
-             config['dataset_args'] = {dataset_name: dataset_args}
-
-         task_cfg = TaskConfig(**config)
-         run_task(task_cfg=task_cfg)
-
-     def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-         """Helper method to test dataset loading."""
-
-         self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
tests/perf/__init__.py DELETED
@@ -1 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
tests/perf/test_perf.py DELETED
@@ -1,206 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
- import unittest
-
- from evalscope.perf.main import run_perf_benchmark
- from tests.utils import test_level_list
-
-
- class TestPerf(unittest.TestCase):
-
-     def setUp(self) -> None:
-         pass
-
-     def tearDown(self) -> None:
-         pass
-
-
-     def test_run_perf(self):
-         task_cfg = {
-             'url': 'http://127.0.0.1:8001/v1/chat/completions',
-             'parallel': 1,
-             'model': 'qwen2.5',
-             'number': 15,
-             'api': 'openai',
-             'dataset': 'openqa',
-             # 'stream': True,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-
-     def test_run_perf_stream(self):
-         task_cfg = {
-             'url': 'http://127.0.0.1:8801/v1/chat/completions',
-             'parallel': 1,
-             'model': 'Qwen2.5-0.5B-Instruct',
-             'number': 15,
-             'api': 'openai',
-             'dataset': 'openqa',
-             'stream': True,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-
-     def test_run_perf_speed_benchmark(self):
-         task_cfg = {
-             'url': 'http://127.0.0.1:8001/v1/completions',
-             'parallel': 1,
-             'model': 'qwen2.5',
-             'api': 'openai',
-             'dataset': 'speed_benchmark',
-             'min_tokens': 2048,
-             'max_tokens': 2048,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-
-     def test_run_perf_local(self):
-         task_cfg = {
-             'parallel': 1,
-             'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-             'number': 5,
-             'api': 'local',
-             'dataset': 'openqa',
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-
-     def test_run_perf_local_stream(self):
-         task_cfg = {
-             'parallel': 1,
-             'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-             'number': 5,
-             'api': 'local',
-             'dataset': 'openqa',
-             'stream': True,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-
-     def test_run_perf_local_speed_benchmark(self):
-         task_cfg = {
-             'parallel': 1,
-             'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-             'api': 'local_vllm',
-             'dataset': 'speed_benchmark',
-             'min_tokens': 2048,
-             'max_tokens': 2048,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-
-     def test_run_perf_local_random(self):
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=20,
-             model='Qwen3-1.7B',
-             url='http://127.0.0.1:8801/v1/completions',
-             api='openai',
-             dataset='random',
-             min_tokens=1024,
-             max_tokens=1024,
-             prefix_length=0,
-             min_prompt_length=1024,
-             max_prompt_length=1024,
-             number=20,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
-     def test_run_completion_endpoint(self):
-         if not env.get('DASHSCOPE_API_KEY'):
-             self.skipTest('DASHSCOPE_API_KEY is not set.')
-             return
-
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=[1, 2],
-             number=[2, 4],
-             model='qwen2.5-coder-7b-instruct',
-             url='https://dashscope.aliyuncs.com/compatible-mode/v1/completions',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             api='openai',
-             dataset='random',
-             min_tokens=100,
-             max_tokens=100,
-             prefix_length=0,
-             min_prompt_length=1024,
-             max_prompt_length=1024,
-             stream=False,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
-
-     def test_run_perf_multi_parallel(self):
-         if not env.get('DASHSCOPE_API_KEY'):
-             self.skipTest('DASHSCOPE_API_KEY is not set.')
-             return
-
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=[1, 2],
-             number=[2, 4],
-             model='qwen-plus',
-             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             api='openai',
-             dataset='random',
-             min_tokens=100,
-             max_tokens=100,
-             prefix_length=0,
-             min_prompt_length=1024,
-             max_prompt_length=1024,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
-
-     def test_run_perf_random_vl(self):
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=[1, 2],
-             number=[2, 4],
-             model='qwen-vl-max',
-             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             api='openai',
-             dataset='random_vl',
-             min_tokens=100,
-             max_tokens=100,
-             prefix_length=0,
-             min_prompt_length=100,
-             max_prompt_length=100,
-             image_height=512,
-             image_width=512,
-             image_num=2,
-             tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
- if __name__ == '__main__':
-     unittest.main(buffer=False)