evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (157)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
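The most mechanical change running through the test files below is an import migration: is_module_installed now comes from evalscope.utils.import_utils, and test_level_list has moved out of evalscope.utils into a new tests/utils.py helper. A minimal before/after sketch of that migration, using only the import paths shown in the diffs that follow (note that tests.utils belongs to the repository's test suite, not the installed package):

# evalscope 0.16.3: both helpers were imported from evalscope.utils
# from evalscope.utils import is_module_installed, test_level_list

# evalscope 0.17.1: the same helpers, as imported in the updated tests below
from evalscope.utils.import_utils import is_module_installed
from tests.utils import test_level_list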
tests/cli/test_custom.py ADDED
@@ -0,0 +1,261 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ from tests.utils import test_level_list
+
+ env = dotenv_values('.env')
+
+ import os
+ import subprocess
+ import unittest
+
+ from evalscope.config import TaskConfig
+ from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
+ from evalscope.run import run_task
+ from evalscope.utils.import_utils import is_module_installed
+ from evalscope.utils.logger import get_logger
+
+ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
+
+ logger = get_logger()
+
+
+ class TestRunCustom(unittest.TestCase):
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_custom_task(self):
+ from evalscope.config import TaskConfig
+
+ task_cfg = TaskConfig(
+ model='Qwen/Qwen3-0.6B',
+ datasets=[
+ 'general_mcq',
+ 'general_qa'
+ ],
+ dataset_args={
+ 'general_mcq': {
+ 'local_path': 'custom_eval/text/mcq', # path to the custom dataset
+ 'subset_list': [
+ 'example' # evaluation dataset name, i.e. the * in *_dev.csv above
+ ],
+ 'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # question template
+ },
+ 'general_qa': {
+ 'local_path': 'custom_eval/text/qa', # path to the custom dataset
+ 'subset_list': [
+ 'example' # evaluation dataset name, i.e. the * in *_dev.csv above
+ ]
+ }
+ },
+ )
+ res = run_task(task_cfg=task_cfg)
+ print(res)
+
+
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_local_dataset(self):
+ from evalscope.config import TaskConfig
+
+ task_cfg = TaskConfig(
+ model='qwen-plus',
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+ api_key= env.get('DASHSCOPE_API_KEY'),
+ eval_type=EvalType.SERVICE,
+ datasets=[
+ # 'mmlu',
+ # 'race',
+ 'trivia_qa',
+ # 'cmmlu',
+ # 'humaneval',
+ # 'gsm8k',
+ # 'bbh',
+ # 'competition_math',
+ # 'arc',
+ # 'ceval',
+ ],
+ dataset_args={
+ 'mmlu': {
+ 'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
+ 'few_shot_num': 0,
+ 'dataset_id': 'data/data/mmlu',
+ },
+ 'ceval': {
+ 'subset_list': [
+ 'computer_network', 'operating_system', 'computer_architecture'
+ ],
+ 'few_shot_num': 0,
+ 'dataset_id': 'data/data/ceval',
+ },
+ 'cmmlu': {
+ 'subset_list': ['elementary_chinese'],
+ 'dataset_id': 'data/data/cmmlu',
+ 'few_shot_num': 0
+ },
+ 'bbh': {
+ 'subset_list': ['word_sorting', 'movie_recommendation'],
+ },
+ 'humaneval': {
+ 'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+ },
+ 'trivia_qa': {
+ 'dataset_id': 'data/data/trivia_qa',
+ },
+ },
+ eval_batch_size=10,
+ limit=5,
+ debug=True,
+ stream=True,
+ generation_config={
+ 'temperature': 0,
+ 'n': 1,
+ 'max_tokens': 4096,
+ },
+ ignore_errors=False,
+ )
+
+ run_task(task_cfg=task_cfg)
+
+
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_general_no_answer(self):
+ from evalscope.config import TaskConfig
+
+ task_cfg = TaskConfig(
+ model='qwen2.5-72b-instruct',
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+ api_key= env.get('DASHSCOPE_API_KEY'),
+ eval_type=EvalType.SERVICE,
+ datasets=[
+ 'general_qa',
+ ],
+ dataset_args={
+ 'general_qa': {
+ 'dataset_id': 'custom_eval/text/qa',
+ 'subset_list': [
+ 'arena',
+ 'example'
+ ],
+ }
+ },
+ eval_batch_size=10,
+ limit=10,
+ debug=True,
+ stream=True,
+ generation_config={
+ 'temperature': 0,
+ 'n': 1,
+ 'max_tokens': 4096,
+ },
+ ignore_errors=False,
+ judge_model_args={
+ 'model_id': 'qwen2.5-72b-instruct',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'generation_config': {
+ 'temperature': 0.0,
+ 'max_tokens': 4096
+ },
+ 'score_type': 'numeric',
+ },
+ judge_worker_num=5,
+ judge_strategy=JudgeStrategy.AUTO,
+ )
+
+ run_task(task_cfg=task_cfg)
+
+
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_general_with_answer(self):
+ from evalscope.config import TaskConfig
+
+ task_cfg = TaskConfig(
+ model='qwen-plus',
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+ api_key= env.get('DASHSCOPE_API_KEY'),
+ eval_type=EvalType.SERVICE,
+ datasets=[
+ 'general_qa',
+ ],
+ dataset_args={
+ 'general_qa': {
+ 'dataset_id': 'custom_eval/text/qa',
+ 'subset_list': [
+ 'example'
+ ],
+ }
+ },
+ eval_batch_size=10,
+ limit=10,
+ debug=True,
+ stream=True,
+ generation_config={
+ 'temperature': 0,
+ 'n': 1,
+ 'max_tokens': 4096,
+ },
+ ignore_errors=False,
+ judge_model_args={
+ 'model_id': 'qwen2.5-72b-instruct',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'generation_config': {
+ 'temperature': 0.0,
+ 'max_tokens': 4096
+ },
+ 'score_type': 'pattern',
+ },
+ judge_worker_num=5,
+ judge_strategy=JudgeStrategy.LLM,
+ )
+
+ run_task(task_cfg=task_cfg)
+
+
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_general_arena(self):
+ from evalscope.config import TaskConfig
+
+ task_cfg = TaskConfig(
+ model_id='Arena',
+ datasets=[
+ 'general_arena',
+ ],
+ dataset_args={
+ 'general_arena': {
+ 'extra_params':{
+ 'models':[
+ {
+ 'name': 'qwen2.5-0.5b',
+ 'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
+ },
+ {
+ 'name': 'qwen2.5-7b',
+ 'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
+ },
+ {
+ 'name': 'qwen2.5-72b',
+ 'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
+ }
+ ],
+ 'baseline': 'qwen2.5-7b'
+ }
+ }
+ },
+ eval_batch_size=10,
+ limit=10,
+ debug=True,
+ stream=True,
+ ignore_errors=False,
+ judge_model_args={
+ 'model_id': 'qwen-plus',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'generation_config': {
+ 'temperature': 0.0,
+ 'max_tokens': 8000
+ },
+ },
+ judge_worker_num=5,
+ use_cache='outputs/20250702_165727'
+ )
+
+ run_task(task_cfg=task_cfg)
tests/cli/test_run.py CHANGED
@@ -1,6 +1,8 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  from dotenv import dotenv_values

+ from tests.utils import test_level_list
+
  env = dotenv_values('.env')

  import os
@@ -8,9 +10,9 @@ import subprocess
  import unittest

  from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy, OutputType
+ from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
  from evalscope.run import run_task
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger

  os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
@@ -182,35 +184,6 @@ class TestRun(unittest.TestCase):
  run_task(task_cfg=task_cfg)


- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
- def test_run_custom_task(self):
- from evalscope.config import TaskConfig
-
- task_cfg = TaskConfig(
- model='Qwen/Qwen3-0.6B',
- datasets=[
- 'general_mcq',
- 'general_qa'
- ],
- dataset_args={
- 'general_mcq': {
- 'local_path': 'custom_eval/text/mcq', # path to the custom dataset
- 'subset_list': [
- 'example' # evaluation dataset name, i.e. the * in *_dev.csv above
- ],
- 'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # question template
- },
- 'general_qa': {
- 'local_path': 'custom_eval/text/qa', # path to the custom dataset
- 'subset_list': [
- 'example' # evaluation dataset name, i.e. the * in *_dev.csv above
- ]
- }
- },
- )
- res = run_task(task_cfg=task_cfg)
- print(res)
-
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_run_one_task(self):
  from evalscope.config import TaskConfig
@@ -293,7 +266,7 @@ class TestRun(unittest.TestCase):
  # 'musr',
  # 'process_bench',
  # 'race',
- # 'trivia_qa',
+ 'trivia_qa',
  # 'cmmlu',
  # 'humaneval',
  # 'gsm8k',
@@ -306,7 +279,7 @@ class TestRun(unittest.TestCase):
  # 'ceval',
  # 'hellaswag',
  # 'general_mcq',
- 'general_qa',
+ # 'general_qa',
  # 'super_gpqa',
  # 'mmlu_redux',
  # 'maritime_bench',
@@ -315,6 +288,9 @@ class TestRun(unittest.TestCase):
  # 'tool_bench',
  # 'frames',
  # 'bfcl_v3',
+ # 'truthful_qa',
+ # 'tau_bench',
+ # 'hle'
  ],
  dataset_args={
  'mmlu': {
@@ -323,7 +299,7 @@ class TestRun(unittest.TestCase):
  },
  'mmlu_pro': {
  'subset_list': ['math', 'health'],
- 'few_shot_num': 4
+ 'few_shot_num': 0
  },
  'ceval': {
  'subset_list': [
@@ -354,7 +330,6 @@ class TestRun(unittest.TestCase):
  },
  'musr': {
  'subset_list': ['murder_mysteries'],
- 'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
  },
  'general_mcq': {
  'local_path': 'custom_eval/text/mcq', # path to the custom dataset
@@ -378,59 +353,42 @@ class TestRun(unittest.TestCase):
  'mmlu_redux':{
  'subset_list': ['abstract_algebra']
  },
+ 'frames':{
+ 'local_path': 'data/iic/frames',
+ },
  'bfcl_v3': {
  'subset_list': ['parallel'],
  'extra_params': {
  # 'is_fc_model': False,
  }
  },
+ 'tau_bench': {
+ 'extra_params': {
+ 'user_model': 'qwen-plus',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ }
+ },
+ 'hle': {
+ 'subset_list': ['Math', 'Other'],
+ },
  },
  eval_batch_size=10,
- limit=5,
- debug=True,
+ limit=10,
+ # debug=True,
  stream=True,
  generation_config={
- 'temperature': 0,
+ 'temperature': 0.6,
  'n': 1,
  'max_tokens': 4096,
  # 'extra_headers':{'key': 'value'},
  },
  ignore_errors=False,
- use_cache='outputs/test_2'
  )

  run_task(task_cfg=task_cfg)


- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
- def test_run_batch_eval(self):
- from evalscope.config import TaskConfig
-
- task_cfg = TaskConfig(
- model='LLM-Research/Llama-3.2-1B-Instruct',
- datasets=[
- # 'math_500',
- # 'aime24',
- # 'competition_math'
- # 'arc',
- 'gsm8k'
- # 'truthful_qa'
- ],
- dataset_args={
- 'competition_math': {
- 'subset_list': ['Level 4', 'Level 5']
- }
- },
- eval_batch_size=2,
- limit=5,
- generation_config={
- 'max_new_tokens': 2048,
- 'temperature': 0.7,
- 'num_return_sequences': 2,
- }
- )
-
- run_task(task_cfg=task_cfg)

  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_run_judge_model(self):
@@ -442,7 +400,7 @@ class TestRun(unittest.TestCase):
  api_key= env.get('DASHSCOPE_API_KEY'),
  eval_type=EvalType.SERVICE,
  datasets=[
- 'math_500',
+ # 'math_500',
  # 'aime24',
  # 'competition_math',
  # 'arc',
@@ -459,6 +417,7 @@ class TestRun(unittest.TestCase):
  # 'docmath',
  # 'needle_haystack',
  # 'ifeval',
+ 'hle'
  ],
  dataset_args={
  'needle_haystack': {
@@ -491,7 +450,10 @@ class TestRun(unittest.TestCase):
  },
  'frames': {
  'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
- }
+ },
+ 'hle': {
+ 'subset_list': ['Math', 'Other'],
+ },
  },
  eval_batch_size=10,
  limit=3,
@@ -514,6 +476,7 @@ class TestRun(unittest.TestCase):
  },
  timeout=60000,
  stream=True,
+ use_cache='outputs/20250714_150626'
  # analysis_report=True,
  # debug=True,
  # use_cache='outputs/20250616_161931'
@@ -521,5 +484,6 @@ class TestRun(unittest.TestCase):
  run_task(task_cfg=task_cfg)


+
  if __name__ == '__main__':
  unittest.main()
tests/perf/test_perf.py CHANGED
@@ -7,7 +7,7 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
  import unittest

  from evalscope.perf.main import run_perf_benchmark
- from evalscope.utils import test_level_list
+ from tests.utils import test_level_list


  class TestPerf(unittest.TestCase):
@@ -35,9 +35,9 @@ class TestPerf(unittest.TestCase):
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_run_perf_stream(self):
  task_cfg = {
- 'url': 'http://127.0.0.1:8000/v1/chat/completions',
+ 'url': 'http://127.0.0.1:8801/v1/chat/completions',
  'parallel': 1,
- 'model': 'qwen2.5',
+ 'model': 'Qwen2.5-0.5B-Instruct',
  'number': 15,
  'api': 'openai',
  'dataset': 'openqa',
@@ -126,7 +126,7 @@ class TestPerf(unittest.TestCase):
  from evalscope.perf.arguments import Arguments
  task_cfg = Arguments(
  parallel=[1, 2],
- number=[2, 5],
+ number=[2, 4],
  model='qwen2.5-7b-instruct',
  url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
  api_key=env.get('DASHSCOPE_API_KEY'),
@@ -145,5 +145,32 @@ class TestPerf(unittest.TestCase):
  print(metrics_result)
  print(percentile_result)

+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_perf_random_vl(self):
+ from evalscope.perf.arguments import Arguments
+ task_cfg = Arguments(
+ parallel=[1, 2],
+ number=[2, 4],
+ model='qwen-vl-max',
+ url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+ api_key=env.get('DASHSCOPE_API_KEY'),
+ api='openai',
+ dataset='kontext_bench',
+ min_tokens=100,
+ max_tokens=100,
+ prefix_length=0,
+ min_prompt_length=100,
+ max_prompt_length=100,
+ image_height=512,
+ image_width=512,
+ image_num=2,
+ tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+ seed=None,
+ extra_args={'ignore_eos': True}
+ )
+ metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+ print(metrics_result)
+ print(percentile_result)
+
  if __name__ == '__main__':
  unittest.main(buffer=False)
tests/rag/test_clip_benchmark.py CHANGED
@@ -6,8 +6,9 @@ import subprocess
  import unittest

  from evalscope.run import run_task
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger
+ from tests.utils import test_level_list

  logger = get_logger()

tests/rag/test_mteb.py CHANGED
@@ -3,9 +3,11 @@
  import unittest
  from dotenv import dotenv_values

+ from tests.utils import test_level_list
+
  env = dotenv_values('.env')
  from evalscope.run import run_task
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger

  logger = get_logger()
tests/rag/test_ragas.py CHANGED
@@ -2,11 +2,13 @@
  import os
  from dotenv import dotenv_values

+ from tests.utils import test_level_list
+
  env = dotenv_values('.env')
  import unittest

  from evalscope import TaskConfig, run_task
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger

  logger = get_logger()
tests/swift/test_run_swift_eval.py CHANGED
@@ -10,8 +10,9 @@ import unittest
  from evalscope.backend.opencompass import OpenCompassBackendManager
  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger
+ from tests.utils import test_level_list

  logger = get_logger(__name__)

tests/swift/test_run_swift_vlm_eval.py CHANGED
@@ -10,8 +10,9 @@ import unittest
  from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger
+ from tests.utils import test_level_list

  logger = get_logger(__name__)

tests/swift/test_run_swift_vlm_jugde_eval.py CHANGED
@@ -10,8 +10,9 @@ import unittest
  from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger
+ from tests.utils import test_level_list

  logger = get_logger(__name__)

tests/utils.py ADDED
@@ -0,0 +1,13 @@
+ import os
+
+ TEST_LEVEL_LIST = [0, 1]
+ # Example: export TEST_LEVEL_LIST=0,1
+ TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
+
+
+ def test_level_list():
+ global TEST_LEVEL_LIST
+ if TEST_LEVEL_LIST_STR in os.environ:
+ TEST_LEVEL_LIST = [int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')]
+
+ return TEST_LEVEL_LIST
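The helper above gates tests by level via the TEST_LEVEL_LIST environment variable, defaulting to levels [0, 1]. A minimal usage sketch mirroring the skipUnless pattern used throughout this diff; the class and method names here are illustrative, not part of the package:

import os
import unittest

from tests.utils import test_level_list

# Only run level-0 tests; without this variable the default is [0, 1].
os.environ['TEST_LEVEL_LIST'] = '0'


class ExampleLevelGatedTest(unittest.TestCase):

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_smoke(self):
        # Placeholder assertion; real tests put their evaluation logic here.
        self.assertTrue(True)


if __name__ == '__main__':
    unittest.main()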
tests/vlm/test_vlmeval.py CHANGED
@@ -1,12 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  from dotenv import dotenv_values

+ from tests.utils import test_level_list
+
  env = dotenv_values('.env')
  import unittest

  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -62,7 +64,11 @@ class TestVLMEval(unittest.TestCase):
  task_cfg = {
  'eval_backend': 'VLMEvalKit',
  'eval_config': {
- 'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
+ 'data': [
+ # 'SEEDBench_IMG',
+ # 'ChartQA_TEST',
+ 'MMDU'
+ ],
  'limit': 5,
  'mode': 'all',
  'model': [