evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
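
Most of the churn in the test files below comes from two helper moves: test_level_list now lives in the new tests/utils.py instead of evalscope.utils, and is_module_installed is imported from evalscope.utils.import_utils. As a rough migration sketch (hypothetical caller code, not taken from the package; verify against the 0.17.0 source), existing imports would change along these lines:

# Before (0.16.2)
# from evalscope.utils import is_module_installed, test_level_list

# After (0.17.0), as used by the updated tests below
from evalscope.utils.import_utils import is_module_installed
from tests.utils import test_level_list  # test helper, only importable inside the evalscope source tree
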
tests/cli/test_custom.py ADDED
@@ -0,0 +1,261 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ from tests.utils import test_level_list
+
+ env = dotenv_values('.env')
+
+ import os
+ import subprocess
+ import unittest
+
+ from evalscope.config import TaskConfig
+ from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
+ from evalscope.run import run_task
+ from evalscope.utils.import_utils import is_module_installed
+ from evalscope.utils.logger import get_logger
+
+ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
+
+ logger = get_logger()
+
+
+ class TestRunCustom(unittest.TestCase):
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_custom_task(self):
+         from evalscope.config import TaskConfig
+
+         task_cfg = TaskConfig(
+             model='Qwen/Qwen3-0.6B',
+             datasets=[
+                 'general_mcq',
+                 'general_qa'
+             ],
+             dataset_args={
+                 'general_mcq': {
+                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
+                     'subset_list': [
+                         'example'  # evaluation subset name: the * in the *_dev.csv files above
+                     ],
+                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                 },
+                 'general_qa': {
+                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
+                     'subset_list': [
+                         'example'  # evaluation subset name: the * in the *_dev.csv files above
+                     ]
+                 }
+             },
+         )
+         res = run_task(task_cfg=task_cfg)
+         print(res)
+
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_local_dataset(self):
+         from evalscope.config import TaskConfig
+
+         task_cfg = TaskConfig(
+             model='qwen-plus',
+             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+             api_key=env.get('DASHSCOPE_API_KEY'),
+             eval_type=EvalType.SERVICE,
+             datasets=[
+                 # 'mmlu',
+                 # 'race',
+                 'trivia_qa',
+                 # 'cmmlu',
+                 # 'humaneval',
+                 # 'gsm8k',
+                 # 'bbh',
+                 # 'competition_math',
+                 # 'arc',
+                 # 'ceval',
+             ],
+             dataset_args={
+                 'mmlu': {
+                     'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
+                     'few_shot_num': 0,
+                     'dataset_id': 'data/data/mmlu',
+                 },
+                 'ceval': {
+                     'subset_list': [
+                         'computer_network', 'operating_system', 'computer_architecture'
+                     ],
+                     'few_shot_num': 0,
+                     'dataset_id': 'data/data/ceval',
+                 },
+                 'cmmlu': {
+                     'subset_list': ['elementary_chinese'],
+                     'dataset_id': 'data/data/cmmlu',
+                     'few_shot_num': 0
+                 },
+                 'bbh': {
+                     'subset_list': ['word_sorting', 'movie_recommendation'],
+                 },
+                 'humaneval': {
+                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+                 },
+                 'trivia_qa': {
+                     'dataset_id': 'data/data/trivia_qa',
+                 },
+             },
+             eval_batch_size=10,
+             limit=5,
+             debug=True,
+             stream=True,
+             generation_config={
+                 'temperature': 0,
+                 'n': 1,
+                 'max_tokens': 4096,
+             },
+             ignore_errors=False,
+         )
+
+         run_task(task_cfg=task_cfg)
+
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_general_no_answer(self):
+         from evalscope.config import TaskConfig
+
+         task_cfg = TaskConfig(
+             model='qwen2.5-72b-instruct',
+             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+             api_key=env.get('DASHSCOPE_API_KEY'),
+             eval_type=EvalType.SERVICE,
+             datasets=[
+                 'general_qa',
+             ],
+             dataset_args={
+                 'general_qa': {
+                     'dataset_id': 'custom_eval/text/qa',
+                     'subset_list': [
+                         'arena',
+                         'example'
+                     ],
+                 }
+             },
+             eval_batch_size=10,
+             limit=10,
+             debug=True,
+             stream=True,
+             generation_config={
+                 'temperature': 0,
+                 'n': 1,
+                 'max_tokens': 4096,
+             },
+             ignore_errors=False,
+             judge_model_args={
+                 'model_id': 'qwen2.5-72b-instruct',
+                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                 'api_key': env.get('DASHSCOPE_API_KEY'),
+                 'generation_config': {
+                     'temperature': 0.0,
+                     'max_tokens': 4096
+                 },
+                 'score_type': 'numeric',
+             },
+             judge_worker_num=5,
+             judge_strategy=JudgeStrategy.AUTO,
+         )
+
+         run_task(task_cfg=task_cfg)
+
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_general_with_answer(self):
+         from evalscope.config import TaskConfig
+
+         task_cfg = TaskConfig(
+             model='qwen-plus',
+             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+             api_key=env.get('DASHSCOPE_API_KEY'),
+             eval_type=EvalType.SERVICE,
+             datasets=[
+                 'general_qa',
+             ],
+             dataset_args={
+                 'general_qa': {
+                     'dataset_id': 'custom_eval/text/qa',
+                     'subset_list': [
+                         'example'
+                     ],
+                 }
+             },
+             eval_batch_size=10,
+             limit=10,
+             debug=True,
+             stream=True,
+             generation_config={
+                 'temperature': 0,
+                 'n': 1,
+                 'max_tokens': 4096,
+             },
+             ignore_errors=False,
+             judge_model_args={
+                 'model_id': 'qwen2.5-72b-instruct',
+                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                 'api_key': env.get('DASHSCOPE_API_KEY'),
+                 'generation_config': {
+                     'temperature': 0.0,
+                     'max_tokens': 4096
+                 },
+                 'score_type': 'pattern',
+             },
+             judge_worker_num=5,
+             judge_strategy=JudgeStrategy.LLM,
+         )
+
+         run_task(task_cfg=task_cfg)
+
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_general_arena(self):
+         from evalscope.config import TaskConfig
+
+         task_cfg = TaskConfig(
+             model_id='Arena',
+             datasets=[
+                 'general_arena',
+             ],
+             dataset_args={
+                 'general_arena': {
+                     'extra_params': {
+                         'models': [
+                             {
+                                 'name': 'qwen2.5-0.5b',
+                                 'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
+                             },
+                             {
+                                 'name': 'qwen2.5-7b',
+                                 'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
+                             },
+                             {
+                                 'name': 'qwen2.5-72b',
+                                 'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
+                             }
+                         ],
+                         'baseline': 'qwen2.5-7b'
+                     }
+                 }
+             },
+             eval_batch_size=10,
+             limit=10,
+             debug=True,
+             stream=True,
+             ignore_errors=False,
+             judge_model_args={
+                 'model_id': 'qwen-plus',
+                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                 'api_key': env.get('DASHSCOPE_API_KEY'),
+                 'generation_config': {
+                     'temperature': 0.0,
+                     'max_tokens': 8000
+                 },
+             },
+             judge_worker_num=5,
+             use_cache='outputs/20250702_165727'
+         )
+
+         run_task(task_cfg=task_cfg)
tests/cli/test_run.py CHANGED
@@ -1,6 +1,8 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  from dotenv import dotenv_values

+ from tests.utils import test_level_list
+
  env = dotenv_values('.env')

  import os
@@ -8,9 +10,9 @@ import subprocess
  import unittest

  from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy, OutputType
+ from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
  from evalscope.run import run_task
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger

  os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
@@ -182,35 +184,6 @@ class TestRun(unittest.TestCase):
          run_task(task_cfg=task_cfg)


-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_custom_task(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='Qwen/Qwen3-0.6B',
-             datasets=[
-                 'general_mcq',
-                 'general_qa'
-             ],
-             dataset_args={
-                 'general_mcq': {
-                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
-                     'subset_list': [
-                         'example'  # evaluation subset name: the * in the *_dev.csv files above
-                     ],
-                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                     'subset_list': [
-                         'example'  # evaluation subset name: the * in the *_dev.csv files above
-                     ]
-                 }
-             },
-         )
-         res = run_task(task_cfg=task_cfg)
-         print(res)
-
      @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
      def test_run_one_task(self):
          from evalscope.config import TaskConfig
@@ -286,7 +259,7 @@ class TestRun(unittest.TestCase):
              api_key=env.get('DASHSCOPE_API_KEY'),
              eval_type=EvalType.SERVICE,
              datasets=[
-                 # 'iquiz',
+                 'iquiz',
                  # 'ifeval',
                  # 'mmlu',
                  # 'mmlu_pro',
@@ -305,7 +278,7 @@ class TestRun(unittest.TestCase):
                  # 'arc',
                  # 'ceval',
                  # 'hellaswag',
-                 'general_mcq',
+                 # 'general_mcq',
                  # 'general_qa',
                  # 'super_gpqa',
                  # 'mmlu_redux',
@@ -315,6 +288,7 @@ class TestRun(unittest.TestCase):
                  # 'tool_bench',
                  # 'frames',
                  # 'bfcl_v3',
+                 # 'truthful_qa',
              ],
              dataset_args={
                  'mmlu': {
@@ -354,7 +328,6 @@ class TestRun(unittest.TestCase):
                  },
                  'musr': {
                      'subset_list': ['murder_mysteries'],
-                     'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
                  },
                  'general_mcq': {
                      'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
@@ -378,6 +351,9 @@
                  'mmlu_redux': {
                      'subset_list': ['abstract_algebra']
                  },
+                 'frames': {
+                     'local_path': 'data/iic/frames',
+                 },
                  'bfcl_v3': {
                      'subset_list': ['parallel'],
                      'extra_params': {
@@ -385,9 +361,9 @@
                      }
                  }
              },
-             eval_batch_size=10,
+             eval_batch_size=1,
              limit=5,
-             debug=True,
+             # debug=True,
              stream=True,
              generation_config={
                  'temperature': 0,
@@ -396,7 +372,6 @@
                  # 'extra_headers': {'key': 'value'},
              },
              ignore_errors=False,
-             # use_cache='outputs/20250616_153756'
          )

          run_task(task_cfg=task_cfg)
@@ -521,5 +496,6 @@

          run_task(task_cfg=task_cfg)

+
  if __name__ == '__main__':
      unittest.main()
tests/perf/test_perf.py CHANGED
@@ -7,7 +7,7 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
  import unittest

  from evalscope.perf.main import run_perf_benchmark
- from evalscope.utils import test_level_list
+ from tests.utils import test_level_list


  class TestPerf(unittest.TestCase):
@@ -126,7 +126,7 @@ class TestPerf(unittest.TestCase):
          from evalscope.perf.arguments import Arguments
          task_cfg = Arguments(
              parallel=[1, 2],
-             number=[2, 5],
+             number=[2, 4],
              model='qwen2.5-7b-instruct',
              url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
              api_key=env.get('DASHSCOPE_API_KEY'),
tests/rag/test_clip_benchmark.py CHANGED
@@ -6,8 +6,9 @@ import subprocess
  import unittest

  from evalscope.run import run_task
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger
+ from tests.utils import test_level_list

  logger = get_logger()

tests/rag/test_mteb.py CHANGED
@@ -3,9 +3,11 @@
  import unittest
  from dotenv import dotenv_values

+ from tests.utils import test_level_list
+
  env = dotenv_values('.env')
  from evalscope.run import run_task
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger

  logger = get_logger()
tests/rag/test_ragas.py CHANGED
@@ -2,11 +2,13 @@
  import os
  from dotenv import dotenv_values

+ from tests.utils import test_level_list
+
  env = dotenv_values('.env')
  import unittest

  from evalscope import TaskConfig, run_task
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger

  logger = get_logger()
tests/swift/test_run_swift_eval.py CHANGED
@@ -10,8 +10,9 @@ import unittest
  from evalscope.backend.opencompass import OpenCompassBackendManager
  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger
+ from tests.utils import test_level_list

  logger = get_logger(__name__)

tests/swift/test_run_swift_vlm_eval.py CHANGED
@@ -10,8 +10,9 @@ import unittest
  from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger
+ from tests.utils import test_level_list

  logger = get_logger(__name__)

tests/swift/test_run_swift_vlm_jugde_eval.py CHANGED
@@ -10,8 +10,9 @@ import unittest
  from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger
+ from tests.utils import test_level_list

  logger = get_logger(__name__)

tests/utils.py ADDED
@@ -0,0 +1,13 @@
+ import os
+
+ TEST_LEVEL_LIST = [0, 1]
+ # Example: export TEST_LEVEL_LIST=0,1
+ TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
+
+
+ def test_level_list():
+     global TEST_LEVEL_LIST
+     if TEST_LEVEL_LIST_STR in os.environ:
+         TEST_LEVEL_LIST = [int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')]
+
+     return TEST_LEVEL_LIST
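
The new tests/utils.py above centralizes the test-level gate imported by the reworked tests. A minimal usage sketch follows (the test class and method names are illustrative, not part of the package; it assumes the evalscope source tree is on the import path):

import unittest

from tests.utils import test_level_list  # honors `export TEST_LEVEL_LIST=0,1`


class ExampleGatedTest(unittest.TestCase):

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_runs_only_at_level_0(self):
        # Only executes when level 0 is enabled via TEST_LEVEL_LIST (default [0, 1]).
        self.assertIn(0, test_level_list())


if __name__ == '__main__':
    unittest.main()
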
tests/vlm/test_vlmeval.py CHANGED
@@ -1,12 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  from dotenv import dotenv_values

+ from tests.utils import test_level_list
+
  env = dotenv_values('.env')
  import unittest

  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
- from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -62,7 +64,11 @@ class TestVLMEval(unittest.TestCase):
          task_cfg = {
              'eval_backend': 'VLMEvalKit',
              'eval_config': {
-                 'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
+                 'data': [
+                     # 'SEEDBench_IMG',
+                     # 'ChartQA_TEST',
+                     'MMDU'
+                 ],
                  'limit': 5,
                  'mode': 'all',
                  'model': [
evalscope/evaluator/rating_eval.py DELETED
@@ -1,157 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- import pandas as pd
- import pyarrow as pa
- from typing import List, Union
-
- from evalscope.constants import MetricMembers
- from evalscope.utils.arena_utils import compute_elo
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
- DEFAULT_COLUMNS_MAPPING = {'model_a': 'model_a', 'model_b': 'model_b', 'win': 'win', 'tstamp': 'ts', 'language': 'lang'}
-
-
- class RatingEvaluate(object):
-
-     def __init__(self, metrics: list, baseline_model: str = None, **kwargs):
-         self.metrics = metrics
-         self.baseline_model = baseline_model
-         self.kwargs = kwargs
-
-     def preprocess(self, raw_data_df: pd.DataFrame, **kwargs):
-
-         # Get battles data
-         raw_data_df = raw_data_df.sort_values(ascending=True, by=['tstamp'])
-         battles = raw_data_df[raw_data_df['anony']].reset_index(drop=True)
-
-         return battles
-
-     def compute_elo_rating(self, raw_data):
-         battles = self.preprocess(raw_data_df=raw_data)
-         elo_ratings = compute_elo(battles)
-         col_model = 'Model'
-         col_elo_rating = 'Elo_Rating'
-         elo_ratings_res = pd.DataFrame([[n, elo_ratings[n]] for n in elo_ratings.keys()],
-                                        columns=[col_model, col_elo_rating]).sort_values(
-                                            col_elo_rating, ascending=False).reset_index(drop=True)
-         elo_ratings_res = elo_ratings_res.round({col_elo_rating: 1})
-         return elo_ratings_res
-
-     def get_single_pairwise_rating(self, row: pd.Series):
-         tie = False
-         if 'win' in row:
-             win = row['win']
-             if win == 'tie':
-                 tie = True
-             else:
-                 if win == 'model_a':
-                     winner = row['model_a']
-                     loser = row['model_b']
-                 else:
-                     winner = row['model_b']
-                     loser = row['model_a']
-         elif 'win_1' in row:
-             win_1 = row['win_1']
-             win_2 = row['win_2']
-             if win_1 == 'tie' or win_1 != win_2:
-                 tie = True
-             else:
-                 if win_1 == 'model_a':
-                     winner = row['model_a']
-                     loser = row['model_b']
-                 else:
-                     winner = row['model_b']
-                     loser = row['model_a']
-         else:
-             raise ValueError('Unsupported data format')
-
-         if tie:
-             return [{
-                 'model': row['model_a'],
-                 'win': 0,
-                 'loss': 0,
-                 'tie': 1
-             }, {
-                 'model': row['model_b'],
-                 'win': 0,
-                 'loss': 0,
-                 'tie': 1
-             }]
-         else:
-             return [{'model': winner, 'win': 1, 'loss': 0, 'tie': 0}, {'model': loser, 'win': 0, 'loss': 1, 'tie': 0}]
-
-     def compute_pairwise_rating(self, raw_data):
-         df_all = self.preprocess(raw_data_df=raw_data)
-         model_list = (df_all['model_a'].unique().tolist() + df_all['model_b'].unique().tolist())
-         model_list = list(set(model_list))
-
-         list_res = []
-         # traverse df row by row
-         for index, row in df_all.iterrows():
-             if self.baseline_model is not None:
-                 if self.baseline_model not in [row['model_a'], row['model_b']]:
-                     logger.warning(
-                         f'One of the models in the battle should be the baseline model: {self.baseline_model}')
-                     continue
-             rating = self.get_single_pairwise_rating(row)
-             list_res = list_res + rating
-
-         df = pd.DataFrame(list_res)
-         df = df.groupby(['model']).sum()
-
-         # remove baseline model
-         if self.baseline_model is not None:
-             df = df[df.index != self.baseline_model]
-         # add win rate
-         df['win_rate'] = df['win'] / (df['win'] + df['loss'] + df['tie'])
-         df['loss_rate'] = df['loss'] / (df['win'] + df['loss'] + df['tie'])
-         df['tie_rate'] = df['tie'] / (df['win'] + df['loss'] + df['tie'])
-         return df.sort_values(by='win_rate', ascending=False)
-
-     def compute_score_rating(self, raw_data):
-         df_all = self.preprocess(raw_data_df=raw_data)
-         df = df_all[['model', 'score']]
-
-         df_score = df.groupby(['model']).mean()
-         return df_score.sort_values(by='score', ascending=False)
-
-     def eval_samples(self, data_list: list):
-         res_all = []
-
-         raw_data: pd.DataFrame = None
-
-         if len(data_list) > 0:
-             raw_data = data_list[0]
-
-         for metric in self.metrics:
-
-             if metric == MetricMembers.ELO:
-                 res = self.compute_elo_rating(raw_data)
-                 res_all.append(res)
-
-             elif metric == MetricMembers.PAIRWISE:
-                 res = self.compute_pairwise_rating(raw_data)
-                 res_all.append(res)
-
-             elif metric == MetricMembers.SCORE:
-                 res = self.compute_score_rating(raw_data)
-                 res_all.append(res)
-
-             else:
-                 raise ValueError(f'Unsupported metric: {metric}')
-
-         return res_all
-
-     def run(self, prompts: Union[str, list], **kwargs) -> List[pd.DataFrame]:
-         """
-         Load the predicted samples and evaluate them in arena mode.
-         """
-         # raw_data = pd.read_json(prompts)
-         data_list = jsonl_to_list(prompts)
-         data_df = pa.Table.from_pylist(data_list).to_pandas()
-         res_list = self.eval_samples([data_df])
-
-         return res_list
@@ -1 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.