evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff covers publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (85)
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/aime/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +3 -3
  4. evalscope/benchmarks/arc/arc_adapter.py +15 -18
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
  6. evalscope/benchmarks/benchmark.py +12 -11
  7. evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
  8. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  9. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  10. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
  11. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
  12. evalscope/benchmarks/data_adapter.py +59 -21
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
  16. evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
  17. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
  18. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
  20. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
  21. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  22. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  23. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  24. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  25. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  26. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  27. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  28. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  29. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  30. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  31. evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
  32. evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
  33. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
  34. evalscope/benchmarks/musr/musr_adapter.py +8 -5
  35. evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
  36. evalscope/benchmarks/race/race_adapter.py +12 -16
  37. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  38. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
  39. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  40. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  41. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  42. evalscope/benchmarks/super_gpqa/utils.py +85 -0
  43. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  45. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
  46. evalscope/benchmarks/utils.py +43 -0
  47. evalscope/collections/evaluator.py +14 -5
  48. evalscope/config.py +15 -2
  49. evalscope/constants.py +14 -0
  50. evalscope/evaluator/evaluator.py +51 -13
  51. evalscope/metrics/llm_judge.py +104 -0
  52. evalscope/metrics/named_metrics.py +1 -0
  53. evalscope/models/__init__.py +2 -1
  54. evalscope/models/base_adapter.py +25 -5
  55. evalscope/models/chat_adapter.py +3 -0
  56. evalscope/models/choice_adapter.py +4 -0
  57. evalscope/models/custom_adapter.py +2 -0
  58. evalscope/models/register.py +28 -0
  59. evalscope/models/server_adapter.py +35 -8
  60. evalscope/perf/arguments.py +13 -7
  61. evalscope/perf/benchmark.py +5 -0
  62. evalscope/perf/http_client.py +15 -5
  63. evalscope/perf/main.py +1 -0
  64. evalscope/perf/utils/analysis_result.py +1 -1
  65. evalscope/report/app.py +3 -0
  66. evalscope/report/combinator.py +2 -2
  67. evalscope/run.py +6 -5
  68. evalscope/third_party/longbench_write/infer.py +1 -1
  69. evalscope/third_party/thinkbench/eval.py +220 -55
  70. evalscope/third_party/thinkbench/infer.py +37 -7
  71. evalscope/third_party/thinkbench/tools/llm.py +1 -0
  72. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  73. evalscope/utils/chat_service.py +1 -0
  74. evalscope/utils/filters.py +59 -0
  75. evalscope/utils/logger.py +3 -3
  76. evalscope/version.py +2 -2
  77. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
  78. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
  79. tests/cli/test_all.py +144 -0
  80. tests/cli/test_collection.py +28 -2
  81. tests/cli/test_run.py +201 -32
  82. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
  83. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
  84. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
  85. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
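
The updated test suite below no longer hard-codes credentials; it reads them from a local .env file via python-dotenv (dotenv_values and DASHSCOPE_API_KEY both appear in the diff). A minimal sketch of that pattern, assuming python-dotenv is installed and a .env file sits next to the tests; the error handling is illustrative, not part of the package:

# Sketch of the .env-based credential handling used by the updated tests.
from dotenv import dotenv_values

env = dotenv_values('.env')  # e.g. a file containing: DASHSCOPE_API_KEY=sk-...
api_key = env.get('DASHSCOPE_API_KEY')
if api_key is None:
    raise RuntimeError('DASHSCOPE_API_KEY is not set in .env')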
tests/cli/test_run.py CHANGED
@@ -1,10 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
  import os
  import subprocess
- import torch
  import unittest

- from evalscope.constants import EvalType
+ from evalscope.config import TaskConfig
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
  from evalscope.run import run_task
  from evalscope.utils import is_module_installed, test_level_list
  from evalscope.utils.logger import get_logger
@@ -71,21 +75,104 @@ class TestRun(unittest.TestCase):

      @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
      def test_run_task(self):
-         task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
-                     'datasets': [
-                         # 'mmlu_pro',
-                         # 'bbh',
-                         # 'hellaswag',
-                         'gsm8k',
-                         # 'arc',
-                         # 'race',
-                         # 'ifeval',
-                         # 'truthful_qa',
-                         # 'trivia_qa',
-                     ],
-                     'limit': 2,
-                     'eval_batch_size': 2,
-                     'debug': True}
+         task_cfg = TaskConfig(
+             model='qwen/Qwen2.5-0.5B-Instruct',
+             datasets=[
+                 'iquiz',
+                 # 'ifeval',
+                 # 'mmlu',
+                 # 'mmlu_pro',
+                 # 'musr',
+                 # 'process_bench',
+                 # 'race',
+                 # 'trivia_qa',
+                 # 'cmmlu',
+                 # 'humaneval',
+                 # 'super_gpqa',
+                 # 'gsm8k',
+                 # 'bbh',
+                 # 'competition_math',
+                 # 'math_500',
+                 'aime24',
+                 'gpqa',
+                 # 'arc',
+                 # 'ceval',
+                 # 'hellaswag',
+                 # 'general_mcq',
+                 # 'general_qa'
+             ],
+             dataset_args={
+                 'mmlu': {
+                     'subset_list': ['elementary_mathematics'],
+                     'few_shot_num': 0
+                 },
+                 'mmlu_pro': {
+                     'subset_list': ['math', 'health'],
+                     'few_shot_num': 4
+                 },
+                 'ceval': {
+                     'subset_list': [
+                         'computer_network', 'operating_system', 'computer_architecture'
+                     ],
+                     'few_shot_num': 0
+                 },
+                 'cmmlu': {
+                     'subset_list': ['elementary_chinese'],
+                     'few_shot_num': 0
+                 },
+                 'bbh': {
+                     'subset_list': ['word_sorting', 'movie_recommendation'],
+                 },
+                 'gpqa': {
+                     'subset_list': ['gpqa_diamond'],
+                     'few_shot_num': 0
+                 },
+                 'humaneval': {
+                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+                 },
+                 'competition_math': {
+                     'subset_list': ['Level 1']
+                 },
+                 'process_bench': {
+                     'subset_list': ['gsm8k'],
+                 },
+                 'musr': {
+                     'subset_list': ['murder_mysteries']
+                 },
+                 'general_mcq': {
+                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
+                     'subset_list': [
+                         'example'  # dataset name, i.e. the * in the *_dev.csv files mentioned above
+                     ],
+                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                 },
+                 'general_qa': {
+                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
+                     'subset_list': [
+                         'example',  # dataset name, i.e. the * in the *_dev.csv files mentioned above
+                         # 'test'
+                     ],
+                     'metric_list': ['AverageBLEU']
+                 },
+                 'super_gpqa': {
+                     'subset_list': ['Philosophy', 'Education'],
+                     'few_shot_num': 0
+                 },
+                 'ifeval': {
+                     'filters': {
+                         'remove_until': '</think>'
+                     }
+                 }
+             },
+             limit=2,
+             eval_batch_size=2,
+             generation_config={
+                 'max_new_tokens': 2048,
+                 'temperature': 0.7,
+                 'num_return_sequences': 1,
+             },
+             # debug=True
+         )

          run_task(task_cfg=task_cfg)


@@ -141,12 +228,12 @@ class TestRun(unittest.TestCase):
          from evalscope.config import TaskConfig

          task_cfg = TaskConfig(
-             model='Qwen2.5-0.5B-Instruct',
-             api_url='http://127.0.0.1:8801/v1',
-             api_key='EMPTY',
+             model='qwen2.5-7b-instruct',
+             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+             api_key=env.get('DASHSCOPE_API_KEY'),
              eval_type=EvalType.SERVICE,
              datasets=[
-                 'iquiz',
+                 # 'iquiz',
                  # 'ifeval',
                  # 'mmlu',
                  # 'mmlu_pro',
@@ -164,11 +251,14 @@ class TestRun(unittest.TestCase):
                  # 'gpqa',
                  # 'arc',
                  # 'ceval',
-                 # 'hellaswag',
+                 'hellaswag',
+                 # 'general_mcq',
+                 # 'general_qa'
+                 # 'super_gpqa',
              ],
              dataset_args={
                  'mmlu': {
-                     'subset_list': ['elementary_mathematics'],
+                     'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
                      'few_shot_num': 0
                  },
                  'mmlu_pro': {
@@ -189,8 +279,9 @@ class TestRun(unittest.TestCase):
                      'subset_list': ['word_sorting', 'movie_recommendation'],
                  },
                  'gpqa': {
-                     'subset_list': ['gpqa_diamond'],
-                     'few_shot_num': 0
+                     # 'subset_list': ['gpqa_diamond'],
+                     'few_shot_num': 0,
+                     'local_path': './data/data/gpqa',
                  },
                  'humaneval': {
                      'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
@@ -204,17 +295,36 @@ class TestRun(unittest.TestCase):
                  'musr': {
                      'subset_list': ['murder_mysteries']
                  },
+                 'general_mcq': {
+                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
+                     'subset_list': [
+                         'example'  # dataset name, i.e. the * in the *_dev.csv files mentioned above
+                     ],
+                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                 },
+                 'general_qa': {
+                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
+                     'subset_list': [
+                         'example',  # dataset name, i.e. the * in the *_dev.csv files mentioned above
+                         # 'test'
+                     ],
+                     'metric_list': ['AverageBLEU']
+                 },
+                 'super_gpqa': {
+                     # 'subset_list': ['Philosophy', 'Education'],
+                     'few_shot_num': 0
+                 }
              },
-             eval_batch_size=5,
-             limit=5,
-             debug=True,
-             stream=True,
+             eval_batch_size=32,
+             limit=15,
+             # debug=True,
+             stream=False,
              generation_config={
-                 'temperature': 0.7,
+                 'temperature': 0,
                  'n': 1,
-                 'max_tokens': 512,
+                 'max_tokens': 4096,
              },
-             # use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525',
+             # use_cache='./outputs/20250212_150525',
          )

          run_task(task_cfg=task_cfg)
@@ -250,5 +360,64 @@ class TestRun(unittest.TestCase):

          run_task(task_cfg=task_cfg)

+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_judge_model(self):
+         from evalscope.config import TaskConfig
+
+         task_cfg = TaskConfig(
+             model='qwen2.5-7b-instruct',
+             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+             api_key=env.get('DASHSCOPE_API_KEY'),
+             eval_type=EvalType.SERVICE,
+             datasets=[
+                 # 'math_500',
+                 'aime24',
+                 # 'competition_math',
+                 # 'arc',
+                 # 'gsm8k'
+                 # 'truthful_qa',
+                 # 'simple_qa',
+                 # # 'chinese_simpleqa',
+                 # 'live_code_bench',
+                 # 'humaneval'
+                 # 'general_qa'
+             ],
+             dataset_args={
+                 'competition_math': {
+                     'subset_list': ['Level 4']
+                 },
+                 'live_code_bench': {
+                     'subset_list': ['v4_v5'],
+                     'extra_params': {
+                         'start_date': '2024-12-01',
+                         'end_date': '2025-01-01'
+                     },
+                     'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
+                 },
+                 'general_qa': {
+                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
+                     'subset_list': [
+                         'example',  # dataset name, i.e. the * in the *_dev.csv files mentioned above
+                         # 'test'
+                     ]
+                 },
+             },
+             eval_batch_size=5,
+             limit=5,
+             judge_strategy=JudgeStrategy.AUTO,
+             judge_model_args={
+                 'model_id': 'qwen2.5-7b-instruct',
+                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                 'api_key': env.get('DASHSCOPE_API_KEY'),
+             },
+             generation_config={
+                 'max_new_tokens': 2048,
+                 'temperature': 0.0,
+                 'seed': 42,
+             }
+         )
+
+         run_task(task_cfg=task_cfg)
+
  if __name__ == '__main__':
      unittest.main()
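
For reference, the judge-model path added in 0.13.0 (evalscope/metrics/llm_judge.py, JudgeStrategy in evalscope/constants.py) is exercised by test_run_judge_model above. A minimal standalone sketch of the same configuration follows; the model ID, endpoint, dataset choice, and .env handling are placeholders taken from the test, not a prescribed setup, and presumably any OpenAI-compatible endpoint works the same way:

# Sketch of a service-mode evaluation with an LLM judge, mirroring test_run_judge_model.
from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.run import run_task

env = dotenv_values('.env')  # expects DASHSCOPE_API_KEY=... in .env

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=env.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['aime24'],
    limit=5,
    judge_strategy=JudgeStrategy.AUTO,  # let evalscope decide when the judge model is needed
    judge_model_args={
        'model_id': 'qwen2.5-7b-instruct',
        'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'api_key': env.get('DASHSCOPE_API_KEY'),
    },
    generation_config={'max_new_tokens': 2048, 'temperature': 0.0, 'seed': 42},
)

run_task(task_cfg=task_cfg)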