evalscope 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (45)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/utils/llm.py +4 -5
  3. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  5. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  6. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  7. evalscope/benchmarks/arena_hard/utils.py +162 -0
  8. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  10. evalscope/benchmarks/data_adapter.py +26 -2
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
  13. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  14. evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
  15. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  16. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  17. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  18. evalscope/collections/evaluator.py +1 -1
  19. evalscope/config.py +6 -3
  20. evalscope/constants.py +1 -0
  21. evalscope/evaluator/evaluator.py +5 -4
  22. evalscope/metrics/llm_judge.py +1 -1
  23. evalscope/models/chat_adapter.py +32 -11
  24. evalscope/models/custom_adapter.py +1 -1
  25. evalscope/perf/arguments.py +19 -46
  26. evalscope/perf/benchmark.py +64 -90
  27. evalscope/perf/main.py +1 -1
  28. evalscope/perf/plugin/api/openai_api.py +4 -2
  29. evalscope/perf/plugin/datasets/__init__.py +1 -0
  30. evalscope/perf/plugin/datasets/openqa.py +6 -11
  31. evalscope/perf/plugin/datasets/random_dataset.py +51 -0
  32. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  33. evalscope/perf/utils/db_util.py +5 -2
  34. evalscope/run.py +14 -2
  35. evalscope/version.py +2 -2
  36. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/METADATA +42 -78
  37. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/RECORD +45 -37
  38. tests/cli/test_all.py +33 -24
  39. tests/cli/test_run.py +69 -22
  40. tests/perf/test_perf.py +23 -0
  41. tests/rag/test_ragas.py +4 -1
  42. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
  43. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
  44. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
  45. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
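
Beyond the per-file diffs below, the list adds three benchmark adapters (alpaca_eval, arena_hard, mmlu_redux) and a random perf dataset plugin. A minimal sketch of selecting one of the new benchmarks through TaskConfig, mirroring the updated tests further down; the model name, subset choice, and limit are illustrative placeholders, not part of the release, and the judged benchmarks (alpaca_eval, arena_hard) additionally need a judge model as shown in the test_run.py diff:

    from evalscope.config import TaskConfig
    from evalscope.run import run_task

    # Illustrative only: 'mmlu_redux' comes from the new adapter in this release;
    # the model and subset below are placeholder choices.
    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-0.5B-Instruct',
        datasets=['mmlu_redux'],
        dataset_args={'mmlu_redux': {'subset_list': ['abstract_algebra']}},
        limit=10,
    )
    run_task(task_cfg=task_cfg)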
tests/cli/test_run.py CHANGED
@@ -203,15 +203,16 @@ class TestRun(unittest.TestCase):
         print(res)
 
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_humaneval(self):
+    def test_run_one_task(self):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='qwen/Qwen2-0.5B-Instruct',
+            model='Qwen/Qwen2.5-0.5B-Instruct',
             datasets=[
+                'iquiz',
                 # 'math_500',
                 # 'aime24',
-                'competition_math'
+                # 'competition_math'
             ],
             dataset_args={
                 'competition_math': {
@@ -223,12 +224,39 @@ class TestRun(unittest.TestCase):
 
         run_task(task_cfg=task_cfg)
 
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_task_loop(self):
+        os.environ['CUDA_VISIBLE_DEVICES'] = '2'
+        from evalscope.config import TaskConfig
+
+        task_cfg1 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model1',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg2 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model2',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg3 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model3',
+            datasets=['iquiz'],
+            limit=10
+        )
+
+        run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
+
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_server_model(self):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='qwen2.5-7b-instruct',
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
@@ -251,10 +279,11 @@ class TestRun(unittest.TestCase):
                 # 'gpqa',
                 # 'arc',
                 # 'ceval',
-                'hellaswag',
+                # 'hellaswag',
                 # 'general_mcq',
-                # 'general_qa'
+                'general_qa'
                 # 'super_gpqa',
+                # 'mmlu_redux'
             ],
             dataset_args={
                 'mmlu': {
@@ -308,23 +337,26 @@ class TestRun(unittest.TestCase):
                         'example',  # name of the evaluation dataset, i.e. the * in the *_dev.csv files above
                         # 'test'
                     ],
-                    'metric_list': ['AverageBLEU']
+                    'metric_list': ['AverageRouge']
                 },
                 'super_gpqa': {
                     # 'subset_list': ['Philosophy', 'Education'],
                     'few_shot_num': 0
-                }
+                },
+                'mmlu_redux': {
+                    'subset_list': ['abstract_algebra']
+                },
             },
             eval_batch_size=32,
             limit=15,
-            # debug=True,
+            debug=True,
             stream=False,
             generation_config={
                 'temperature': 0,
-                'n': 1,
+                'n': 2,
                 'max_tokens': 4096,
             },
-            # use_cache='./outputs/20250212_150525',
+            use_cache='outputs/20250326_202848',
         )
 
         run_task(task_cfg=task_cfg)
@@ -365,32 +397,33 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='qwen2.5-7b-instruct',
+            model='qwen2.5-0.5b-instruct',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
                 # 'math_500',
-                'aime24',
+                # 'aime24',
                 # 'competition_math',
                 # 'arc',
                 # 'gsm8k'
                 # 'truthful_qa',
                 # 'simple_qa',
-                # # 'chinese_simpleqa',
+                # 'chinese_simpleqa',
                 # 'live_code_bench',
-                # 'humaneval'
-                # 'general_qa'
+                # 'humaneval',
+                # 'general_qa',
+                # 'alpaca_eval',
+                'arena_hard'
             ],
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4']
                 },
                 'live_code_bench': {
-                    'subset_list': ['v4_v5'],
                     'extra_params': {
-                        'start_date': '2024-12-01',
-                        'end_date': '2025-01-01'
+                        'start_date': '2024-08-01',
+                        'end_date': '2025-02-28'
                     },
                     'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
                 },
@@ -401,20 +434,34 @@ class TestRun(unittest.TestCase):
                         # 'test'
                     ]
                 },
+                'chinese_simpleqa': {
+                    'subset_list': [
+                        '中华文化'
+                    ]
+                },
             },
             eval_batch_size=5,
-            limit=5,
+            limit=10,
             judge_strategy=JudgeStrategy.AUTO,
+            judge_worker_num=5,
             judge_model_args={
                 'model_id': 'qwen2.5-7b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                }
             },
             generation_config={
-                'max_new_tokens': 2048,
+                'max_new_tokens': 20000,
                 'temperature': 0.0,
                 'seed': 42,
-            }
+                'n': 1
+            },
+            timeout=60000,
+            stream=True,
+            # use_cache='outputs/20250320_143658'
         )
 
         run_task(task_cfg=task_cfg)
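
The new test_run_task_loop above passes a list of TaskConfig objects to run_task, which together with the evalscope/run.py change in the file list suggests that submitting several configs in one call is now supported. A minimal sketch of that usage, assuming the same local model and iquiz dataset as the test:

    from evalscope.config import TaskConfig
    from evalscope.run import run_task

    # Two configs that differ only in model_id; per the added test they can be
    # evaluated with a single call by passing them as a list.
    cfg_a = TaskConfig(model='Qwen/Qwen2.5-0.5B-Instruct', model_id='model1', datasets=['iquiz'], limit=10)
    cfg_b = TaskConfig(model='Qwen/Qwen2.5-0.5B-Instruct', model_id='model2', datasets=['iquiz'], limit=10)
    run_task(task_cfg=[cfg_a, cfg_b])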
tests/perf/test_perf.py CHANGED
@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+from dotenv import dotenv_values
 
+env = dotenv_values('.env')
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest
 
@@ -96,6 +98,27 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_local_random(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=20,
+            model='Qwen2.5-0.5B-Instruct',
+            url='http://127.0.0.1:8801/v1/chat/completions',
+            api='openai',
+            dataset='random',
+            min_tokens=1024,
+            max_tokens=1024,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            number=40,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            debug= True,
+        )
+        run_perf_benchmark(task_cfg)
+
 
 if __name__ == '__main__':
     unittest.main(buffer=False)
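
The added test_run_perf_local_random exercises the new random dataset plugin (random_dataset.py in the file list), which appears to generate synthetic prompts of a configurable token length using the given tokenizer. A minimal standalone sketch of the same configuration; the endpoint URL and model names are placeholders for a local OpenAI-compatible deployment, and the prompt-length interpretation is an assumption drawn from the test:

    from evalscope.perf.arguments import Arguments
    from evalscope.perf.main import run_perf_benchmark

    # Placeholder endpoint and model; parameters mirror the added test.
    # min/max_prompt_length presumably bound the synthetic prompt size in tokens (assumption).
    task_cfg = Arguments(
        model='Qwen2.5-0.5B-Instruct',
        url='http://127.0.0.1:8801/v1/chat/completions',
        api='openai',
        dataset='random',
        tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
        min_prompt_length=1024,
        max_prompt_length=1024,
        min_tokens=1024,
        max_tokens=1024,
        prefix_length=0,
        parallel=20,
        number=40,
    )
    run_perf_benchmark(task_cfg)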
tests/rag/test_ragas.py CHANGED
@@ -1,5 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
 import unittest
 
 from evalscope.run import run_task
@@ -63,7 +66,7 @@ class TestRAGAS(unittest.TestCase):
             'eval': {
                 'testset_file': 'outputs/testset_chinese_with_answer.json',
                 'critic_llm': {
-                    'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
+                    'model_name_or_path': 'Qwen/Qwen2.5-7B-Instruct',
                 },
                 'embeddings': {
                     'model_name_or_path': 'AI-ModelScope/m3e-base',