evalscope 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (46)
  1. evalscope/arguments.py +10 -0
  2. evalscope/backend/rag_eval/utils/llm.py +1 -1
  3. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  4. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  5. evalscope/benchmarks/data_adapter.py +4 -2
  6. evalscope/benchmarks/drop/__init__.py +0 -0
  7. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  8. evalscope/benchmarks/drop/utils.py +59 -0
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  10. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  11. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  12. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
  13. evalscope/benchmarks/tool_bench/utils.py +202 -0
  14. evalscope/benchmarks/utils.py +3 -2
  15. evalscope/benchmarks/winogrande/__init__.py +0 -0
  16. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  17. evalscope/collections/evaluator.py +76 -26
  18. evalscope/config.py +46 -15
  19. evalscope/evaluator/evaluator.py +43 -15
  20. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  21. evalscope/metrics/llm_judge.py +3 -3
  22. evalscope/metrics/rouge_metric.py +11 -13
  23. evalscope/models/adapters/chat_adapter.py +51 -34
  24. evalscope/models/adapters/server_adapter.py +15 -19
  25. evalscope/perf/arguments.py +14 -5
  26. evalscope/perf/benchmark.py +0 -6
  27. evalscope/perf/main.py +65 -15
  28. evalscope/perf/utils/benchmark_util.py +33 -15
  29. evalscope/perf/utils/db_util.py +25 -15
  30. evalscope/perf/utils/log_utils.py +1 -1
  31. evalscope/perf/utils/rich_display.py +186 -0
  32. evalscope/report/app.py +47 -34
  33. evalscope/report/utils.py +1 -1
  34. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  35. evalscope/utils/deprecation_utils.py +42 -0
  36. evalscope/version.py +2 -2
  37. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA +45 -21
  38. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD +46 -36
  39. tests/cli/test_all.py +3 -0
  40. tests/cli/test_collection.py +2 -1
  41. tests/cli/test_run.py +28 -12
  42. tests/perf/test_perf.py +23 -0
  43. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
  44. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
  45. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
  46. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
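
Several of the new files above add benchmark adapters (drop, tool_bench, winogrande) that plug into the existing TaskConfig/run_task flow exercised by the test diffs below. A rough sketch of selecting the new adapters (the model id and limit are illustrative, not taken from this release):

from evalscope import TaskConfig, run_task

# Sketch: run the newly added 'drop' and 'winogrande' adapters through the
# same entry point the updated tests use.
task_cfg = TaskConfig(
    model='Qwen/Qwen3-1.7B',          # illustrative model id
    datasets=['drop', 'winogrande'],  # adapters added in 0.16.0
    limit=10,                         # small sample for a smoke test
)

run_task(task_cfg=task_cfg)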
tests/cli/test_run.py CHANGED
@@ -73,6 +73,12 @@ class TestRun(unittest.TestCase):
         logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
         logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_yaml_config(self):
+        from evalscope import run_task
+
+        run_task(task_cfg='examples/tasks/eval_native.yaml')
+
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_task(self):
         task_cfg = TaskConfig(
@@ -209,12 +215,16 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen3-1.7B',
             datasets=[
-                # 'iquiz',
+                'iquiz',
                 # 'math_500',
-                'aime24',
+                # 'aime24',
                 # 'competition_math',
                 # 'mmlu',
+                # 'simple_qa',
             ],
+            model_args={
+                'device_map': 'auto',
+            },
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
@@ -232,7 +242,8 @@ class TestRun(unittest.TestCase):
                 'top_p': 0.8,  # top-p sampling (recommended value from the Qwen report)
                 'top_k': 20,  # top-k sampling (recommended value from the Qwen report)
                 'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
-            }
+            },
+            judge_strategy=JudgeStrategy.AUTO,
         )
 
         run_task(task_cfg=task_cfg)
@@ -276,7 +287,7 @@ class TestRun(unittest.TestCase):
             datasets=[
                 # 'iquiz',
                 # 'ifeval',
-                'mmlu',
+                # 'mmlu',
                 # 'mmlu_pro',
                 # 'musr',
                 # 'process_bench',
@@ -294,10 +305,13 @@ class TestRun(unittest.TestCase):
                 # 'ceval',
                 # 'hellaswag',
                 # 'general_mcq',
-                # 'general_qa'
+                # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
-                # 'maritime_bench'
+                # 'maritime_bench',
+                # 'drop',
+                # 'winogrande',
+                'tool_bench',
             ],
             dataset_args={
                 'mmlu': {
@@ -363,14 +377,16 @@ class TestRun(unittest.TestCase):
                 },
             },
             eval_batch_size=32,
-            limit=15,
+            limit=10,
             debug=True,
             stream=False,
             generation_config={
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
-            }
+            },
+            # ignore_errors=True,
+            use_cache='outputs/20250519_142106'
         )
 
         run_task(task_cfg=task_cfg)
@@ -423,12 +439,12 @@ class TestRun(unittest.TestCase):
                 # 'gsm8k'
                 # 'truthful_qa',
                 # 'simple_qa',
-                # 'chinese_simpleqa',
+                'chinese_simpleqa',
                 # 'live_code_bench',
                 # 'humaneval',
                 # 'general_qa',
                 # 'alpaca_eval',
-                'arena_hard'
+                # 'arena_hard'
             ],
             dataset_args={
                 'competition_math': {
@@ -454,7 +470,7 @@ class TestRun(unittest.TestCase):
                     ]
                 },
            },
-            eval_batch_size=5,
+            eval_batch_size=10,
             limit=10,
             judge_strategy=JudgeStrategy.AUTO,
             judge_worker_num=5,
@@ -475,7 +491,7 @@ class TestRun(unittest.TestCase):
             },
             timeout=60000,
             stream=True,
-            # use_cache='outputs/20250320_143658'
+            use_cache='outputs/20250519_142551'
         )
 
         run_task(task_cfg=task_cfg)
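
The new test_run_yaml_config test above shows that run_task also accepts a path to a YAML task definition; the contents of examples/tasks/eval_native.yaml are not part of this diff. A minimal sketch of that call:

from evalscope import run_task

# run_task can be handed a YAML task definition path instead of a TaskConfig
# object; this is the file referenced by the new test.
run_task(task_cfg='examples/tasks/eval_native.yaml')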
tests/perf/test_perf.py CHANGED
@@ -121,6 +121,29 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_multi_parallel(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=[1, 2],
+            number=[2, 5],
+            model='qwen2.5-7b-instruct',
+            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            api='openai',
+            dataset='random',
+            min_tokens=100,
+            max_tokens=100,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            extra_args={'ignore_eos': True}
+        )
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)
 
 
 if __name__ == '__main__':
     unittest.main(buffer=False)
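
The new test_run_perf_multi_parallel test exercises list-valued parallel and number arguments, matching the changes to evalscope/perf/arguments.py and evalscope/perf/main.py. A condensed sketch of the same kind of sweep (endpoint, model, and token settings are illustrative; how the two lists are paired is inferred from the test, not documented here):

import os

from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

# Sketch: benchmark several concurrency levels in one run. Values mirror the
# new test above; the endpoint and model id are illustrative.
task_cfg = Arguments(
    parallel=[1, 2, 4],           # concurrency levels to sweep
    number=[2, 5, 10],            # requests issued per level
    model='qwen2.5-7b-instruct',
    url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
    api_key=os.environ.get('DASHSCOPE_API_KEY'),
    api='openai',
    dataset='random',
    min_prompt_length=1024,
    max_prompt_length=1024,
    max_tokens=100,
    tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
)

metrics_result, percentile_result = run_perf_benchmark(task_cfg)
print(metrics_result)
print(percentile_result)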