evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (78)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +67 -59
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +12 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  7. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  8. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  9. evalscope/backend/rag_eval/utils/embedding.py +75 -35
  10. evalscope/backend/rag_eval/utils/llm.py +1 -1
  11. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  12. evalscope/benchmarks/benchmark.py +1 -0
  13. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  14. evalscope/benchmarks/data_adapter.py +101 -18
  15. evalscope/benchmarks/docmath/__init__.py +0 -0
  16. evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
  17. evalscope/benchmarks/docmath/utils.py +220 -0
  18. evalscope/benchmarks/drop/__init__.py +0 -0
  19. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  20. evalscope/benchmarks/drop/utils.py +59 -0
  21. evalscope/benchmarks/frames/__init__.py +0 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +90 -0
  23. evalscope/benchmarks/frames/utils.py +37 -0
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  25. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
  27. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  28. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  29. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  30. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
  31. evalscope/benchmarks/tool_bench/utils.py +203 -0
  32. evalscope/benchmarks/utils.py +28 -2
  33. evalscope/benchmarks/winogrande/__init__.py +0 -0
  34. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  35. evalscope/cli/start_app.py +2 -2
  36. evalscope/collections/__init__.py +35 -3
  37. evalscope/collections/evaluator.py +94 -32
  38. evalscope/config.py +54 -17
  39. evalscope/evaluator/evaluator.py +80 -41
  40. evalscope/metrics/__init__.py +3 -1
  41. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  42. evalscope/metrics/llm_judge.py +15 -8
  43. evalscope/metrics/math_parser.py +1 -1
  44. evalscope/metrics/rouge_metric.py +11 -13
  45. evalscope/models/adapters/chat_adapter.py +51 -34
  46. evalscope/models/adapters/server_adapter.py +17 -25
  47. evalscope/perf/arguments.py +16 -7
  48. evalscope/perf/benchmark.py +0 -15
  49. evalscope/perf/main.py +72 -15
  50. evalscope/perf/plugin/datasets/custom.py +15 -0
  51. evalscope/perf/utils/benchmark_util.py +34 -16
  52. evalscope/perf/utils/db_util.py +25 -15
  53. evalscope/perf/utils/local_server.py +1 -0
  54. evalscope/perf/utils/log_utils.py +12 -5
  55. evalscope/perf/utils/rich_display.py +186 -0
  56. evalscope/report/__init__.py +36 -4
  57. evalscope/report/combinator.py +8 -0
  58. evalscope/report/generator.py +33 -9
  59. evalscope/report/utils.py +61 -4
  60. evalscope/run.py +12 -0
  61. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  62. evalscope/utils/deprecation_utils.py +42 -0
  63. evalscope/utils/logger.py +1 -1
  64. evalscope/utils/utils.py +12 -0
  65. evalscope/version.py +2 -2
  66. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
  67. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
  68. tests/aigc/test_t2i.py +40 -3
  69. tests/cli/test_all.py +39 -32
  70. tests/cli/test_collection.py +8 -6
  71. tests/cli/test_run.py +43 -17
  72. tests/perf/test_perf.py +23 -0
  73. tests/rag/test_mteb.py +5 -5
  74. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  75. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
  76. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
  77. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
  78. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
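
Among these changes, 0.16.1 adds six new benchmark adapters (docmath, drop, frames, needle_haystack, tool_bench, winogrande). As a rough orientation sketch only (not part of the diff), the updated tests below suggest the new benchmarks are selected like any other dataset name through TaskConfig; the model id and limit here are illustrative:

# Minimal sketch, assuming the TaskConfig/run_task usage shown in the test diffs below.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen3-1.7B',                            # example model taken from the updated tests
    datasets=['frames', 'docmath', 'needle_haystack'],  # adapters introduced in 0.16.1
    limit=5,                                            # small sample for a quick smoke run
)
run_task(task_cfg=task_cfg)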
tests/cli/test_all.py CHANGED
@@ -12,40 +12,46 @@ from evalscope.run import run_task
 from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger

-os.environ['LOG_LEVEL'] = 'DEBUG'
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

 logger = get_logger()

 datasets=[
-    'iquiz',
-    'ifeval',
-    'mmlu',
-    'mmlu_pro',
-    'musr',
-    'process_bench',
-    'race',
-    'trivia_qa',
-    'cmmlu',
-    'humaneval',
-    'gsm8k',
-    'bbh',
-    'competition_math',
-    'math_500',
-    'aime24',
-    'gpqa',
-    'arc',
-    'ceval',
-    'hellaswag',
-    'general_mcq',
-    'general_qa',
-    'super_gpqa',
-    'live_code_bench',
-    'mmlu_redux',
-    'simple_qa',
-    'chinese_simpleqa',
-    'alpaca_eval',
-    'arena_hard',
-    'maritime_bench',
+    # 'iquiz',
+    # 'ifeval',
+    # 'mmlu',
+    # 'mmlu_pro',
+    # 'musr',
+    # 'process_bench',
+    # 'race',
+    # 'trivia_qa',
+    # 'cmmlu',
+    # 'humaneval',
+    # 'gsm8k',
+    # 'bbh',
+    # 'competition_math',
+    # 'math_500',
+    # 'aime24',
+    # 'gpqa',
+    # 'arc',
+    # 'ceval',
+    # 'hellaswag',
+    # 'general_mcq',
+    # 'general_qa',
+    # 'super_gpqa',
+    # 'live_code_bench',
+    # 'mmlu_redux',
+    # 'simple_qa',
+    # 'chinese_simpleqa',
+    # 'alpaca_eval',
+    # 'arena_hard',
+    # 'maritime_bench',
+    # 'drop',
+    # 'winogrande',
+    # 'tool_bench',
+    'frames',
+    'docmath',
+    'needle_haystack'
 ]

 dataset_args={
@@ -128,7 +134,7 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='qwen2.5-0.5b-instruct',
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
@@ -142,9 +148,10 @@ class TestRun(unittest.TestCase):
                 'n': 1,
                 'max_tokens': 4096,
             },
+            judge_worker_num=5,
             judge_strategy=JudgeStrategy.AUTO,
             judge_model_args={
-                'model_id': 'qwen2.5-7b-instruct',
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
             }
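
Several of the updated tests configure an LLM judge the same way: JudgeStrategy.AUTO, a judge_model_args dict, and (new in these tests) judge_worker_num. A minimal sketch of that recurring pattern, using only fields that appear in the diffs; the JudgeStrategy import path is an assumption, as the tests do not show their imports:

# Minimal sketch of the judge configuration used across the updated tests.
import os
from evalscope.config import TaskConfig
from evalscope.constants import JudgeStrategy  # assumed import location

task_cfg = TaskConfig(
    model='qwen-plus',
    datasets=['simple_qa'],              # illustrative; any judge-backed benchmark works the same way
    judge_strategy=JudgeStrategy.AUTO,   # let evalscope decide when the judge is needed
    judge_worker_num=5,                  # parallel judge workers, newly set in these tests
    judge_model_args={
        'model_id': 'qwen2.5-72b-instruct',
        'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'api_key': os.getenv('DASHSCOPE_API_KEY'),
    },
)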
tests/cli/test_collection.py CHANGED
@@ -72,13 +72,15 @@ class TestCollection(unittest.TestCase):
                 'local_path': 'outputs/mixed_data_test.jsonl'
                 # 'local_path': 'outputs/weighted_mixed_data.jsonl'
             }},
-            limit=10,
-            judge_strategy=JudgeStrategy.LLM_RECALL,
+            limit=5,
+            judge_strategy=JudgeStrategy.AUTO,
             judge_model_args={
-                'model_id': 'qwen2.5-7b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': os.getenv('DASHSCOPE_API_KEY'),
-            }
+                # 'model_id': 'qwen2.5-72b-instruct',
+                # 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                # 'api_key': os.getenv('DASHSCOPE_API_KEY'),
+            },
+            analysis_report=True,
+            # use_cache='outputs/20250522_204520'
         )
         res = run_task(task_cfg=task_cfg)
         print(res)
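
The CLI tests also rename the debug-logging environment variable from LOG_LEVEL to EVALSCOPE_LOG_LEVEL (here and again in test_run.py below). A minimal sketch of the new pattern, mirroring the updated tests; whether the old name is still honoured is not shown in this diff:

import os

# Set before get_logger() is called, matching the ordering in the updated tests.
os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'  # replaces the old LOG_LEVEL variable

from evalscope.utils.logger import get_logger

logger = get_logger()
logger.debug('debug logging enabled for this run')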
tests/cli/test_run.py CHANGED
@@ -13,7 +13,7 @@ from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger

-os.environ['LOG_LEVEL'] = 'DEBUG'
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

 logger = get_logger()

@@ -73,6 +73,12 @@ class TestRun(unittest.TestCase):
         logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
         logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')

+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_yaml_config(self):
+        from evalscope import run_task
+
+        run_task(task_cfg='examples/tasks/eval_native.yaml')
+
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_task(self):
         task_cfg = TaskConfig(
@@ -209,12 +215,16 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen3-1.7B',
             datasets=[
-                # 'iquiz',
+                'iquiz',
                 # 'math_500',
-                'aime24',
+                # 'aime24',
                 # 'competition_math',
                 # 'mmlu',
+                # 'simple_qa',
             ],
+            model_args={
+                'device_map': 'auto',
+            },
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
@@ -232,7 +242,8 @@ class TestRun(unittest.TestCase):
                 'top_p': 0.8,  # top-p采样 (qwen 报告推荐值)
                 'top_k': 20,  # top-k采样 (qwen 报告推荐值)
                 'chat_template_kwargs': {'enable_thinking': False}  # 关闭思考模式
-            }
+            },
+            judge_strategy=JudgeStrategy.AUTO,
         )

         run_task(task_cfg=task_cfg)
@@ -276,7 +287,7 @@ class TestRun(unittest.TestCase):
             datasets=[
                 # 'iquiz',
                 # 'ifeval',
-                'mmlu',
+                # 'mmlu',
                 # 'mmlu_pro',
                 # 'musr',
                 # 'process_bench',
@@ -294,10 +305,14 @@ class TestRun(unittest.TestCase):
                 # 'ceval',
                 # 'hellaswag',
                 # 'general_mcq',
-                # 'general_qa'
+                # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
-                # 'maritime_bench'
+                # 'maritime_bench',
+                # 'drop',
+                # 'winogrande',
+                # 'tool_bench',
+                'frames',
             ],
             dataset_args={
                 'mmlu': {
@@ -363,14 +378,17 @@ class TestRun(unittest.TestCase):
                 },
             },
             eval_batch_size=32,
-            limit=15,
+            limit=10,
             debug=True,
             stream=False,
             generation_config={
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
-            }
+                # 'extra_headers':{'key': 'value'},
+            },
+            # ignore_errors=True,
+            # use_cache='outputs/20250519_142106'
         )

         run_task(task_cfg=task_cfg)
@@ -411,16 +429,16 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='qwen2.5-0.5b-instruct',
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
                 # 'math_500',
-                # 'aime24',
+                'aime24',
                 # 'competition_math',
                 # 'arc',
-                # 'gsm8k'
+                # 'gsm8k',
                 # 'truthful_qa',
                 # 'simple_qa',
                 # 'chinese_simpleqa',
@@ -428,7 +446,10 @@ class TestRun(unittest.TestCase):
                 # 'humaneval',
                 # 'general_qa',
                 # 'alpaca_eval',
-                'arena_hard'
+                # 'arena_hard',
+                # 'frames',
+                # 'docmath',
+                # 'needle_haystack',
             ],
             dataset_args={
                 'competition_math': {
@@ -453,13 +474,16 @@ class TestRun(unittest.TestCase):
                         '中华文化'
                     ]
                 },
+                'frames': {
+                    'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
+                }
             },
-            eval_batch_size=5,
-            limit=10,
+            eval_batch_size=10,
+            limit=1,
             judge_strategy=JudgeStrategy.AUTO,
             judge_worker_num=5,
             judge_model_args={
-                'model_id': 'qwen2.5-7b-instruct',
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'generation_config': {
@@ -475,7 +499,9 @@ class TestRun(unittest.TestCase):
             },
             timeout=60000,
             stream=True,
-            # use_cache='outputs/20250320_143658'
+            analysis_report=True,
+            # debug=True,
+            # use_cache='outputs/20250602_135859'
         )

         run_task(task_cfg=task_cfg)
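
The new test_run_yaml_config case passes a YAML file path straight to run_task, which suggests run_task now accepts a task configuration file as well as a TaskConfig object. A minimal sketch of that call; the contents of the YAML file are not shown in the diff, so the commented key names are an assumption:

# Minimal sketch, assuming run_task accepts a YAML task file as the new test implies.
from evalscope import run_task

# The YAML file would presumably carry the same fields used with TaskConfig in these
# tests (e.g. model, datasets, limit, generation_config) -- keys assumed, not shown.
run_task(task_cfg='examples/tasks/eval_native.yaml')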
tests/perf/test_perf.py CHANGED
@@ -121,6 +121,29 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)

+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_multi_parallel(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=[1, 2],
+            number=[2, 5],
+            model='qwen2.5-7b-instruct',
+            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            api='openai',
+            dataset='random',
+            min_tokens=100,
+            max_tokens=100,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            extra_args={'ignore_eos': True}
+        )
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)

 if __name__ == '__main__':
     unittest.main(buffer=False)
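
The new test_run_perf_multi_parallel case passes lists to parallel and number, which appears to sweep several concurrency settings in a single perf run (e.g. 2 requests at concurrency 1, then 5 at concurrency 2); that pairing is an inference from the values, not stated in the diff. A condensed sketch of the call, with the import path for run_perf_benchmark assumed:

# Condensed sketch of the list-valued perf arguments introduced in the new test.
import os
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark  # assumed import location

task_cfg = Arguments(
    parallel=[1, 2],               # one entry per concurrency level to benchmark
    number=[2, 5],                 # request count paired with each concurrency level (inferred)
    model='qwen2.5-7b-instruct',
    url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    api='openai',
    dataset='random',
    min_tokens=100,
    max_tokens=100,
)
metrics_result, percentile_result = run_perf_benchmark(task_cfg)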
tests/rag/test_mteb.py CHANGED
@@ -46,11 +46,11 @@ class TestMTEB(unittest.TestCase):
             ],
             'eval': {
                 'tasks': [
-                    # 'TNews',
-                    # 'CLSClusteringS2S',
+                    'TNews',
+                    'CLSClusteringS2S',
                     'T2Reranking',
-                    # 'T2Retrieval',
-                    # 'ATEC',
+                    'T2Retrieval',
+                    'ATEC',
                 ],
                 'verbosity': 2,
                 'overwrite_results': True,
@@ -85,7 +85,7 @@ class TestMTEB(unittest.TestCase):
             ],
             'verbosity': 2,
             'overwrite_results': True,
-            'limits': 30,
+            'limits': 10,
         },
     },
 )