evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.
Files changed (148)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  6. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  7. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  8. evalscope/api/benchmark/benchmark.py +62 -2
  9. evalscope/api/benchmark/meta.py +9 -0
  10. evalscope/api/dataset/dataset.py +6 -6
  11. evalscope/api/dataset/loader.py +2 -1
  12. evalscope/api/evaluator/cache.py +24 -1
  13. evalscope/api/evaluator/evaluator.py +5 -0
  14. evalscope/api/evaluator/state.py +17 -1
  15. evalscope/api/messages/__init__.py +1 -0
  16. evalscope/api/messages/chat_message.py +52 -2
  17. evalscope/api/metric/scorer.py +15 -7
  18. evalscope/api/mixin/__init__.py +1 -1
  19. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  20. evalscope/api/mixin/sandbox_mixin.py +204 -0
  21. evalscope/api/model/generate_config.py +1 -6
  22. evalscope/api/model/model.py +5 -2
  23. evalscope/api/tool/tool_info.py +1 -1
  24. evalscope/app/app.py +3 -0
  25. evalscope/app/ui/single_model.py +3 -3
  26. evalscope/app/utils/data_utils.py +7 -7
  27. evalscope/app/utils/env_utils.py +12 -0
  28. evalscope/app/utils/text_utils.py +14 -12
  29. evalscope/arguments.py +8 -4
  30. evalscope/backend/opencompass/backend_manager.py +0 -2
  31. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  32. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  33. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  34. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  35. evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
  36. evalscope/benchmarks/bfcl/generation.py +9 -9
  37. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  38. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  39. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  40. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  41. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  42. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  43. evalscope/benchmarks/healthbench/utils.py +102 -0
  44. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  45. evalscope/benchmarks/humaneval/utils.py +235 -0
  46. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  47. evalscope/benchmarks/image_edit/__init__.py +0 -0
  48. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  49. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  50. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  51. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  52. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  53. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  54. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  55. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  56. evalscope/benchmarks/math_vista/__init__.py +0 -0
  57. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  58. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  59. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  60. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  61. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  62. evalscope/benchmarks/mm_star/__init__.py +0 -0
  63. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  64. evalscope/benchmarks/mmmu/__init__.py +0 -0
  65. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  66. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  67. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  68. evalscope/benchmarks/multi_if/__init__.py +0 -0
  69. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  70. evalscope/benchmarks/multi_if/metrics.py +120 -0
  71. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  72. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
  73. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  74. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  75. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  76. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  77. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  78. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  79. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  80. evalscope/benchmarks/tau_bench/generation.py +1 -1
  81. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
  82. evalscope/benchmarks/text2image/__init__.py +0 -0
  83. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  84. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  85. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  86. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  87. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  88. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  89. evalscope/cli/start_app.py +7 -1
  90. evalscope/cli/start_perf.py +7 -1
  91. evalscope/config.py +96 -14
  92. evalscope/constants.py +11 -0
  93. evalscope/evaluator/evaluator.py +30 -10
  94. evalscope/metrics/llm_judge.py +19 -7
  95. evalscope/metrics/metric.py +27 -2
  96. evalscope/models/image_edit_model.py +125 -0
  97. evalscope/models/model_apis.py +22 -0
  98. evalscope/models/openai_compatible.py +3 -0
  99. evalscope/models/text2image_model.py +2 -2
  100. evalscope/models/utils/openai.py +8 -6
  101. evalscope/perf/arguments.py +2 -0
  102. evalscope/perf/benchmark.py +2 -0
  103. evalscope/perf/plugin/api/base.py +2 -2
  104. evalscope/perf/plugin/api/default_api.py +7 -7
  105. evalscope/perf/plugin/api/openai_api.py +83 -19
  106. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  107. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  108. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  109. evalscope/perf/utils/benchmark_util.py +7 -5
  110. evalscope/perf/utils/local_server.py +3 -0
  111. evalscope/report/__init__.py +0 -1
  112. evalscope/report/combinator.py +0 -25
  113. evalscope/report/generator.py +8 -87
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +9 -5
  116. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  117. evalscope/utils/chat_service.py +1 -1
  118. evalscope/utils/function_utils.py +41 -0
  119. evalscope/utils/import_utils.py +73 -1
  120. evalscope/utils/io_utils.py +56 -7
  121. evalscope/utils/json_schema.py +23 -2
  122. evalscope/utils/logger.py +19 -0
  123. evalscope/utils/model_utils.py +4 -3
  124. evalscope/utils/multi_choices.py +23 -6
  125. evalscope/version.py +2 -2
  126. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
  127. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
  128. tests/benchmark/test_eval.py +80 -37
  129. tests/benchmark/test_image_edit.py +65 -0
  130. tests/benchmark/test_sandbox.py +81 -0
  131. tests/benchmark/test_vlm.py +137 -0
  132. tests/cli/test_all.py +83 -43
  133. tests/cli/test_collection.py +8 -5
  134. tests/cli/test_reasoning.py +81 -0
  135. tests/common.py +73 -0
  136. tests/perf/test_perf.py +44 -14
  137. tests/rag/test_clip_benchmark.py +0 -3
  138. evalscope/api/mixin/dataset_mixin.py +0 -105
  139. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  140. tests/aigc/__init__.py +0 -1
  141. /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
  142. /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
  143. /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
  144. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  145. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  146. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  147. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
  148. /tests/{aigc → benchmark}/test_t2i.py +0 -0
tests/benchmark/test_eval.py CHANGED
@@ -4,17 +4,15 @@ from dotenv import dotenv_values
  env = dotenv_values('.env')

  import unittest
- from unittest import TestCase

- from evalscope.config import TaskConfig
  from evalscope.constants import EvalType, JudgeStrategy, OutputType
- from evalscope.run import run_task
  from evalscope.utils.logger import get_logger
+ from tests.common import TestBenchmark

  logger = get_logger()


- class TestBenchmark(TestCase):
+ class TestNativeBenchmark(TestBenchmark):
  """Benchmark evaluation test cases."""

  def setUp(self):
@@ -35,38 +33,18 @@ class TestBenchmark(TestCase):
  'judge_strategy': JudgeStrategy.AUTO,
  'judge_worker_num': 5,
  'judge_model_args': {
- 'model_id': 'qwen2.5-72b-instruct',
+ 'model_id': 'qwen3-235b-a22b',
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
  'api_key': env.get('DASHSCOPE_API_KEY'),
  'generation_config': {
  'temperature': 0.0,
  'max_tokens': 4096,
+ 'extra_body': {'enable_thinking': False}
  }
  },
  'debug': True,
  }

- def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
- """Helper method to run test for a specific dataset."""
- config = self.base_config.copy()
- config['datasets'] = [dataset_name]
-
- if use_mock:
- config['eval_type'] = EvalType.MOCK_LLM
-
- # Apply configuration overrides
- config.update(config_overrides)
-
- if dataset_args:
- config['dataset_args'] = {dataset_name: dataset_args}
-
- task_cfg = TaskConfig(**config)
- run_task(task_cfg=task_cfg)
-
- def _run_dataset_load_test(self, dataset_name, dataset_args=None):
- """Helper method to test dataset loading."""
-
- self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)

  # Math & Reasoning datasets
  def test_gsm8k(self):
@@ -84,10 +62,18 @@ class TestBenchmark(TestCase):
  """Test MMLU reasoning dataset."""
  dataset_args = {
  'few_shot_num': 0,
- # 'subset_list': ['abstract_algebra', 'computer_security']
+ 'subset_list': ['abstract_algebra', 'computer_security']
  }
  self._run_dataset_test('mmlu', use_mock=True, dataset_args=dataset_args)

+ def test_mmlu_reasoning(self):
+ """Test MMLU reasoning dataset."""
+ dataset_args = {
+ 'few_shot_num': 0,
+ 'subset_list': ['abstract_algebra', 'computer_security']
+ }
+ self._run_dataset_test('mmlu', dataset_args=dataset_args, model='qwen3-0.6b', stream=True)
+
  def test_mmlu_pro(self):
  """Test MMLU-Pro reasoning dataset."""
  dataset_args = {
@@ -116,7 +102,11 @@ class TestBenchmark(TestCase):
  def test_math_500(self):
  """Test MATH 500 dataset."""
  # self._run_dataset_load_test('math_500')
- self._run_dataset_test('math_500')
+ dataset_args = {
+ 'subset_list': ['Level 1', 'Level 2'],
+ 'few_shot_num': 0,
+ }
+ self._run_dataset_test('math_500', dataset_args=dataset_args)

  def test_aime24(self):
  """Test AIME 2024 dataset."""
@@ -222,6 +212,7 @@ class TestBenchmark(TestCase):
  def test_bbh(self):
  dataset_args = {
  'subset_list': ['temporal_sequences', 'navigate'],
+ 'few_shot_num': 0,
  }
  self._run_dataset_test('bbh', dataset_args=dataset_args)

@@ -336,20 +327,21 @@ class TestBenchmark(TestCase):
  def test_humaneval(self):
  """Test HumanEval dataset."""
  dataset_args = {
- 'metric_list': ['Pass@1', 'Pass@2', 'Pass@5']
+ 'metric_list': ['Pass@1']
  }
- self._run_dataset_test('humaneval', dataset_args, repeats=5)
+ self._run_dataset_test('humaneval', dataset_args)

  def test_live_code_bench(self):
  """Test LiveCodeBench dataset."""
  dataset_args = {
- 'subset_list': ['v6'],
+ 'subset_list': ['v5'],
+ 'review_timeout': 6,
  'extra_params': {
  'start_date': '2024-08-01',
  'end_date': '2025-02-28'
  },
  }
- self._run_dataset_test('live_code_bench', dataset_args, judge_worker_num=1)
+ self._run_dataset_test('live_code_bench', dataset_args, limit=20, use_cache='outputs/20250918_200232', rerun_review=True)

  def test_tool_bench(self):
  """Test ToolBench dataset."""
@@ -358,27 +350,78 @@ class TestBenchmark(TestCase):
  def test_bfcl(self):
  """Test BFCL dataset."""
  dataset_args = {
- 'subset_list': ['simple', 'live_multiple', 'multi_turn_base'],
+ 'subset_list': [
+ # 'simple',
+ # 'live_multiple',
+ # 'multi_turn_base',
+ 'multi_turn_miss_func'
+ ],
  'extra_params': {
  'is_fc_model': True,
  'underscore_to_dot': True
  }
  }
- self._run_dataset_test('bfcl_v3', dataset_args)
+ self._run_dataset_test('bfcl_v3', dataset_args, model='qwen-plus', limit=30, eval_batch_size=5)

  def test_tau_bench(self):
  dataset_args = {
+ 'subset_list': [
+ 'airline',
+ 'retail'
+ ],
  'extra_params': {
  'user_model': 'qwen-plus',
  'api_key': env.get('DASHSCOPE_API_KEY'),
  'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
  'generation_config': {
- 'temperature': 0.7,
- 'max_new_tokens': 1024
+ 'temperature': 0.0,
+ 'max_tokens': 12000,
+ 'stream': True
  }
  }
  }
- self._run_dataset_test('tau_bench', dataset_args, limit=1)
+ self._run_dataset_test('tau_bench', dataset_args, limit=5, model='qwq-plus', stream=True)
+
+ def test_r1_collection(self):
+ dataset_args = {
+ 'dataset_id': 'evalscope/R1-Distill-Math-Test-v2'
+ }
+ self._run_dataset_test('data_collection', dataset_args)
+
+ def test_qwen3_collection(self):
+ dataset_args = {
+ 'dataset_id': 'evalscope/Qwen3-Test-Collection'
+ }
+ self._run_dataset_test('data_collection', dataset_args)
+
+ def test_multi_if(self):
+ dataset_args = {
+ 'subset_list': ['English', 'Chinese'],
+ 'few_shot_num': 0,
+ }
+ self._run_dataset_test('multi_if', dataset_args, limit=5)
+
+ def test_healthbench(self):
+ dataset_args = {
+ 'subset_list': ['health_data_tasks'],
+ 'extra_params': {
+ 'version': 'Hard'
+ }
+ }
+ self._run_dataset_test('health_bench', dataset_args, limit=5)
+
+
+ def test_amc(self):
+ dataset_args = {
+ 'subset_list': ['amc22'],
+ }
+ self._run_dataset_test('amc', dataset_args)
+
+ def test_minerva_math(self):
+ dataset_args = {
+ 'subset_list': ['default'],
+ }
+ self._run_dataset_test('minerva_math', dataset_args)

  if __name__ == '__main__':
  # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
tests/benchmark/test_image_edit.py ADDED
@@ -0,0 +1,65 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
+ import unittest
+
+ from evalscope.constants import EvalType, JudgeStrategy, ModelTask
+ from evalscope.utils.logger import get_logger
+ from tests.common import TestBenchmark
+
+ logger = get_logger()
+
+
+ class TestImageEditBenchmark(TestBenchmark):
+ def setUp(self):
+ """Setup common test configuration."""
+ self.base_config = {
+ 'model': 'Qwen/Qwen-Image-Edit',
+ 'model_args':{
+ 'precision': 'bfloat16',
+ 'device_map': 'cuda:2'
+ },
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'model_task': ModelTask.IMAGE_GENERATION,
+ 'eval_type': EvalType.IMAGE_EDITING,
+ 'eval_batch_size': 1,
+ 'limit': 5,
+ 'generation_config': {
+ 'true_cfg_scale': 4.0,
+ 'num_inference_steps': 50,
+ 'negative_prompt': ' ',
+ },
+ 'judge_strategy': JudgeStrategy.AUTO,
+ 'judge_worker_num': 5,
+ 'judge_model_args': {
+ 'model_id': 'qwen2.5-vl-72b-instruct',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'generation_config': {
+ 'temperature': 0.0,
+ 'max_tokens': 4096,
+ }
+ },
+ 'debug': True,
+ }
+
+ def test_gedit(self):
+ """Test GEdit dataset."""
+ dataset_args = {
+ 'extra_params':{
+ 'language': 'cn',
+ }
+ }
+ self._run_dataset_test('gedit', dataset_args=dataset_args, use_cache='outputs/20250829_150058')
+
+ def test_gedit_local(self):
+ dataset_args = {
+ 'extra_params':{
+ 'language': 'cn',
+ 'local_file': 'outputs/example_edit.jsonl',
+ }
+ }
+ self._run_dataset_test('gedit', dataset_args=dataset_args, model=None, model_id='offline_model')
tests/benchmark/test_sandbox.py ADDED
@@ -0,0 +1,81 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
+ import unittest
+
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
+ from evalscope.utils.logger import get_logger
+ from tests.common import TestBenchmark
+
+ logger = get_logger()
+
+
+ class TestCodeBenchmark(TestBenchmark):
+ """Benchmark evaluation test cases."""
+
+ def setUp(self):
+ """Setup common test configuration."""
+ self.base_config = {
+ 'model': 'qwen-plus',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'eval_type': EvalType.SERVICE,
+ 'eval_batch_size': 5,
+ 'limit': 5,
+ 'generation_config': {
+ 'max_tokens': 4096,
+ 'temperature': 0.0,
+ 'seed': 42,
+ 'parallel_tool_calls': True
+ },
+ 'judge_strategy': JudgeStrategy.AUTO,
+ 'judge_worker_num': 5,
+ 'judge_model_args': {
+ 'model_id': 'qwen2.5-72b-instruct',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'generation_config': {
+ 'temperature': 0.0,
+ 'max_tokens': 4096,
+ }
+ },
+ 'use_sandbox': True,
+ 'sandbox_type': 'docker',
+ 'debug': True,
+ }
+
+ def test_humaneval(self):
+ """Test Humaneval dataset."""
+ self._run_dataset_test('humaneval', limit=5)
+
+ def test_humaneval_remote_sandbox(self):
+ """Test Humaneval dataset with remote sandbox manager."""
+ sandbox_manager_config = {'base_url': 'http://localhost:8000'}
+ self._run_dataset_test('humaneval', limit=5, sandbox_manager_config=sandbox_manager_config)
+
+ def test_live_code_bench(self):
+ """Test Live Code Bench dataset."""
+ dataset_args = {
+ 'subset_list': ['v5'],
+ 'review_timeout': 6,
+ 'extra_params': {
+ 'start_date': '2024-08-01',
+ 'end_date': '2025-02-28'
+ },
+ }
+ self._run_dataset_test('live_code_bench', limit=5, dataset_args=dataset_args, use_cache='outputs/20250918_200232', rerun_review=True)
+
+ def test_live_code_bench_remote_sandbox(self):
+ """Test Live Code Bench dataset."""
+ dataset_args = {
+ 'subset_list': ['v5'],
+ 'review_timeout': 6,
+ 'extra_params': {
+ 'start_date': '2024-08-01',
+ 'end_date': '2025-02-28'
+ },
+ }
+ sandbox_manager_config = {'base_url': 'http://localhost:8000'}
+ self._run_dataset_test('live_code_bench', limit=20, dataset_args=dataset_args, sandbox_manager_config=sandbox_manager_config, use_cache='outputs/20250918_200232_2', rerun_review=True)
tests/benchmark/test_vlm.py ADDED
@@ -0,0 +1,137 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
+ import unittest
+
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
+ from evalscope.utils.logger import get_logger
+ from tests.common import TestBenchmark
+
+ logger = get_logger()
+
+
+ class TestVLMBenchmark(TestBenchmark):
+ """Benchmark evaluation test cases."""
+
+ def setUp(self):
+ """Setup common test configuration."""
+ self.base_config = {
+ 'model': 'qwen-vl-plus',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'eval_type': EvalType.SERVICE,
+ 'eval_batch_size': 5,
+ 'limit': 5,
+ 'generation_config': {
+ 'max_tokens': 2048,
+ 'temperature': 0.0,
+ 'seed': 42,
+ 'parallel_tool_calls': True
+ },
+ 'judge_strategy': JudgeStrategy.AUTO,
+ 'judge_worker_num': 5,
+ 'judge_model_args': {
+ 'model_id': 'qwen2.5-72b-instruct',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'generation_config': {
+ 'temperature': 0.0,
+ 'max_tokens': 4096,
+ }
+ },
+ 'debug': True,
+ }
+
+ def test_mmmu(self):
+ dataset_args = {
+ 'subset_list':[
+ 'Accounting',
+ 'Agriculture',
+ # 'Architecture_and_Engineering'
+ ]
+ }
+ self._run_dataset_test('mmmu', dataset_args=dataset_args)
+
+ def test_math_vista(self):
+ dataset_args = {
+ 'subset_list': ['default']
+ }
+ self._run_dataset_test('math_vista', dataset_args=dataset_args)
+
+ def test_mmmu_pro(self):
+ dataset_args = {
+ 'subset_list':[
+ 'Accounting',
+ # 'Agriculture',
+ ],
+ 'extra_params': {
+ 'dataset_format': 'standard (4 options)', # 'standard (4 options)', 'standard (10 options)', 'vision'
+ },
+ }
+ self._run_dataset_test('mmmu_pro', dataset_args=dataset_args, limit=10)
+
+ def test_qwen3_vl_collection(self):
+ dataset_args = {
+ 'dataset_id': 'outputs/qwen3_vl_test.jsonl',
+ 'shuffle': True,
+ }
+ self._run_dataset_test('data_collection', dataset_args, limit=100)
+
+ def test_real_world_qa(self):
+ dataset_args = {
+ 'subset_list': ['default']
+ }
+ self._run_dataset_test('real_world_qa', dataset_args=dataset_args, limit=10)
+
+ def test_ai2d(self):
+ dataset_args = {
+ 'subset_list': ['default']
+ }
+ self._run_dataset_test('ai2d', dataset_args=dataset_args)
+
+ def test_cc_bench(self):
+ dataset_args = {
+ 'subset_list': ['cc']
+ }
+ self._run_dataset_test('cc_bench', dataset_args=dataset_args)
+
+ def test_mm_bench(self):
+ dataset_args = {
+ 'subset_list': ['cn', 'en']
+ }
+ self._run_dataset_test('mm_bench', dataset_args=dataset_args)
+
+ def test_mm_star(self):
+ dataset_args = {
+ # 'subset_list': ['val']
+ }
+ self._run_dataset_test('mm_star', dataset_args=dataset_args)
+
+ def test_omni_bench(self):
+ dataset_args = {
+ 'extra_params': {
+ 'use_image': True, # Whether to use image input, if False, use text alternative image content.
+ 'use_audio': True, # Whether to use audio input, if False, use text alternative audio content.
+ }
+ }
+ self._run_dataset_test('omni_bench', dataset_args=dataset_args, model='qwen-omni-turbo')
+
+ def test_olympiad_bench(self):
+ dataset_args = {
+ 'subset_list': [
+ # 'OE_MM_maths_en_COMP',
+ # 'OE_MM_maths_zh_CEE',
+ # 'OE_MM_maths_zh_COMP',
+ # 'OE_MM_physics_en_COMP',
+ # 'OE_MM_physics_zh_CEE',
+ # 'OE_TO_maths_en_COMP',
+ # 'OE_TO_maths_zh_CEE',
+ # 'OE_TO_maths_zh_COMP',
+ # 'OE_TO_physics_en_COMP',
+ # 'OE_TO_physics_zh_CEE',
+ 'TP_TO_maths_zh_CEE',
+ ]
+ }
+ self._run_dataset_test('olympiad_bench', dataset_args=dataset_args)
tests/cli/test_all.py CHANGED
@@ -17,44 +17,44 @@ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
  logger = get_logger()

  datasets=[
- 'iquiz',
- 'ifeval',
- 'mmlu',
- 'mmlu_pro',
- 'musr',
- 'process_bench',
- 'race',
- 'trivia_qa',
- 'cmmlu',
- 'humaneval',
- 'gsm8k',
- 'bbh',
- 'competition_math',
- 'math_500',
- 'aime24',
- 'gpqa_diamond',
- 'arc',
- 'ceval',
- 'hellaswag',
- 'general_mcq',
- 'general_qa',
- 'super_gpqa',
- # 'live_code_bench',
- 'mmlu_redux',
- 'simple_qa',
- 'chinese_simpleqa',
- 'alpaca_eval',
- 'arena_hard',
- 'maritime_bench',
- 'drop',
- 'winogrande',
- 'tool_bench',
- 'frames',
- 'docmath',
- 'needle_haystack',
- 'bfcl_v3',
- 'hle',
- 'tau_bench',
+ 'iquiz',
+ 'ifeval',
+ 'mmlu',
+ 'mmlu_pro',
+ 'musr',
+ 'process_bench',
+ 'race',
+ 'trivia_qa',
+ 'cmmlu',
+ 'humaneval',
+ 'gsm8k',
+ 'bbh',
+ 'competition_math',
+ 'math_500',
+ 'aime24',
+ 'gpqa_diamond',
+ 'arc',
+ 'ceval',
+ 'hellaswag',
+ 'general_mcq',
+ 'general_qa',
+ 'super_gpqa',
+ # 'live_code_bench',
+ 'mmlu_redux',
+ 'simple_qa',
+ 'chinese_simpleqa',
+ 'alpaca_eval',
+ 'arena_hard',
+ 'maritime_bench',
+ 'drop',
+ 'winogrande',
+ 'tool_bench',
+ 'frames',
+ 'docmath',
+ 'needle_haystack',
+ 'bfcl_v3',
+ 'hle',
+ 'tau_bench',
  ]

  # Reverse the datasets list to ensure the order is from most recent to oldest
@@ -150,7 +150,6 @@ dataset_args={
  }

  class TestRun(unittest.TestCase):
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_benchmarks(self):
  from evalscope.config import TaskConfig

@@ -180,19 +179,60 @@ class TestRun(unittest.TestCase):

  run_task(task_cfg=task_cfg)

+ def test_vlm_benchmark(self):
+ from evalscope.config import TaskConfig
+
+ task_cfg = TaskConfig(
+ model='qwen-vl-plus',
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+ api_key= env.get('DASHSCOPE_API_KEY'),
+ eval_type=EvalType.SERVICE,
+ datasets=[
+ 'mmmu',
+ # 'math_vista',
+ ],
+ dataset_args={
+ 'mmmu': {
+ 'subset_list': ['Accounting']
+ },
+ 'math_vista': {
+ 'subset_list': ['default']
+ }
+ },
+ eval_batch_size=1,
+ limit=1,
+ stream=True,
+ generation_config={
+ 'temperature': 0,
+ 'n': 1,
+ 'max_tokens': 4096,
+ 'image_height': 512,
+ 'image_width': 512,
+ 'image_num': 2,
+ },
+ judge_worker_num=5,
+ judge_strategy=JudgeStrategy.AUTO,
+ judge_model_args={
+ 'model_id': 'qwen2.5-72b-instruct',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ }
+ )
+
+ run_task(task_cfg=task_cfg)

- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_ci_lite(self):
  from evalscope.config import TaskConfig

+ api_key = env.get('DASHSCOPE_API_KEY')
+
  task_cfg = TaskConfig(
  model='qwen-plus',
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
- api_key= env.get('DASHSCOPE_API_KEY'),
- eval_type=EvalType.SERVICE,
+ api_key=api_key,
+ eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
  datasets=[
  'general_mcq',
- 'general_qa',
  'iquiz',
  ],
  dataset_args={
tests/cli/test_collection.py CHANGED
@@ -52,16 +52,19 @@ class TestCollection(unittest.TestCase):
  api_key=env.get('DASHSCOPE_API_KEY'),
  eval_type=EvalType.SERVICE,
  datasets=['data_collection'],
- dataset_args={'data_collection': {
- 'local_path': 'outputs/mixed_data_test.jsonl'
- # 'local_path': 'outputs/weighted_mixed_data.jsonl'
- }},
+ dataset_args={
+ 'data_collection': {
+ # 'local_path': 'outputs/test_mix.jsonl'
+ 'local_path': 'outputs/mixed_data_test.jsonl',
+ 'shuffle': True,
+ }
+ },
  eval_batch_size=5,
  generation_config = {
  'max_tokens': 10000,
  'temperature': 0.0,
  },
- limit=50,
+ limit=10,
  # use_cache='outputs/20250822_161804'
  )
  run_task(task_cfg=task_cfg)