evalscope 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff compares the contents of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (87)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +6 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/arguments.py +6 -0
  17. evalscope/benchmarks/ai2d/__init__.py +0 -0
  18. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  19. evalscope/benchmarks/amc/__init__.py +0 -0
  20. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  21. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  22. evalscope/benchmarks/bfcl/bfcl_adapter.py +141 -2
  23. evalscope/benchmarks/bfcl/generation.py +7 -7
  24. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  25. evalscope/benchmarks/healthbench/__init__.py +0 -0
  26. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  27. evalscope/benchmarks/healthbench/utils.py +102 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  29. evalscope/benchmarks/humaneval/utils.py +235 -0
  30. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  32. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  34. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  35. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  36. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  37. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  38. evalscope/benchmarks/mm_star/__init__.py +0 -0
  39. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  40. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  41. evalscope/benchmarks/multi_if/__init__.py +0 -0
  42. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  43. evalscope/benchmarks/multi_if/metrics.py +120 -0
  44. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  45. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  46. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  47. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  48. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  49. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  50. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  51. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  52. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  53. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  54. evalscope/config.py +24 -1
  55. evalscope/constants.py +3 -0
  56. evalscope/evaluator/evaluator.py +25 -7
  57. evalscope/metrics/metric.py +27 -2
  58. evalscope/models/model_apis.py +10 -8
  59. evalscope/models/utils/openai.py +1 -2
  60. evalscope/perf/arguments.py +2 -0
  61. evalscope/perf/plugin/api/base.py +2 -2
  62. evalscope/perf/plugin/api/default_api.py +7 -7
  63. evalscope/perf/plugin/api/openai_api.py +83 -19
  64. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  65. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  66. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  67. evalscope/perf/utils/benchmark_util.py +1 -2
  68. evalscope/report/combinator.py +0 -25
  69. evalscope/report/report.py +8 -4
  70. evalscope/run.py +1 -1
  71. evalscope/utils/function_utils.py +41 -0
  72. evalscope/utils/import_utils.py +63 -13
  73. evalscope/utils/io_utils.py +19 -11
  74. evalscope/utils/json_schema.py +23 -2
  75. evalscope/utils/logger.py +19 -0
  76. evalscope/utils/model_utils.py +1 -1
  77. evalscope/version.py +2 -2
  78. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/METADATA +6 -10
  79. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/RECORD +87 -59
  80. tests/benchmark/test_eval.py +51 -7
  81. tests/benchmark/test_sandbox.py +81 -0
  82. tests/benchmark/test_vlm.py +60 -3
  83. tests/perf/test_perf.py +40 -12
  84. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  85. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  86. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  87. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
tests/perf/test_perf.py CHANGED
@@ -16,7 +16,7 @@ class TestPerf(unittest.TestCase):
     def tearDown(self) -> None:
         pass
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
     def test_run_perf(self):
         task_cfg = {
             'url': 'http://127.0.0.1:8001/v1/chat/completions',
@@ -30,7 +30,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
     def test_run_perf_stream(self):
         task_cfg = {
             'url': 'http://127.0.0.1:8801/v1/chat/completions',
@@ -44,7 +44,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
     def test_run_perf_speed_benchmark(self):
         task_cfg = {
             'url': 'http://127.0.0.1:8001/v1/completions',
@@ -58,7 +58,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
     def test_run_perf_local(self):
         task_cfg = {
             'parallel': 1,
@@ -70,7 +70,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
     def test_run_perf_local_stream(self):
         task_cfg = {
             'parallel': 1,
@@ -83,7 +83,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
     def test_run_perf_local_speed_benchmark(self):
         task_cfg = {
             'parallel': 1,
@@ -96,7 +96,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
     def test_run_perf_local_random(self):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
@@ -119,7 +119,35 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_completion_endpoint(self):
+        if not env.get('DASHSCOPE_API_KEY'):
+            self.skipTest('DASHSCOPE_API_KEY is not set.')
+            return
+
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=[1, 2],
+            number=[2, 4],
+            model='qwen2.5-coder-7b-instruct',
+            url='https://dashscope.aliyuncs.com/compatible-mode/v1/completions',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            api='openai',
+            dataset='random',
+            min_tokens=100,
+            max_tokens=100,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            stream=False,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            extra_args={'ignore_eos': True}
+        )
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)
+
+
     def test_run_perf_multi_parallel(self):
         if not env.get('DASHSCOPE_API_KEY'):
             self.skipTest('DASHSCOPE_API_KEY is not set.')
@@ -129,7 +157,7 @@ class TestPerf(unittest.TestCase):
         task_cfg = Arguments(
             parallel=[1, 2],
             number=[2, 4],
-            model='qwen2.5-7b-instruct',
+            model='qwen-plus',
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
             api='openai',
@@ -147,7 +175,7 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
     def test_run_perf_random_vl(self):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
@@ -157,7 +185,7 @@ class TestPerf(unittest.TestCase):
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
             api='openai',
-            dataset='kontext_bench',
+            dataset='random_vl',
             min_tokens=100,
             max_tokens=100,
             prefix_length=0,
@@ -166,7 +194,7 @@ class TestPerf(unittest.TestCase):
             image_height=512,
             image_width=512,
             image_num=2,
-            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',
             seed=None,
             extra_args={'ignore_eos': True}
         )
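For context, the new test_run_completion_endpoint above drives a /v1/completions endpoint through run_perf_benchmark with an Arguments config. The standalone sketch below mirrors that configuration against a local OpenAI-compatible server instead of DashScope; the import path evalscope.perf.main and the local URL are assumptions, while the argument names and values are taken directly from the diff.

# Standalone sketch (not part of the diff): run the perf benchmark the way the
# new test_run_completion_endpoint does, but against a local OpenAI-compatible
# /v1/completions server. Import path and URL below are assumptions.
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark  # assumed import location

task_cfg = Arguments(
    parallel=[1, 2],                 # concurrency levels to sweep
    number=[2, 4],                   # requests to send at each concurrency level
    model='qwen2.5-coder-7b-instruct',
    url='http://127.0.0.1:8001/v1/completions',  # assumed local endpoint
    api='openai',
    dataset='random',                # synthetic prompts of a fixed token length
    min_prompt_length=1024,
    max_prompt_length=1024,
    prefix_length=0,
    min_tokens=100,
    max_tokens=100,
    stream=False,
    tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
    seed=None,
    extra_args={'ignore_eos': True},
)

metrics_result, percentile_result = run_perf_benchmark(task_cfg)
print(metrics_result)
print(percentile_result)

Passing parallel and number as lists sweeps each concurrency level with its paired request count, matching the multi-parallel style used elsewhere in this test file.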