evalscope 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/api/benchmark/adapters/default_data_adapter.py +6 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +141 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/combinator.py +0 -25
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/METADATA +6 -10
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/RECORD +87 -59
- tests/benchmark/test_eval.py +51 -7
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +60 -3
- tests/perf/test_perf.py +40 -12
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
tests/perf/test_perf.py
CHANGED
@@ -16,7 +16,7 @@ class TestPerf(unittest.TestCase):
     def tearDown(self) -> None:
         pass
 
-
+
     def test_run_perf(self):
         task_cfg = {
             'url': 'http://127.0.0.1:8001/v1/chat/completions',
@@ -30,7 +30,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_stream(self):
         task_cfg = {
             'url': 'http://127.0.0.1:8801/v1/chat/completions',
@@ -44,7 +44,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_speed_benchmark(self):
         task_cfg = {
             'url': 'http://127.0.0.1:8001/v1/completions',
@@ -58,7 +58,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_local(self):
         task_cfg = {
             'parallel': 1,
@@ -70,7 +70,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_local_stream(self):
         task_cfg = {
             'parallel': 1,
@@ -83,7 +83,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_local_speed_benchmark(self):
         task_cfg = {
             'parallel': 1,
@@ -96,7 +96,7 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)
 
-
+
     def test_run_perf_local_random(self):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
@@ -119,7 +119,35 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)
 
-
+    def test_run_completion_endpoint(self):
+        if not env.get('DASHSCOPE_API_KEY'):
+            self.skipTest('DASHSCOPE_API_KEY is not set.')
+            return
+
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=[1, 2],
+            number=[2, 4],
+            model='qwen2.5-coder-7b-instruct',
+            url='https://dashscope.aliyuncs.com/compatible-mode/v1/completions',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            api='openai',
+            dataset='random',
+            min_tokens=100,
+            max_tokens=100,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            stream=False,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            extra_args={'ignore_eos': True}
+        )
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)
+
+
     def test_run_perf_multi_parallel(self):
         if not env.get('DASHSCOPE_API_KEY'):
             self.skipTest('DASHSCOPE_API_KEY is not set.')
@@ -129,7 +157,7 @@ class TestPerf(unittest.TestCase):
         task_cfg = Arguments(
             parallel=[1, 2],
             number=[2, 4],
-            model='
+            model='qwen-plus',
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
             api='openai',
@@ -147,7 +175,7 @@ class TestPerf(unittest.TestCase):
         print(metrics_result)
         print(percentile_result)
 
-
+
     def test_run_perf_random_vl(self):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
@@ -157,7 +185,7 @@ class TestPerf(unittest.TestCase):
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
             api='openai',
-            dataset='
+            dataset='random_vl',
             min_tokens=100,
             max_tokens=100,
             prefix_length=0,
@@ -166,7 +194,7 @@ class TestPerf(unittest.TestCase):
             image_height=512,
             image_width=512,
             image_num=2,
-            tokenizer_path='Qwen/Qwen2.5-
+            tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',
             seed=None,
             extra_args={'ignore_eos': True}
         )
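For reference, the new test_run_completion_endpoint case added above amounts to the following standalone script. This is a minimal sketch reconstructed from the diff: the import path of run_perf_benchmark is an assumption (the test module's import lines fall outside the hunks shown), and os.environ stands in for the test file's env helper.

import os

from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark  # assumed import path; not shown in this diff

# Stress-test an OpenAI-compatible /v1/completions endpoint with synthetic prompts,
# pairing parallel levels with request counts as in the existing multi-parallel test.
task_cfg = Arguments(
    parallel=[1, 2],
    number=[2, 4],
    model='qwen2.5-coder-7b-instruct',
    url='https://dashscope.aliyuncs.com/compatible-mode/v1/completions',
    api_key=os.environ.get('DASHSCOPE_API_KEY'),
    api='openai',
    dataset='random',                 # locally generated random prompts
    min_tokens=100,
    max_tokens=100,
    prefix_length=0,
    min_prompt_length=1024,
    max_prompt_length=1024,
    stream=False,
    tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
    seed=None,
    extra_args={'ignore_eos': True},
)

metrics_result, percentile_result = run_perf_benchmark(task_cfg)
print(metrics_result)
print(percentile_result)

Running it requires a valid DASHSCOPE_API_KEY in the environment; run_perf_benchmark returns the two result objects that the test prints.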