evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/data_adapter.py +9 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/utils.py +1 -0
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +5 -3
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/completion_parsers.py +7 -0
- evalscope/metrics/llm_judge.py +6 -5
- evalscope/metrics/metrics.py +19 -7
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +2 -0
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +13 -0
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +2 -2
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/utils/io_utils.py +10 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
- tests/cli/test_all.py +18 -2
- tests/cli/test_run.py +25 -37
- tests/perf/test_perf.py +29 -2
- evalscope/models/model.py +0 -189
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/db_util.py
CHANGED
@@ -16,6 +16,28 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()


+class DatabaseColumns:
+    REQUEST = 'request'
+    START_TIME = 'start_time'
+    CHUNK_TIMES = 'chunk_times'
+    SUCCESS = 'success'
+    RESPONSE_MESSAGES = 'response_messages'
+    COMPLETED_TIME = 'completed_time'
+    LATENCY = 'latency'
+    FIRST_CHUNK_LATENCY = 'first_chunk_latency'
+    PROMPT_TOKENS = 'prompt_tokens'
+    COMPLETION_TOKENS = 'completion_tokens'
+    MAX_GPU_MEMORY_COST = 'max_gpu_memory_cost'
+    TIME_PER_OUTPUT_TOKEN = 'time_per_output_token'
+
+
+def load_prompt(prompt_path_or_text):
+    if prompt_path_or_text.startswith('@'):
+        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
+            return file.read()
+    return prompt_path_or_text
+
+
 def encode_data(data) -> str:
     """Encodes data using base64 and pickle."""
     return base64.b64encode(pickle.dumps(data)).decode('utf-8')
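The `load_prompt` helper added above accepts either literal prompt text or, when the value is prefixed with `@`, a path to a file whose contents are used as the prompt. A minimal usage sketch (the file name below is hypothetical):

```python
from evalscope.perf.utils.db_util import load_prompt

# Literal text is returned unchanged.
system_prompt = load_prompt('You are a helpful assistant.')

# A leading '@' means "treat the rest as a path and read the prompt from that file".
# Assumes a local file named system_prompt.txt exists.
system_prompt_from_file = load_prompt('@system_prompt.txt')
```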
@@ -34,20 +56,20 @@ def transpose_results(data):


 def create_result_table(cursor):
-    cursor.execute('''CREATE TABLE IF NOT EXISTS result(
+    cursor.execute(f'''CREATE TABLE IF NOT EXISTS result(
+        {DatabaseColumns.REQUEST} TEXT,
+        {DatabaseColumns.START_TIME} REAL,
+        {DatabaseColumns.CHUNK_TIMES} TEXT,
+        {DatabaseColumns.SUCCESS} INTEGER,
+        {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
+        {DatabaseColumns.COMPLETED_TIME} REAL,
+        {DatabaseColumns.LATENCY} REAL,
+        {DatabaseColumns.FIRST_CHUNK_LATENCY} REAL,
+        {DatabaseColumns.PROMPT_TOKENS} INTEGER,
+        {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
+        {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
+        {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
+        )''')


 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
@@ -67,24 +89,21 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)

     if benchmark_data.success:
         # Add additional columns for success case
-        additional_columns = (
-            request, start_time, chunk_times, success, response_messages,
-            completed_time, latency, first_chunk_latency,
-            n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
-        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+        additional_columns = (benchmark_data.query_latency, benchmark_data.first_chunk_latency,
+                              benchmark_data.prompt_tokens, benchmark_data.completion_tokens,
+                              benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token)
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
+            {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
+            {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
+            {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns + additional_columns)
     else:
-        query = """INSERT INTO result(
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
         ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)

@@ -160,44 +179,43 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
             logger.error(f'Error parsing chunk times: {e}')
             return []

+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+                    {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
+                    {DatabaseColumns.PROMPT_TOKENS},
+                    {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+                    FROM result WHERE {DatabaseColumns.SUCCESS}=1'''

     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

     with sqlite3.connect(result_db_path) as con:
+        cursor = con.cursor()
+        cursor.execute(query_sql)
+        columns = [description[0] for description in cursor.description]
+        rows = cursor.fetchall()

-        LATENCY_INDEX = 4
-        FIRST_CHUNK_LATENCY_INDEX = 5
-        CHUNK_TIME_INDEX = 7
-        PROMPT_TOKENS_INDEX = 8
-        COMPLETION_TOKENS_INDEX = 9
+        # Create column index mapping
+        col_indices = {col: idx for idx, col in enumerate(columns)}

         # Prepare data for each metric
         inter_token_latencies_all = []
         for row in rows:
+            inter_token_latencies_all.extend(inter_token_latencies(row[col_indices[DatabaseColumns.CHUNK_TIMES]]))

         metrics = {
+            PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
             PercentileMetrics.ITL:
                 inter_token_latencies_all,
-            PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
-            PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+            PercentileMetrics.TPOT: [row[col_indices[DatabaseColumns.TIME_PER_OUTPUT_TOKEN]] for row in rows],
+            PercentileMetrics.LATENCY: [row[col_indices[DatabaseColumns.LATENCY]] for row in rows],
+            PercentileMetrics.INPUT_TOKENS: [row[col_indices[DatabaseColumns.PROMPT_TOKENS]] for row in rows],
+            PercentileMetrics.OUTPUT_TOKENS: [row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] for row in rows],
             PercentileMetrics.OUTPUT_THROUGHPUT:
+                [(row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] / row[col_indices[DatabaseColumns.LATENCY]])
+                 if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows],
+            PercentileMetrics.TOTAL_THROUGHPUT:
+                [((row[col_indices[DatabaseColumns.PROMPT_TOKENS]] + row[col_indices[DatabaseColumns.COMPLETION_TOKENS]])
+                  / row[col_indices[DatabaseColumns.LATENCY]])
+                 if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows]
         }

         # Calculate percentiles for each metric
@@ -237,18 +255,18 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):


 def speed_benchmark_result(result_db_path: str):
-    query_sql = """
+    query_sql = f"""
         SELECT
+            {DatabaseColumns.PROMPT_TOKENS},
+            ROUND(AVG({DatabaseColumns.COMPLETION_TOKENS} / {DatabaseColumns.LATENCY}), 2) AS avg_completion_token_per_second,
+            ROUND(AVG({DatabaseColumns.MAX_GPU_MEMORY_COST}), 2)
         FROM
             result
         WHERE
+            {DatabaseColumns.SUCCESS} = 1 AND {DatabaseColumns.LATENCY} > 0
         GROUP BY
-    """
+            {DatabaseColumns.PROMPT_TOKENS}
+    """  # noqa: E501

     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
evalscope/utils/io_utils.py
CHANGED
@@ -1,3 +1,4 @@
+import base64
 import csv
 import hashlib
 import json
@@ -5,6 +6,8 @@ import jsonlines as jsonl
 import os
 import re
 import yaml
+from io import BytesIO
+from PIL import Image

 from evalscope.constants import DumpMode
 from evalscope.utils.logger import get_logger
@@ -266,3 +269,10 @@ def get_valid_list(input_list, candidate_list):
     """
     return [i for i in input_list if i in candidate_list], \
         [i for i in input_list if i not in candidate_list]
+
+
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+    buffered = BytesIO()
+    image.save(buffered, format=format)
+    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    return img_str
{evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA
CHANGED
@@ -1,19 +1,20 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.17.0
+Version: 0.17.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
 Author-email: contact@modelscope.cn
+License: Apache License 2.0
 Keywords: python,llm,evaluation
 Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: accelerate
@@ -94,7 +95,7 @@ Requires-Dist: rich; extra == "all"
 Requires-Dist: sse-starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: uvicorn; extra == "all"
-Requires-Dist: gradio
+Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
@@ -103,19 +104,26 @@ Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 Requires-Dist: torchvision; extra == "all"
 Requires-Dist: bfcl-eval; extra == "all"
-Requires-Dist: dotenv; extra == "all"
 Requires-Dist: human-eval; extra == "all"
 Requires-Dist: pytest; extra == "all"
 Requires-Dist: pytest-cov; extra == "all"
+Requires-Dist: python-dotenv; extra == "all"
 Provides-Extra: app
-Requires-Dist: gradio
+Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: dev
 Requires-Dist: bfcl-eval; extra == "dev"
-Requires-Dist: dotenv; extra == "dev"
 Requires-Dist: human-eval; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: python-dotenv; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: docutils>=0.16.0; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Requires-Dist: recommonmark; extra == "docs"
+Requires-Dist: sphinx>=5.3.0; extra == "docs"
+Requires-Dist: sphinx-design; extra == "docs"
+Requires-Dist: sphinxawesome-theme; extra == "docs"
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
 Provides-Extra: perf
@@ -176,16 +184,17 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 - [Basic Parameter](#basic-parameter)
 - [Output Results](#output-results)
 - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
+- [🌐 Evaluation of Model API](#-evaluation-of-model-api)
 - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
-- [Parameter](#parameter)
+- [Parameter Description](#parameter-description)
+- [🧪 Other Evaluation Backends](#-other-evaluation-backends)
 - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
 - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+- [⚔️ Arena Mode](#️-arena-mode)
 - [👷♂️ Contribution](#️-contribution)
+- [📚 Citation](#-citation)
 - [🔜 Roadmap](#-roadmap)
+- [⭐ Star History](#-star-history)


 ## 📝 Introduction
@@ -249,7 +258,9 @@ Please scan the QR code below to join our community groups:


 ## 🎉 News
+- 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+- 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+- 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
 - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
 - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
 - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -261,6 +272,8 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+<details><summary>More</summary>
+
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -270,8 +283,6 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
 - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
 - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
-<details><summary>More</summary>
-
 - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -367,33 +378,31 @@ evalscope eval \

 When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

+**Using `TaskConfig`**

 ```python
+from evalscope import run_task, TaskConfig

+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)

 run_task(task_cfg=task_cfg)
 ```
 <details><summary>More Startup Methods</summary>

+**Using Python Dictionary**

 ```python
 from evalscope.run import run_task
-from evalscope.config import TaskConfig

+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}

 run_task(task_cfg=task_cfg)
 ```
@@ -496,7 +505,7 @@ To create a public link, set `share=True` in `launch()`.

 For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

+## 🌐 Evaluation of Model API

 Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:

@@ -547,7 +556,7 @@ evalscope eval \
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


+## 🧪 Other Evaluation Backends
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
 - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -620,6 +629,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </table>
 </a>

+## 📚 Citation
+
+```bibtex
+@misc{evalscope_2024,
+    title={{EvalScope}: Evaluation Framework for Large Models},
+    author={ModelScope Team},
+    year={2024},
+    url={https://github.com/modelscope/evalscope}
+}
+```
+
 ## 🔜 Roadmap
 - [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
@@ -635,6 +655,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] MBPP


-## Star History
+## ⭐ Star History

 [](https://star-history.com/#modelscope/evalscope&Date)