evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (66)
  1. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  2. evalscope/benchmarks/data_adapter.py +9 -4
  3. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
  5. evalscope/benchmarks/hle/__init__.py +0 -0
  6. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  8. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  9. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  10. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  11. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  12. evalscope/benchmarks/utils.py +1 -0
  13. evalscope/constants.py +5 -21
  14. evalscope/evaluator/__init__.py +1 -1
  15. evalscope/evaluator/evaluator.py +5 -3
  16. evalscope/metrics/__init__.py +3 -1
  17. evalscope/metrics/completion_parsers.py +7 -0
  18. evalscope/metrics/llm_judge.py +6 -5
  19. evalscope/metrics/metrics.py +19 -7
  20. evalscope/models/__init__.py +4 -8
  21. evalscope/models/adapters/__init__.py +4 -9
  22. evalscope/models/adapters/base_adapter.py +4 -0
  23. evalscope/models/adapters/bfcl_adapter.py +2 -0
  24. evalscope/models/adapters/chat_adapter.py +3 -0
  25. evalscope/models/adapters/choice_adapter.py +4 -0
  26. evalscope/models/adapters/custom_adapter.py +7 -3
  27. evalscope/models/adapters/server_adapter.py +2 -0
  28. evalscope/models/adapters/t2i_adapter.py +3 -0
  29. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  30. evalscope/models/register.py +0 -14
  31. evalscope/perf/arguments.py +13 -0
  32. evalscope/perf/benchmark.py +38 -39
  33. evalscope/perf/http_client.py +30 -86
  34. evalscope/perf/main.py +2 -2
  35. evalscope/perf/plugin/__init__.py +3 -2
  36. evalscope/perf/plugin/api/__init__.py +4 -3
  37. evalscope/perf/plugin/api/base.py +22 -4
  38. evalscope/perf/plugin/api/custom_api.py +212 -55
  39. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  40. evalscope/perf/plugin/api/default_api.py +105 -0
  41. evalscope/perf/plugin/api/openai_api.py +17 -19
  42. evalscope/perf/plugin/datasets/__init__.py +10 -7
  43. evalscope/perf/plugin/datasets/base.py +22 -1
  44. evalscope/perf/plugin/datasets/custom.py +2 -1
  45. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  46. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  47. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  48. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  49. evalscope/perf/plugin/datasets/openqa.py +2 -1
  50. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  51. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  52. evalscope/perf/plugin/registry.py +36 -16
  53. evalscope/perf/utils/benchmark_util.py +14 -20
  54. evalscope/perf/utils/db_util.py +79 -61
  55. evalscope/utils/io_utils.py +10 -0
  56. evalscope/version.py +2 -2
  57. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
  58. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
  59. tests/cli/test_all.py +18 -2
  60. tests/cli/test_run.py +25 -37
  61. tests/perf/test_perf.py +29 -2
  62. evalscope/models/model.py +0 -189
  63. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  64. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  65. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  66. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/db_util.py CHANGED
@@ -16,6 +16,28 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()
 
 
+class DatabaseColumns:
+    REQUEST = 'request'
+    START_TIME = 'start_time'
+    CHUNK_TIMES = 'chunk_times'
+    SUCCESS = 'success'
+    RESPONSE_MESSAGES = 'response_messages'
+    COMPLETED_TIME = 'completed_time'
+    LATENCY = 'latency'
+    FIRST_CHUNK_LATENCY = 'first_chunk_latency'
+    PROMPT_TOKENS = 'prompt_tokens'
+    COMPLETION_TOKENS = 'completion_tokens'
+    MAX_GPU_MEMORY_COST = 'max_gpu_memory_cost'
+    TIME_PER_OUTPUT_TOKEN = 'time_per_output_token'
+
+
+def load_prompt(prompt_path_or_text):
+    if prompt_path_or_text.startswith('@'):
+        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
+            return file.read()
+    return prompt_path_or_text
+
+
 def encode_data(data) -> str:
     """Encodes data using base64 and pickle."""
     return base64.b64encode(pickle.dumps(data)).decode('utf-8')
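For context, a self-contained sketch of the `@`-file convention the new `load_prompt` helper implements (the helper body is copied from the hunk above so the snippet runs on its own; the prompt file name is hypothetical):

```python
# Self-contained sketch of the '@file' convention used by load_prompt above.
def load_prompt(prompt_path_or_text):
    if prompt_path_or_text.startswith('@'):
        # '@' prefix: treat the remainder as a file path and read the prompt from it
        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
            return file.read()
    # otherwise the argument is the literal prompt text
    return prompt_path_or_text


print(load_prompt('You are a helpful assistant.'))  # literal text, returned unchanged
# print(load_prompt('@system_prompt.txt'))          # would read the prompt from this (hypothetical) file
```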
@@ -34,20 +56,20 @@ def transpose_results(data):
 
 
 def create_result_table(cursor):
-    cursor.execute('''CREATE TABLE IF NOT EXISTS result(
-        request TEXT,
-        start_time REAL,
-        chunk_times TEXT,
-        success INTEGER,
-        response_messages TEXT,
-        completed_time REAL,
-        latency REAL,
-        first_chunk_latency REAL,
-        n_chunks INTEGER,
-        chunk_time REAL,
-        prompt_tokens INTEGER,
-        completion_tokens INTEGER,
-        max_gpu_memory_cost REAL)''')
+    cursor.execute(f'''CREATE TABLE IF NOT EXISTS result(
+        {DatabaseColumns.REQUEST} TEXT,
+        {DatabaseColumns.START_TIME} REAL,
+        {DatabaseColumns.CHUNK_TIMES} TEXT,
+        {DatabaseColumns.SUCCESS} INTEGER,
+        {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
+        {DatabaseColumns.COMPLETED_TIME} REAL,
+        {DatabaseColumns.LATENCY} REAL,
+        {DatabaseColumns.FIRST_CHUNK_LATENCY} REAL,
+        {DatabaseColumns.PROMPT_TOKENS} INTEGER,
+        {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
+        {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
+        {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
+    )''')
 
 
 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
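As an illustration of the pattern this hunk introduces, here is a standalone sketch (illustrative names only, not evalscope code): deriving the schema, inserts, and selects from one set of column-name constants keeps them from drifting apart.

```python
# Standalone sketch: a single source of truth for column names, reused by
# CREATE TABLE, INSERT and SELECT so the three cannot drift out of sync.
import sqlite3


class Columns:
    REQUEST = 'request'
    LATENCY = 'latency'
    COMPLETION_TOKENS = 'completion_tokens'


with sqlite3.connect(':memory:') as con:
    con.execute(f'''CREATE TABLE IF NOT EXISTS result(
        {Columns.REQUEST} TEXT,
        {Columns.LATENCY} REAL,
        {Columns.COMPLETION_TOKENS} INTEGER)''')
    con.execute(
        f'INSERT INTO result({Columns.REQUEST}, {Columns.LATENCY}, {Columns.COMPLETION_TOKENS}) VALUES (?, ?, ?)',
        ('hello', 0.42, 128))
    print(con.execute(f'SELECT {Columns.COMPLETION_TOKENS} FROM result').fetchall())  # [(128,)]
```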
@@ -67,24 +89,21 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 
     if benchmark_data.success:
         # Add additional columns for success case
-        additional_columns = (
-            benchmark_data.query_latency,
-            benchmark_data.first_chunk_latency,
-            benchmark_data.n_chunks,
-            benchmark_data.n_chunks_time,
-            benchmark_data.prompt_tokens,
-            benchmark_data.completion_tokens,
-            benchmark_data.max_gpu_memory_cost,
-        )
-        query = """INSERT INTO result(
-            request, start_time, chunk_times, success, response_messages,
-            completed_time, latency, first_chunk_latency,
-            n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
-        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+        additional_columns = (benchmark_data.query_latency, benchmark_data.first_chunk_latency,
+                              benchmark_data.prompt_tokens, benchmark_data.completion_tokens,
+                              benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token)
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
+            {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
+            {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
+            {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns + additional_columns)
     else:
-        query = """INSERT INTO result(
-            request, start_time, chunk_times, success, response_messages, completed_time
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
         ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)
 
@@ -160,44 +179,43 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
             logger.error(f'Error parsing chunk times: {e}')
             return []
 
-    query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
-                 'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1')
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+        {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
+        {DatabaseColumns.PROMPT_TOKENS},
+        {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
 
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]
 
     with sqlite3.connect(result_db_path) as con:
-        rows = con.execute(query_sql).fetchall()
+        cursor = con.cursor()
+        cursor.execute(query_sql)
+        columns = [description[0] for description in cursor.description]
+        rows = cursor.fetchall()
 
-        # Define index variables for columns
-        CHUNK_TIMES_INDEX = 1
-        LATENCY_INDEX = 4
-        FIRST_CHUNK_LATENCY_INDEX = 5
-        CHUNK_TIME_INDEX = 7
-        PROMPT_TOKENS_INDEX = 8
-        COMPLETION_TOKENS_INDEX = 9
+        # Create column index mapping
+        col_indices = {col: idx for idx, col in enumerate(columns)}
 
         # Prepare data for each metric
         inter_token_latencies_all = []
         for row in rows:
-            inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+            inter_token_latencies_all.extend(inter_token_latencies(row[col_indices[DatabaseColumns.CHUNK_TIMES]]))
 
         metrics = {
-            PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+            PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
             PercentileMetrics.ITL:
                 inter_token_latencies_all,
-            PercentileMetrics.TPOT:
-                [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
-                 for row in rows],
-            PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
-            PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
-            PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+            PercentileMetrics.TPOT: [row[col_indices[DatabaseColumns.TIME_PER_OUTPUT_TOKEN]] for row in rows],
+            PercentileMetrics.LATENCY: [row[col_indices[DatabaseColumns.LATENCY]] for row in rows],
+            PercentileMetrics.INPUT_TOKENS: [row[col_indices[DatabaseColumns.PROMPT_TOKENS]] for row in rows],
+            PercentileMetrics.OUTPUT_TOKENS: [row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] for row in rows],
             PercentileMetrics.OUTPUT_THROUGHPUT:
-                [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
-                 for row in rows],
-            PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
-                                                  / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
-                                                 for row in rows]
+                [(row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] / row[col_indices[DatabaseColumns.LATENCY]])
+                 if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows],
+            PercentileMetrics.TOTAL_THROUGHPUT:
+                [((row[col_indices[DatabaseColumns.PROMPT_TOKENS]] + row[col_indices[DatabaseColumns.COMPLETION_TOKENS]])
+                  / row[col_indices[DatabaseColumns.LATENCY]])
+                 if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows]
         }
 
         # Calculate percentiles for each metric
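The key change in this hunk is looking columns up by name via `cursor.description` instead of hard-coded positional indices. A standalone sketch of that pattern (illustrative table and values, not evalscope code):

```python
# Column positions are derived from cursor.description, so reordering the
# SELECT list cannot silently break the metric extraction.
import sqlite3

with sqlite3.connect(':memory:') as con:
    con.execute('CREATE TABLE result(latency REAL, completion_tokens INTEGER)')
    con.execute('INSERT INTO result VALUES (2.0, 100)')
    cursor = con.cursor()
    cursor.execute('SELECT latency, completion_tokens FROM result')
    col_indices = {desc[0]: idx for idx, desc in enumerate(cursor.description)}
    rows = cursor.fetchall()
    throughput = [row[col_indices['completion_tokens']] / row[col_indices['latency']]
                  if row[col_indices['latency']] > 0 else float('nan') for row in rows]
    print(throughput)  # [50.0]
```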
@@ -237,18 +255,18 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
 
 
 def speed_benchmark_result(result_db_path: str):
-    query_sql = """
+    query_sql = f"""
         SELECT
-            prompt_tokens,
-            ROUND(AVG(completion_tokens / latency), 2) AS avg_completion_token_per_second,
-            ROUND(AVG(max_gpu_memory_cost), 2)
+            {DatabaseColumns.PROMPT_TOKENS},
+            ROUND(AVG({DatabaseColumns.COMPLETION_TOKENS} / {DatabaseColumns.LATENCY}), 2) AS avg_completion_token_per_second,
+            ROUND(AVG({DatabaseColumns.MAX_GPU_MEMORY_COST}), 2)
         FROM
             result
         WHERE
-            success = 1 AND latency > 0
+            {DatabaseColumns.SUCCESS} = 1 AND {DatabaseColumns.LATENCY} > 0
         GROUP BY
-            prompt_tokens
-    """
+            {DatabaseColumns.PROMPT_TOKENS}
+    """  # noqa: E501
 
     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
evalscope/utils/io_utils.py CHANGED
@@ -1,3 +1,4 @@
+import base64
 import csv
 import hashlib
 import json
@@ -5,6 +6,8 @@ import jsonlines as jsonl
 import os
 import re
 import yaml
+from io import BytesIO
+from PIL import Image
 
 from evalscope.constants import DumpMode
 from evalscope.utils.logger import get_logger
@@ -266,3 +269,10 @@ def get_valid_list(input_list, candidate_list):
     """
     return [i for i in input_list if i in candidate_list], \
            [i for i in input_list if i not in candidate_list]
+
+
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+    buffered = BytesIO()
+    image.save(buffered, format=format)
+    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    return img_str
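A hedged sketch of how an image encoded with the new `PIL_to_base64` helper can be embedded in an OpenAI-style multimodal message (the helper body is copied from the hunk above so the snippet runs on its own; the data-URL message layout is the common OpenAI chat format, not something this diff defines):

```python
import base64
from io import BytesIO

from PIL import Image


def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
    # copied from the hunk above for a self-contained example
    buffered = BytesIO()
    image.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


image = Image.new('RGB', (64, 64), color=(120, 30, 30))  # dummy image for illustration
message = {
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'Describe this image.'},
        {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{PIL_to_base64(image)}'}},
    ],
}
print(message['content'][1]['image_url']['url'][:48] + '...')
```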
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.17.0'
-__release_datetime__ = '2025-07-04 17:00:00'
+__version__ = '0.17.1'
+__release_datetime__ = '2025-07-18 17:00:00'
{evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA CHANGED
@@ -1,19 +1,20 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.17.0
+Version: 0.17.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
 Author-email: contact@modelscope.cn
+License: Apache License 2.0
 Keywords: python,llm,evaluation
 Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
-Requires-Python: >=3.8
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: accelerate
@@ -94,7 +95,7 @@ Requires-Dist: rich; extra == "all"
 Requires-Dist: sse-starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: uvicorn; extra == "all"
-Requires-Dist: gradio>=5.4.0; extra == "all"
+Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
@@ -103,19 +104,26 @@ Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 Requires-Dist: torchvision; extra == "all"
 Requires-Dist: bfcl-eval; extra == "all"
-Requires-Dist: dotenv; extra == "all"
 Requires-Dist: human-eval; extra == "all"
 Requires-Dist: pytest; extra == "all"
 Requires-Dist: pytest-cov; extra == "all"
+Requires-Dist: python-dotenv; extra == "all"
 Provides-Extra: app
-Requires-Dist: gradio>=5.4.0; extra == "app"
+Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: dev
 Requires-Dist: bfcl-eval; extra == "dev"
-Requires-Dist: dotenv; extra == "dev"
 Requires-Dist: human-eval; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: python-dotenv; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: docutils>=0.16.0; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Requires-Dist: recommonmark; extra == "docs"
+Requires-Dist: sphinx>=5.3.0; extra == "docs"
+Requires-Dist: sphinx-design; extra == "docs"
+Requires-Dist: sphinxawesome-theme; extra == "docs"
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
 Provides-Extra: perf
@@ -176,16 +184,17 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 - [Basic Parameter](#basic-parameter)
 - [Output Results](#output-results)
 - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
-- [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+- [🌐 Evaluation of Model API](#-evaluation-of-model-api)
 - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
-- [Parameter](#parameter)
-- [Evaluation Backend](#evaluation-backend)
+- [Parameter Description](#parameter-description)
+- [🧪 Other Evaluation Backends](#-other-evaluation-backends)
 - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
 - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
-- [🏟️ Arena Mode](#️-arena-mode)
+- [⚔️ Arena Mode](#️-arena-mode)
 - [👷‍♂️ Contribution](#️-contribution)
+- [📚 Citation](#-citation)
 - [🔜 Roadmap](#-roadmap)
-- [Star History](#star-history)
+- [Star History](#-star-history)
 
 
 ## 📝 Introduction
@@ -249,7 +258,9 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
-
+- 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+- 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+- 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
 - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
 - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
 - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -261,6 +272,8 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+<details><summary>More</summary>
+
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -270,8 +283,6 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
 - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
 - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
-<details><summary>More</summary>
-
 - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -367,33 +378,31 @@ evalscope eval \
 
 When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:
 
-**Using Python Dictionary**
+**Using `TaskConfig`**
 
 ```python
-from evalscope.run import run_task
+from evalscope import run_task, TaskConfig
 
-task_cfg = {
-    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-    'datasets': ['gsm8k', 'arc'],
-    'limit': 5
-}
+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)
 
 run_task(task_cfg=task_cfg)
 ```
-
 <details><summary>More Startup Methods</summary>
 
-**Using `TaskConfig`**
+**Using Python Dictionary**
 
 ```python
 from evalscope.run import run_task
-from evalscope.config import TaskConfig
 
-task_cfg = TaskConfig(
-    model='Qwen/Qwen2.5-0.5B-Instruct',
-    datasets=['gsm8k', 'arc'],
-    limit=5
-)
+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}
 
 run_task(task_cfg=task_cfg)
 ```
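The paragraph in this hunk also notes that `run_task` accepts a YAML or JSON task file; a minimal hedged sketch (the file path is hypothetical, the import follows the new `from evalscope import run_task` form shown above):

```python
# Hedged sketch: run_task also accepts a task file path per the README text.
# 'my_task.yaml' is a hypothetical file containing keys such as model, datasets, limit.
from evalscope import run_task

run_task(task_cfg='my_task.yaml')
```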
@@ -496,7 +505,7 @@ To create a public link, set `share=True` in `launch()`.
 
 For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
 
-## 🌐 Evaluation of Specified Model API
+## 🌐 Evaluation of Model API
 
 Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
 
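A hedged Python sketch of the service-evaluation setup the paragraph above describes (the parameter names `api_url`, `api_key`, and `eval_type` are taken from that description and assumed to map onto `TaskConfig` fields; model name and endpoint are examples only):

```python
# Hedged sketch: evaluating an already-deployed OpenAI-compatible endpoint
# by setting eval_type to 'service'. Values below are illustrative.
from evalscope import run_task, TaskConfig

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',         # model name served by the endpoint (example)
    api_url='http://127.0.0.1:8801/v1',  # OpenAI-compatible service address (example)
    api_key='EMPTY',                     # or a real key for hosted services
    eval_type='service',
    datasets=['gsm8k'],
    limit=5,
)
run_task(task_cfg=task_cfg)
```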
@@ -547,7 +556,7 @@ evalscope eval \
 
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
 
-## Evaluation Backend
+## 🧪 Other Evaluation Backends
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
 - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -620,6 +629,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </table>
 </a>
 
+## 📚 Citation
+
+```bibtex
+@misc{evalscope_2024,
+    title={{EvalScope}: Evaluation Framework for Large Models},
+    author={ModelScope Team},
+    year={2024},
+    url={https://github.com/modelscope/evalscope}
+}
+```
+
 ## 🔜 Roadmap
 - [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
@@ -635,6 +655,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] MBPP
 
 
-## Star History
+## Star History
 
 [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)