evalscope 0.12.1__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/arc/arc_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +32 -4
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
- evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +148 -1
- evalscope/benchmarks/super_gpqa/utils.py +0 -5
- evalscope/collections/evaluator.py +4 -4
- evalscope/config.py +11 -3
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +56 -17
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +11 -40
- evalscope/perf/benchmark.py +39 -28
- evalscope/perf/http_client.py +9 -1
- evalscope/perf/main.py +2 -1
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/utils/db_util.py +3 -0
- evalscope/run.py +15 -3
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/METADATA +56 -38
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/RECORD +50 -36
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +27 -1
- tests/cli/test_run.py +103 -11
- tests/perf/test_perf.py +23 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/LICENSE +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/WHEEL +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/top_level.txt +0 -0
evalscope/perf/benchmark.py
CHANGED

@@ -150,39 +150,45 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
         name = args.name if args.name else f'{args.model_id}_{current_time}'
         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
 
-    cursor = con.cursor()
-    create_result_table(cursor)
-    with tqdm(desc='Processing') as pbar:
-        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
-            try:
-                # Attempt to get benchmark data from the queue with a timeout
-                benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-                benchmark_data_queue.task_done()
-            except asyncio.TimeoutError:
-                # If timeout, continue to the next iteration
-                continue
-            con.commit()
-            wandb.log(message)
-            logger.info(msg)
+    collected_benchmark_data = []
+
+    with tqdm(desc='Processing') as pbar:
+        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+            try:
+                # Attempt to get benchmark data from the queue with a timeout
+                benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
+                benchmark_data_queue.task_done()
+            except asyncio.TimeoutError:
+                # If timeout, continue to the next iteration
+                continue
+
+            # Update metrics based on the benchmark data
+            metrics.update_metrics(benchmark_data, api_plugin)
+
+            # Collect benchmark data for later database insertion
+            collected_benchmark_data.append(benchmark_data)
+
+            # Create a message with the updated metrics
+            message = metrics.create_message()
+
+            # Log the message to wandb if the api key is provided
+            if args.wandb_api_key:
+                wandb.log(message)
+
+            # Log the message to the logger every n queries
+            if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                msg = json.dumps(message, ensure_ascii=False, indent=2)
+                logger.info(msg)
+
+            pbar.update(1)  # Update the progress bar
+
+    # Now perform database operations after all benchmark data has been processed
+    with sqlite3.connect(result_db_path) as con:
+        cursor = con.cursor()
+        create_result_table(cursor)
+        for benchmark_data in collected_benchmark_data:
+            insert_benchmark_data(cursor, benchmark_data)
+        con.commit()
 
     return metrics, result_db_path

@@ -199,7 +205,7 @@ async def start_server(args: Arguments) -> bool:
     else:
         args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
 
-    if not await test_connection(args):
+    if (not args.no_test_connection) and (not await test_connection(args)):
         raise TimeoutError('Test connection failed')

@@ -209,9 +215,14 @@ async def benchmark(args: Arguments) -> None:
     loop = asyncio.get_running_loop()
     add_signal_handlers(loop)
 
+    # init queue
     request_queue = asyncio.Queue()
     benchmark_data_queue = asyncio.Queue()
 
+    # reset event
+    query_send_completed_event.clear()
+    data_process_completed_event.clear()
+
     async def create_send_request_tasks():
         tasks: List[asyncio.Task] = []
         for idx in range(args.parallel):
evalscope/perf/http_client.py
CHANGED

@@ -145,7 +145,15 @@ async def test_connection(args: Arguments) -> bool:
     client = AioHttpClient(args)
     async with client:
         if 'chat/completions' in args.url:
-            request = {
+            request = {
+                'messages': [{
+                    'role': 'user',
+                    'content': 'hello'
+                }],
+                'model': args.model,
+                'max_tokens': 10,
+                'stream': args.stream
+            }
         else:
             request = {'prompt': 'hello', 'model': args.model, 'max_tokens': 10}
         async for is_error, state_code, response_data in client.post(request):
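For a quick manual check of an endpoint, the same probe payload that `test_connection` now builds can be sent directly. A sketch using aiohttp, where the URL and model name are placeholders:

```python
import asyncio
import aiohttp


async def probe(url: str, model: str) -> int:
    # Same shape as the chat/completions test request built above.
    payload = {
        'messages': [{'role': 'user', 'content': 'hello'}],
        'model': model,
        'max_tokens': 10,
        'stream': False,
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=payload) as resp:
            return resp.status


if __name__ == '__main__':
    print(asyncio.run(probe('http://127.0.0.1:8000/v1/chat/completions', 'my-model')))
```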
evalscope/perf/main.py
CHANGED

@@ -32,9 +32,10 @@ def run_perf_benchmark(args):
     if platform.system() == 'Windows':
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
 
-    loop = asyncio.
+    loop = asyncio.new_event_loop()
     if platform.system() != 'Windows':
         add_signal_handlers(loop)
+
     loop.run_until_complete(benchmark(args))
evalscope/perf/plugin/datasets/__init__.py
CHANGED

@@ -3,4 +3,5 @@ from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
 from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
 from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
 from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
 from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/openqa.py
CHANGED

@@ -1,5 +1,5 @@
 import json
-import
+import os
 from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments

@@ -18,16 +18,11 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
 
     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
-                'open_qa.jsonl',
-                '--local_dir',
-                './data',
-            ])
-            self.query_parameters.dataset_path = './data/open_qa.jsonl'
+            from modelscope import dataset_snapshot_download
+
+            file_name = 'open_qa.jsonl'
+            local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+            self.query_parameters.dataset_path = os.path.join(local_path, file_name)
 
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
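The change above replaces what appears to be a CLI-style download with a direct ModelScope SDK call. As a standalone sketch of that step, assuming the `modelscope` package is installed and the dataset is reachable:

```python
import os

from modelscope import dataset_snapshot_download

# Mirrors the call added in the diff above.
file_name = 'open_qa.jsonl'
local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
print(os.path.join(local_path, file_name))
```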
evalscope/perf/plugin/datasets/random_dataset.py
ADDED

@@ -0,0 +1,51 @@
+import numpy as np
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('random')
+class RandomDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.'  # noqa: E501
+
+        from modelscope import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
+        self.prefix_length = self.query_parameters.prefix_length
+        self.prefix_ids = self.get_random_inputs(self.prefix_length)
+        self.template_len = self.get_template_len()
+        self.number = self.query_parameters.number or 1
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+        max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+
+        assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
+        assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
+
+        # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1  # noqa: E501
+        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+        for i in range(self.number):
+            prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
+            prompt = self.tokenizer.decode(
+                self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
+            yield [{'role': 'user', 'content': prompt}]
+
+    def get_random_inputs(self, length: int) -> List[int]:
+        if length <= 0:
+            return []
+        input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+        return input_ids
+
+    def get_template_len(self):
+        empty_message = [{'role': 'user', 'content': ''}]
+        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+        return len(template)
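Putting the new plugin together with its registration under the name `random`, a stress test that generates random prompts of a controlled length could be configured roughly as below. This is a sketch: the endpoint, model name, and tokenizer path are placeholders, and it assumes `run_perf_benchmark` accepts a plain dict of `Arguments` fields; the field names mirror the attributes the plugin reads (`tokenizer_path`, `min_prompt_length`, `max_prompt_length`, `prefix_length`, `number`).

```python
from evalscope.perf.main import run_perf_benchmark

# Sketch only: URL, model name, and tokenizer path are placeholders.
task_cfg = {
    'url': 'http://127.0.0.1:8000/v1/chat/completions',
    'api': 'openai',
    'model': 'my-model',
    'dataset': 'random',                     # use the RandomDatasetPlugin registered above
    'tokenizer_path': '/path/to/tokenizer',  # required by the plugin
    'min_prompt_length': 128,
    'max_prompt_length': 1024,
    'prefix_length': 0,
    'number': 20,
    'parallel': 2,
}
run_perf_benchmark(task_cfg)
```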
evalscope/perf/utils/db_util.py
CHANGED

@@ -2,6 +2,7 @@ import base64
 import json
 import os
 import pickle
+import re
 import sqlite3
 import sys
 from datetime import datetime

@@ -91,6 +92,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
     output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    # Filter illegal characters
+    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
     if not os.path.exists(output_path):
         os.makedirs(output_path, exist_ok=True)
     logger.info(f'Save the result to: {output_path}')
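The filter added in `get_output_path` only rewrites characters that are illegal in Windows file names; path separators are untouched, so the directory layout is preserved. A quick illustration, using a made-up model id:

```python
import re

output_path = 'outputs/20250320_123456/Qwen2.5-7B-Instruct:awq'  # made-up model id
print(re.sub(r'[<>:"|?*]', '_', output_path))
# -> outputs/20250320_123456/Qwen2.5-7B-Instruct_awq
```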
evalscope/run.py
CHANGED

@@ -39,9 +39,11 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
 
     if task_cfg.eval_backend != EvalBackend.NATIVE:
+        result = run_non_native_backend(task_cfg, outputs)
     else:
+        result = evaluate_model(task_cfg, outputs)
+
+    return result
 
 
 def setup_work_directory(task_cfg: TaskConfig, run_time: str):

@@ -117,6 +119,16 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         res_dict = evaluator.eval()
         eval_results[evaluator.dataset_name] = res_dict
 
+    # Clean up
+    if base_model is not None:
+        import gc
+        import torch
+
+        del base_model
+        del evaluators
+        torch.cuda.empty_cache()
+        gc.collect()
+
     return eval_results

@@ -132,7 +144,7 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
         data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-        return EvaluatorCollection(task_cfg, data_adapter, outputs)
+        return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)
 
     # Initialize model adapter
     model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
evalscope/third_party/longbench_write/infer.py
CHANGED

@@ -8,7 +8,7 @@ import random
 import torch
 from typing import List
 
-from evalscope.
+from evalscope.third_party.longbench_write.tools.openai_api import OpenaiApi
 from evalscope.third_party.longbench_write.utils import count_words
 from evalscope.utils import get_logger
{evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.12.1
+Version: 0.13.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team

@@ -175,16 +175,29 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
 
 ## 📋 Contents
-- [
-- [
-- [
-- [
+- [📋 Contents](#-contents)
+- [📝 Introduction](#-introduction)
+- [☎ User Groups](#-user-groups)
+- [🎉 News](#-news)
+- [🛠️ Installation](#️-installation)
+  - [Method 1: Install Using pip](#method-1-install-using-pip)
+  - [Method 2: Install from Source](#method-2-install-from-source)
+- [🚀 Quick Start](#-quick-start)
+  - [Method 1. Using Command Line](#method-1-using-command-line)
+  - [Method 2. Using Python Code](#method-2-using-python-code)
+  - [Basic Parameter](#basic-parameter)
+  - [Output Results](#output-results)
+- [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
+- [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+- [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
+  - [Parameter](#parameter)
 - [Evaluation Backend](#evaluation-backend)
-- [
-- [
-- [Arena Mode](
-- [Contribution](#️-contribution)
-- [Roadmap](#-roadmap)
+- [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+- [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+- [🏟️ Arena Mode](#️-arena-mode)
+- [👷♂️ Contribution](#️-contribution)
+- [🔜 Roadmap](#-roadmap)
+- [Star History](#star-history)
 
 
 ## 📝 Introduction

@@ -226,6 +239,9 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+- 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench; refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
+- 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks, which assess the factual accuracy of models; specify `simple_qa` and `chinese_simpleqa` to use them. Specifying a judge model is also supported. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
 - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
 - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
 - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!

@@ -262,23 +278,24 @@ Please scan the QR code below to join our community groups:
 We recommend using conda to manage your environment and installing dependencies with pip:
 
 1. Create a conda environment (optional)
+```shell
+# It is recommended to use Python 3.10
+conda create -n evalscope python=3.10
+# Activate the conda environment
+conda activate evalscope
+```
 
 2. Install dependencies using pip
+```shell
+pip install evalscope                 # Install Native backend (default)
+# Additional options
+pip install 'evalscope[opencompass]'  # Install OpenCompass backend
+pip install 'evalscope[vlmeval]'      # Install VLMEvalKit backend
+pip install 'evalscope[rag]'          # Install RAGEval backend
+pip install 'evalscope[perf]'         # Install dependencies for the model performance testing module
+pip install 'evalscope[app]'          # Install dependencies for visualization
+pip install 'evalscope[all]'          # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```
 
 > [!WARNING]
 > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:

@@ -292,21 +309,22 @@ We recommend using conda to manage your environment and installing dependencies
 
 ### Method 2: Install from Source
 1. Download the source code
+```shell
+git clone https://github.com/modelscope/evalscope.git
+```
 
 2. Install dependencies
+```shell
+cd evalscope/
+pip install -e .                  # Install Native backend
+# Additional options
+pip install -e '.[opencompass]'   # Install OpenCompass backend
+pip install -e '.[vlmeval]'       # Install VLMEvalKit backend
+pip install -e '.[rag]'           # Install RAGEval backend
+pip install -e '.[perf]'          # Install Perf dependencies
+pip install -e '.[app]'           # Install visualization dependencies
+pip install -e '.[all]'           # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```
 
 
 ## 🚀 Quick Start