evalscope 0.13.0__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/collections/evaluator.py +1 -1
- evalscope/config.py +5 -2
- evalscope/constants.py +1 -0
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +11 -40
- evalscope/perf/benchmark.py +34 -28
- evalscope/perf/main.py +1 -1
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/utils/db_util.py +3 -0
- evalscope/run.py +14 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/METADATA +33 -30
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/RECORD +22 -21
- tests/cli/test_run.py +41 -11
- tests/perf/test_perf.py +23 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/LICENSE +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/WHEEL +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py
CHANGED
@@ -181,7 +181,7 @@ class EvaluatorCollection:
         answers_list = jsonl_to_list(pred_file_path)
         indices = set()
         for answer in answers_list:
-            index = answer
+            index = answer.get(AnswerKeys.INDEX)
             answer_dict[index] = answer
             indices.add(index)
         data = []
evalscope/config.py
CHANGED
@@ -81,7 +81,7 @@ class TaskConfig:
     def __post_init__(self):
         if (not self.model_id) and self.model:
             if isinstance(self.model, CustomModel):
-                self.model_id =
+                self.model_id = self.model.config.get('model_id', 'custom_model')
             else:
                 self.model_id = os.path.basename(self.model).rstrip(os.sep)
                 # fix path error, see http://github.com/modelscope/evalscope/issues/377
@@ -92,7 +92,10 @@ class TaskConfig:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1

     def to_dict(self):
-
+        result = self.__dict__.copy()
+        if isinstance(self.model, CustomModel):
+            result['model'] = self.model.__class__.__name__
+        return result

     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
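The new `to_dict()` exists because a live `CustomModel` instance attached to `TaskConfig.model` is not JSON-serializable, so dumping the raw `__dict__` either fails or leaks an object repr; the object is swapped for its class name first. A minimal standalone sketch of the same pattern, using made-up classes rather than evalscope's actual `TaskConfig`/`CustomModel`:

```python
import json


class MyCustomModel:
    """Stand-in for a custom model object attached to a task config."""


config = {'model': MyCustomModel(), 'datasets': ['gsm8k'], 'limit': 10}

# json.dumps(config) would raise: Object of type MyCustomModel is not JSON serializable
serializable = dict(config)
if not isinstance(serializable['model'], str):
    serializable['model'] = serializable['model'].__class__.__name__

print(json.dumps(serializable, indent=4))  # ... "model": "MyCustomModel" ...
```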
evalscope/constants.py
CHANGED
evalscope/evaluator/evaluator.py
CHANGED
@@ -81,7 +81,7 @@ class Evaluator(object):
         for subset_name, prompts_list in prompts.items():
             limit = self.task_cfg.limit or len(prompts_list)
             for index, prompt in enumerate(prompts_list[:limit]):
-                prompt[
+                prompt[AnswerKeys.INDEX] = index
                 limited_prompts[subset_name].append(prompt)

         return limited_prompts
@@ -97,7 +97,8 @@ class Evaluator(object):
         answer_d[AnswerKeys.ANSWER_ID] = answer_id
         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
         return answer_d

     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
@@ -117,7 +118,7 @@ class Evaluator(object):
         return answers_list, prompts_list

     def get_answered_indices(answers_list: List[Dict]) -> List[int]:
-        indices = [answer
+        indices = [answer.get(AnswerKeys.INDEX) for answer in answers_list]

         if all(index is None for index in indices):
             return list(range(len(answers_list)))
@@ -238,7 +239,7 @@ class Evaluator(object):
             pred = pred_content

         choice[ReviewKeys.REVIEW] = {
-            ReviewKeys.GOLD: gold_content,
+            ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
             ReviewKeys.PRED: pred,
             ReviewKeys.RESULT: review_result
         }
evalscope/models/custom_adapter.py
CHANGED
@@ -66,4 +66,4 @@ class CustomModelAdapter(BaseModelAdapter):
         else:
             raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')

-        return self.custom_model.predict(prompts=in_prompts, **kwargs)
+        return self.custom_model.predict(prompts=in_prompts, origin_inputs=inputs, **kwargs)
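Because the adapter now forwards the untouched samples as `origin_inputs`, a user-defined custom model's `predict` should accept that keyword (or a catch-all `**kwargs`) to stay compatible with 0.13.1. A hedged sketch of a tolerant user-side signature; the class below is illustrative only, not the `CustomModel` base class shipped in evalscope/models/custom/custom_model.py:

```python
from typing import Any, Dict, List, Optional


class MyModel:
    """Illustrative custom model; only the predict() calling convention matters here."""

    def predict(self,
                prompts: List[Any],
                origin_inputs: Optional[List[Dict[str, Any]]] = None,
                **kwargs) -> List[Dict[str, Any]]:
        # origin_inputs carries the raw eval samples (new in this release);
        # **kwargs keeps the method tolerant of further adapter-side keywords.
        results = []
        for i, prompt in enumerate(prompts):
            raw = origin_inputs[i] if origin_inputs else None
            results.append({'prompt': prompt, 'raw_input': raw, 'response': 'stub output'})
        return results
```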
evalscope/perf/arguments.py
CHANGED
@@ -24,6 +24,7 @@ class Arguments:
     connect_timeout: int = 600  # Connection timeout in seconds
     read_timeout: int = 600  # Read timeout in seconds
     api_key: Optional[str] = None
+    no_test_connection: bool = False  # Test the connection before starting the benchmark

     # Performance and parallelism
     number: Optional[int] = None  # Number of requests to be made
@@ -40,8 +41,9 @@ class Arguments:
     outputs_dir: str = DEFAULT_WORK_DIR

     # Prompt settings
-    max_prompt_length: int =
+    max_prompt_length: int = 131072  # Maximum length of the prompt
     min_prompt_length: int = 0  # Minimum length of the prompt
+    prefix_length: int = 0  # Length of the prefix, only for random dataset
     prompt: Optional[str] = None  # The prompt text
     query_template: Optional[str] = None  # Template for the query

@@ -65,44 +67,12 @@ class Arguments:

     @staticmethod
     def from_args(args):
-
-
-
-
-
-
-            connect_timeout=args.connect_timeout,
-            read_timeout=args.read_timeout,
-            number=args.number,
-            parallel=args.parallel,
-            rate=args.rate,
-            log_every_n_query=args.log_every_n_query,
-            headers=args.headers,
-            wandb_api_key=args.wandb_api_key,
-            name=args.name,
-            outputs_dir=args.outputs_dir,
-            debug=args.debug,
-            tokenizer_path=args.tokenizer_path,
-            api=args.api,
-            max_prompt_length=args.max_prompt_length,
-            min_prompt_length=args.min_prompt_length,
-            prompt=args.prompt,
-            query_template=args.query_template,
-            dataset=args.dataset,
-            dataset_path=args.dataset_path,
-            frequency_penalty=args.frequency_penalty,
-            logprobs=args.logprobs,
-            max_tokens=args.max_tokens,
-            min_tokens=args.min_tokens,
-            n_choices=args.n_choices,
-            seed=args.seed,
-            stop=args.stop,
-            stop_token_ids=args.stop_token_ids,
-            stream=args.stream,
-            temperature=args.temperature,
-            top_p=args.top_p,
-            top_k=args.top_k,
-        )
+        # Convert Namespace to a dictionary and filter out None values
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
+        return Arguments(**args_dict)

     def __post_init__(self):
         self.headers = self.headers or {}  # Default to empty dictionary
@@ -153,6 +123,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
     parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')
+    parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501

     # Performance and parallelism
     parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
@@ -168,6 +139,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # Prompt settings
     parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
     parser.add_argument('--min-prompt-length', type=int, default=0, help='Minimum input prompt length')
+    parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')

@@ -193,7 +165,6 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
-
     # yapf: enable

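The rewritten `from_args` relies on a common argparse-to-dataclass pattern: every CLI option defaults to `None`, `None` values are filtered out, and the dataclass defaults then apply. A standalone sketch of that pattern; the field names are a small illustrative subset, not the full `Arguments` class:

```python
import argparse
from dataclasses import dataclass


@dataclass
class PerfArgs:
    model: str = ''
    parallel: int = 1
    max_prompt_length: int = 131072  # dataclass default wins when the flag is omitted


parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
parser.add_argument('--parallel', type=int, default=None)
parser.add_argument('--max-prompt-length', type=int, default=None)

ns = parser.parse_args(['--model', 'qwen2.5', '--parallel', '4'])
args_dict = {k: v for k, v in vars(ns).items() if v is not None}
args_dict.pop('func', None)  # subcommand callback, if a CLI entrypoint added one

print(PerfArgs(**args_dict))
# PerfArgs(model='qwen2.5', parallel=4, max_prompt_length=131072)
```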
evalscope/perf/benchmark.py
CHANGED
@@ -150,39 +150,45 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
         name = args.name if args.name else f'{args.model_id}_{current_time}'
         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

-
-
-
-
-
-
-
-
-
-
-
-
+    collected_benchmark_data = []
+
+    with tqdm(desc='Processing') as pbar:
+        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+            try:
+                # Attempt to get benchmark data from the queue with a timeout
+                benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
+                benchmark_data_queue.task_done()
+            except asyncio.TimeoutError:
+                # If timeout, continue to the next iteration
+                continue
+
+            # Update metrics based on the benchmark data
+            metrics.update_metrics(benchmark_data, api_plugin)

-
-
+            # Collect benchmark data for later database insertion
+            collected_benchmark_data.append(benchmark_data)

-
-
-            con.commit()
+            # Create a message with the updated metrics
+            message = metrics.create_message()

-
-
+            # Log the message to wandb if the api key is provided
+            if args.wandb_api_key:
+                wandb.log(message)

-
-
-
+            # Log the message to the logger every n queries
+            if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                msg = json.dumps(message, ensure_ascii=False, indent=2)
+                logger.info(msg)

-
-            if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-                msg = json.dumps(message, ensure_ascii=False, indent=2)
-                logger.info(msg)
+            pbar.update(1)  # Update the progress bar

-
+    # Now perform database operations after all benchmark data has been processed
+    with sqlite3.connect(result_db_path) as con:
+        cursor = con.cursor()
+        create_result_table(cursor)
+        for benchmark_data in collected_benchmark_data:
+            insert_benchmark_data(cursor, benchmark_data)
+        con.commit()

     return metrics, result_db_path

@@ -199,7 +205,7 @@ async def start_server(args: Arguments) -> bool:
     else:
         args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

-    if not await test_connection(args):
+    if (not args.no_test_connection) and (not await test_connection(args)):
        raise TimeoutError('Test connection failed')

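The reworked worker above is a standard asyncio consumer: poll the queue with a short timeout so the loop can keep re-checking the completion event, buffer what was consumed, and write to the database once after the loop instead of per item. A self-contained sketch of the same pattern; the producer/consumer names are illustrative, not evalscope's API:

```python
import asyncio


async def produce(queue: asyncio.Queue, done: asyncio.Event, n: int) -> None:
    for i in range(n):
        await queue.put(i)
        await asyncio.sleep(0.005)  # simulate requests finishing over time
    done.set()  # no more items will arrive


async def consume(queue: asyncio.Queue, done: asyncio.Event) -> list:
    collected = []
    # Keep draining until the producer is done AND the queue is empty.
    while not (done.is_set() and queue.empty()):
        try:
            item = await asyncio.wait_for(queue.get(), timeout=0.01)
            queue.task_done()
        except asyncio.TimeoutError:
            continue  # timed out: loop around and re-check the exit condition
        collected.append(item)  # buffered; a single flush would happen after the loop
    return collected


async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    done = asyncio.Event()
    _, collected = await asyncio.gather(produce(queue, done, 10), consume(queue, done))
    print(f'collected {len(collected)} items')  # collected 10 items


if __name__ == '__main__':
    asyncio.run(main())
```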
evalscope/perf/main.py
CHANGED
@@ -32,7 +32,7 @@ def run_perf_benchmark(args):
     if platform.system() == 'Windows':
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

-    loop = asyncio.
+    loop = asyncio.new_event_loop()
     if platform.system() != 'Windows':
         add_signal_handlers(loop)

evalscope/perf/plugin/datasets/__init__.py
CHANGED
@@ -3,4 +3,5 @@ from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
 from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
 from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
 from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
 from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/openqa.py
CHANGED
@@ -1,5 +1,5 @@
 import json
-import
+import os
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
@@ -18,16 +18,11 @@ class OpenqaDatasetPlugin(DatasetPluginBase):

     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
-
-
-
-
-
-                'open_qa.jsonl',
-                '--local_dir',
-                './data',
-            ])
-            self.query_parameters.dataset_path = './data/open_qa.jsonl'
+            from modelscope import dataset_snapshot_download
+
+            file_name = 'open_qa.jsonl'
+            local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+            self.query_parameters.dataset_path = os.path.join(local_path, file_name)

         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
evalscope/perf/plugin/datasets/random_dataset.py
ADDED
@@ -0,0 +1,51 @@
+import numpy as np
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('random')
+class RandomDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.'  # noqa: E501
+
+        from modelscope import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
+        self.prefix_length = self.query_parameters.prefix_length
+        self.prefix_ids = self.get_random_inputs(self.prefix_length)
+        self.template_len = self.get_template_len()
+        self.number = self.query_parameters.number or 1
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+        max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+
+        assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
+        assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
+
+        # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1  # noqa: E501
+        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+        for i in range(self.number):
+            prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
+            prompt = self.tokenizer.decode(
+                self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
+            yield [{'role': 'user', 'content': prompt}]
+
+    def get_random_inputs(self, length: int) -> List[int]:
+        if length <= 0:
+            return []
+        input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+        return input_ids
+
+    def get_template_len(self):
+        empty_message = [{'role': 'user', 'content': ''}]
+        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+        return len(template)
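The new plugin hits an exact token budget by taking a wrap-around walk through the vocabulary from a random offset, optionally behind a shared random prefix, then decoding those ids with the served model's tokenizer (the vLLM benchmark trick referenced in the file). A numpy-only sketch of just the id-generation step; the vocabulary size and lengths below are made-up numbers and the decode step is omitted:

```python
import numpy as np

vocab_size = 32000            # stand-in for tokenizer.vocab_size
prefix_length = 32            # shared prefix reused by every request (see --prefix-length)
number = 4                    # number of prompts to generate
min_len, max_len = 100, 200   # per-prompt body length after subtracting the chat-template overhead

rng = np.random.default_rng(42)
prefix_ids = rng.integers(0, vocab_size, size=prefix_length).tolist()

input_lens = rng.integers(min_len, max_len + 1, size=number)
offsets = rng.integers(0, vocab_size, size=number)

for i in range(number):
    # A deterministic run of consecutive ids starting at a random offset: prompts
    # differ from each other while each one is exactly input_lens[i] tokens long.
    body_ids = (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
    prompt_ids = prefix_ids + body_ids.tolist()
    print(len(prompt_ids))  # prefix_length + input_lens[i]
```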
evalscope/perf/utils/db_util.py
CHANGED
@@ -2,6 +2,7 @@ import base64
 import json
 import os
 import pickle
+import re
 import sqlite3
 import sys
 from datetime import datetime
@@ -91,6 +92,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
     output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    # Filter illegal characters
+    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
     if not os.path.exists(output_path):
         os.makedirs(output_path, exist_ok=True)
     logger.info(f'Save the result to: {output_path}')
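The added `re.sub` keeps the derived output directory creatable on Windows by replacing characters that are illegal in paths there (and the shell-hostile `|`), for example:

```python
import re

raw = 'outputs/20250321_153000/Qwen/QwQ-32B:latest'
print(re.sub(r'[<>:"|?*]', '_', raw))  # outputs/20250321_153000/Qwen/QwQ-32B_latest
```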
evalscope/run.py
CHANGED
@@ -39,9 +39,11 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

     if task_cfg.eval_backend != EvalBackend.NATIVE:
-
+        result = run_non_native_backend(task_cfg, outputs)
     else:
-
+        result = evaluate_model(task_cfg, outputs)
+
+    return result


 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -117,6 +119,16 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         res_dict = evaluator.eval()
         eval_results[evaluator.dataset_name] = res_dict

+    # Clean up
+    if base_model is not None:
+        import gc
+        import torch
+
+        del base_model
+        del evaluators
+        torch.cuda.empty_cache()
+        gc.collect()
+
     return eval_results

evalscope/version.py
CHANGED
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.0
+Version: 0.13.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -239,7 +239,8 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

-- 🔥 **[2025.03.
+- 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+- 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
 - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
 - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -277,23 +278,24 @@
 We recommend using conda to manage your environment and installing dependencies with pip:

 1. Create a conda environment (optional)
-
-
-
-
-
-
+```shell
+# It is recommended to use Python 3.10
+conda create -n evalscope python=3.10
+# Activate the conda environment
+conda activate evalscope
+```

 2. Install dependencies using pip
-
-
-
-
-
-
-
-
-
+```shell
+pip install evalscope # Install Native backend (default)
+# Additional options
+pip install 'evalscope[opencompass]' # Install OpenCompass backend
+pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
+pip install 'evalscope[rag]' # Install RAGEval backend
+pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
+pip install 'evalscope[app]' # Install dependencies for visualization
+pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```

 > [!WARNING]
 > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -307,21 +309,22 @@ We recommend using conda to manage your environment and installing dependencies

 ### Method 2: Install from Source
 1. Download the source code
-
-
-
+```shell
+git clone https://github.com/modelscope/evalscope.git
+```

 2. Install dependencies
-
-
-
-
-
-
-
-
-
-
+```shell
+cd evalscope/
+pip install -e . # Install Native backend
+# Additional options
+pip install -e '.[opencompass]' # Install OpenCompass backend
+pip install -e '.[vlmeval]' # Install VLMEvalKit backend
+pip install -e '.[rag]' # Install RAGEval backend
+pip install -e '.[perf]' # Install Perf dependencies
+pip install -e '.[app]' # Install visualization dependencies
+pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```


 ## 🚀 Quick Start
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
 evalscope/arguments.py,sha256=VhZd7a8PoZK01qFCMEADLINqLYi6njRqRb50iR1l1lo,5241
-evalscope/config.py,sha256=
-evalscope/constants.py,sha256=
-evalscope/run.py,sha256=
+evalscope/config.py,sha256=wLrc8a7z28IFPRaeUzot5HGtSDY_13KR-3kRyFKEGx8,9476
+evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
+evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
 evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
-evalscope/version.py,sha256=
+evalscope/version.py,sha256=Y30-zF2dwch3upMc0t5yNNjIgvI-LQQWFhftRQgXvOk,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -180,11 +180,11 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
 evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
 evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
 evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
-evalscope/collections/evaluator.py,sha256=
+evalscope/collections/evaluator.py,sha256=YJy8Dj35XCdCwhNDwZecJkeW1_ZgIOsuRLFzfe3SyV8,12724
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=
+evalscope/evaluator/evaluator.py,sha256=szRQrXH5ILpUljb14lcunuOt185H8Um1paviTokraA4,19845
 evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
@@ -203,7 +203,7 @@ evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,
 evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
 evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
 evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
-evalscope/models/custom_adapter.py,sha256=
+evalscope/models/custom_adapter.py,sha256=AGztmZ0aT0g2flh4B4NaiZ8LCDg8tT0gVNxmrP5W1mA,2401
 evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
 evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
 evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
@@ -212,10 +212,10 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
 evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
 evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/perf/arguments.py,sha256=
-evalscope/perf/benchmark.py,sha256=
+evalscope/perf/arguments.py,sha256=hBR6TXCoLkHRLxrwXacmierfFZhyQaT5hnKAfp-vE6I,8990
+evalscope/perf/benchmark.py,sha256=VYcFhSoZXcLoNXpFYxOFxLbBLv_8Tn74Qklim7vELCM,9889
 evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
-evalscope/perf/main.py,sha256=
+evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -223,18 +223,19 @@ evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqY
 evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
 evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
 evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
-evalscope/perf/plugin/datasets/__init__.py,sha256=
+evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
 evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
 evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
 evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
 evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
 evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
-evalscope/perf/plugin/datasets/openqa.py,sha256=
+evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
+evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
 evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
 evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
-evalscope/perf/utils/db_util.py,sha256=
+evalscope/perf/utils/db_util.py,sha256=hRXixxpNBrACF43reOJV5SoO1vj34cqoNMaTKH_oLLE,9100
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
 evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -312,9 +313,9 @@ tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/cli/test_all.py,sha256=1wwXtdjBmWYLhs5TXOJhZBwPm2qd9FYFqQSemXWKNUs,3865
 tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
-tests/cli/test_run.py,sha256=
+tests/cli/test_run.py,sha256=Gk8uCT0IjDSf2sf-TXeQFV83ovNzRs4GcAkQ1DhRJEU,15929
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=
+tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
 tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
@@ -325,9 +326,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
-evalscope-0.13.0.dist-info/LICENSE,sha256=
-evalscope-0.13.0.dist-info/METADATA,sha256=
-evalscope-0.13.0.dist-info/WHEEL,sha256=
-evalscope-0.13.0.dist-info/entry_points.txt,sha256=
-evalscope-0.13.0.dist-info/top_level.txt,sha256=
-evalscope-0.13.0.dist-info/RECORD,,
+evalscope-0.13.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.13.1.dist-info/METADATA,sha256=luYebd_U93wnTkXcv_MYPfd9-JRz51DjWB6Bh6phspU,33546
+evalscope-0.13.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.13.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.13.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.13.1.dist-info/RECORD,,
tests/cli/test_run.py
CHANGED
@@ -203,7 +203,7 @@ class TestRun(unittest.TestCase):
         print(res)

     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def
+    def test_run_one_task(self):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
@@ -223,6 +223,33 @@ class TestRun(unittest.TestCase):

         run_task(task_cfg=task_cfg)

+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_task_loop(self):
+        os.environ['CUDA_VISIBLE_DEVICES'] = '2'
+        from evalscope.config import TaskConfig
+
+        task_cfg1 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model1',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg2 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model2',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg3 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model3',
+            datasets=['iquiz'],
+            limit=10
+        )
+
+        run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
+
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_server_model(self):
         from evalscope.config import TaskConfig
@@ -365,20 +392,20 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='
+            model='qwq-32b',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
                 # 'math_500',
-                'aime24',
+                # 'aime24',
                 # 'competition_math',
                 # 'arc',
                 # 'gsm8k'
                 # 'truthful_qa',
                 # 'simple_qa',
                 # # 'chinese_simpleqa',
-
+                'live_code_bench',
                 # 'humaneval'
                 # 'general_qa'
             ],
@@ -387,10 +414,9 @@ class TestRun(unittest.TestCase):
                 'subset_list': ['Level 4']
             },
             'live_code_bench': {
-                'subset_list': ['v4_v5'],
                 'extra_params': {
-                    'start_date': '2024-
-                    'end_date': '2025-
+                    'start_date': '2024-08-01',
+                    'end_date': '2025-02-28'
                 },
                 'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
             },
@@ -402,19 +428,23 @@ class TestRun(unittest.TestCase):
                     ]
                 },
             },
-            eval_batch_size=
-            limit=5,
+            eval_batch_size=10,
+            # limit=5,
             judge_strategy=JudgeStrategy.AUTO,
+            judge_worker_num=8,
            judge_model_args={
                 'model_id': 'qwen2.5-7b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
             },
             generation_config={
-                'max_new_tokens':
+                'max_new_tokens': 20000,
                 'temperature': 0.0,
                 'seed': 42,
-            }
+            },
+            timeout=60000,
+            stream=True,
+            # use_cache='outputs/20250320_143658'
         )

         run_task(task_cfg=task_cfg)
tests/perf/test_perf.py
CHANGED
@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+from dotenv import dotenv_values

+env = dotenv_values('.env')
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest

@@ -96,6 +98,27 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)

+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_local_random(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=20,
+            model='Qwen2.5-0.5B-Instruct',
+            url='http://127.0.0.1:8801/v1/chat/completions',
+            api='openai',
+            dataset='random',
+            min_tokens=1024,
+            max_tokens=1024,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            number=40,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            debug=True,
+        )
+        run_perf_benchmark(task_cfg)
+

 if __name__ == '__main__':
     unittest.main(buffer=False)
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/LICENSE
File without changes
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/WHEEL
File without changes
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt
File without changes
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/top_level.txt
File without changes