evalscope 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +4 -5
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +26 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +1 -1
- evalscope/config.py +6 -3
- evalscope/constants.py +1 -0
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +19 -46
- evalscope/perf/benchmark.py +64 -90
- evalscope/perf/main.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/db_util.py +5 -2
- evalscope/run.py +14 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/METADATA +42 -78
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/RECORD +45 -37
- tests/cli/test_all.py +33 -24
- tests/cli/test_run.py +69 -22
- tests/perf/test_perf.py +23 -0
- tests/rag/test_ragas.py +4 -1
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
evalscope/perf/benchmark.py
CHANGED
@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import List
+from typing import AsyncGenerator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -21,92 +21,68 @@ from evalscope.perf.utils.local_server import start_app
 from evalscope.utils.logger import get_logger

 logger = get_logger()
-
+
 data_process_completed_event = asyncio.Event()


 @exception_handler
-async def
+async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
     query_generator_class = ApiRegistry(args.api)
     query_generator = query_generator_class(args.tokenizer_path)

     def load_prompt(prompt_path_or_text):
-        """Load the prompt from a file or directly from the input text."""
         if prompt_path_or_text.startswith('@'):
             with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
                 return file.read()
         return prompt_path_or_text

-    async def
-        """Dispatch a single request with optional rate limiting."""
-        await request_queue.put(request)
-        if args.rate != -1:
-            interval = np.random.exponential(1.0 / args.rate)
-            await asyncio.sleep(interval)
-
-    async def dispatch_requests_from_prompt(messages):
-        """Generate and dispatch requests based on the given prompt."""
+    async def generate_requests_from_prompt(messages):
         request = query_generator.build_request(messages, args)
-        if args.number is None:
-            await dispatch_request(request)
-            return 1
         for _ in range(args.number):
-
-        return args.number
+            yield request

-    async def
-        """Generate and dispatch requests based on the dataset."""
-        total_query_count = 0
+    async def generate_requests_from_dataset():
         message_generator_class = DatasetRegistry(args.dataset)
         message_generator = message_generator_class(args)

+        count = 0
         for messages in message_generator:
             request = query_generator.build_request(messages, args)
-            if request is None:
-
-
-
-
-                break
+            if request is not None:
+                yield request
+                count += 1
+            if args.number and count >= args.number:
+                break

-        return total_query_count
-
-    # Load prompt or dataset and dispatch requests accordingly
     if args.prompt:
         prompt = load_prompt(args.prompt)
         messages = [{'role': 'user', 'content': prompt}]
-
+        generator = generate_requests_from_prompt(messages)
     elif args.dataset:
-
+        generator = generate_requests_from_dataset()
     else:
         raise Exception('Either prompt or dataset is required!')

-
+    async for request in generator:
+        yield request
+        if args.rate != -1:
+            interval = np.random.exponential(1.0 / args.rate)
+            await asyncio.sleep(interval)


 @exception_handler
-async def
-
-
+async def send_request(
+    semaphore: asyncio.Semaphore,
+    request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
 ):
-
-
-
-    try:
-        # Attempt to get a request from the queue with a timeout
-        request = await asyncio.wait_for(request_queue.get(), timeout=0.0001)
-        request_queue.task_done()
-    except asyncio.TimeoutError:
-        # If timeout, continue to the next iteration
-        continue
-
-    # Initialize benchmark data for the current request
+    async with semaphore:
+        client = AioHttpClient(args)
+        async with client:
             benchmark_data = BenchmarkData(request=request)
             collected_messages = []
             try:
-                # Send the request and process the response
                 async for is_error, state_code, response_data in client.post(request):
                     if is_error or state_code != HTTPStatus.OK:
                         logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
@@ -124,7 +100,6 @@ async def send_requests_worker(
                 logger.exception(e)
                 logger.error(f'Request query: {request} exception')
             finally:
-                # Record completion time and collected messages
                 benchmark_data.completed_time = time.perf_counter()
                 benchmark_data.response_messages = collected_messages
                 await benchmark_data_queue.put(benchmark_data)
@@ -150,39 +125,45 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
         name = args.name if args.name else f'{args.model_id}_{current_time}'
         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

-
-
-
-
-
-
-
-
-
-
-
-            continue
+    collected_benchmark_data = []
+
+    with tqdm(desc='Processing', total=args.number) as pbar:
+        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+            try:
+                # Attempt to get benchmark data from the queue with a timeout
+                benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
+                benchmark_data_queue.task_done()
+            except asyncio.TimeoutError:
+                # If timeout, continue to the next iteration
+                continue

-
-
+            # Update metrics based on the benchmark data
+            metrics.update_metrics(benchmark_data, api_plugin)

-
-
-            con.commit()
+            # Collect benchmark data for later database insertion
+            collected_benchmark_data.append(benchmark_data)

-
-
+            # Create a message with the updated metrics
+            message = metrics.create_message()

-
-
-
+            # Log the message to wandb if the api key is provided
+            if args.wandb_api_key:
+                wandb.log(message)

-
-
-
-
+            # Log the message to the logger every n queries
+            if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                msg = json.dumps(message, ensure_ascii=False, indent=2)
+                logger.info(msg)

-
+            pbar.update(1)  # Update the progress bar
+
+    # Now perform database operations after all benchmark data has been processed
+    with sqlite3.connect(result_db_path) as con:
+        cursor = con.cursor()
+        create_result_table(cursor)
+        for benchmark_data in collected_benchmark_data:
+            insert_benchmark_data(cursor, benchmark_data)
+        con.commit()

     return metrics, result_db_path

@@ -199,7 +180,7 @@ async def start_server(args: Arguments) -> bool:
     else:
         args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

-    if not await test_connection(args):
+    if (not args.no_test_connection) and (not await test_connection(args)):
         raise TimeoutError('Test connection failed')


@@ -210,39 +191,32 @@ async def benchmark(args: Arguments) -> None:
     add_signal_handlers(loop)

     # init queue
-    request_queue = asyncio.Queue()
     benchmark_data_queue = asyncio.Queue()

     # reset event
-    query_send_completed_event.clear()
     data_process_completed_event.clear()

+    semaphore = asyncio.Semaphore(args.parallel)
+
     async def create_send_request_tasks():
         tasks: List[asyncio.Task] = []
-        for
-            task = asyncio.create_task(
+        async for request in get_requests(args):
+            task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
             tasks.append(task)
         return tasks

     async def run_tasks():
         await start_server(args)

-        dispatch_task = asyncio.create_task(dispatch_requests_worker(request_queue, args))
         statistic_benchmark_metric_task = asyncio.create_task(
             statistic_benchmark_metric_worker(benchmark_data_queue, args))
         send_request_tasks = await create_send_request_tasks()

-        expected_number_of_queries = await dispatch_task
-        await request_queue.join()
-        query_send_completed_event.set()
-
         await asyncio.gather(*send_request_tasks, return_exceptions=True)
         await benchmark_data_queue.join()
         data_process_completed_event.set()

         metrics, result_db_path = await statistic_benchmark_metric_task
-        summary_result(args, metrics,
-
-        await asyncio.sleep(0.250)
+        summary_result(args, metrics, result_db_path)

     await run_tasks()
evalscope/perf/main.py
CHANGED
@@ -32,7 +32,7 @@ def run_perf_benchmark(args):
     if platform.system() == 'Windows':
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

-    loop = asyncio.
+    loop = asyncio.new_event_loop()
     if platform.system() != 'Windows':
         add_signal_handlers(loop)

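Editorial note: run_perf_benchmark now creates its event loop explicitly with asyncio.new_event_loop() instead of relying on whichever loop happens to be current. A minimal sketch of that pattern, with a placeholder coroutine standing in for the benchmark:

import asyncio


async def benchmark_stub() -> str:
    # Placeholder coroutine; the real code runs the perf benchmark here.
    await asyncio.sleep(0)
    return 'done'


loop = asyncio.new_event_loop()
try:
    asyncio.set_event_loop(loop)
    print(loop.run_until_complete(benchmark_stub()))  # -> done
finally:
    loop.close()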
evalscope/perf/plugin/api/openai_api.py
CHANGED
@@ -70,7 +70,7 @@ class OpenaiPlugin(ApiPluginBase):
     def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
         payload['model'] = param.model
         if param.max_tokens is not None:
-            payload['
+            payload['max_completion_tokens'] = param.max_tokens
         if param.min_tokens is not None:
             payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
@@ -94,9 +94,11 @@ class OpenaiPlugin(ApiPluginBase):
             payload['top_p'] = param.top_p
         if param.top_k is not None:
             payload['top_k'] = param.top_k
+        if param.extra_args is not None:
+            payload.update(param.extra_args)
         return payload

-    def parse_responses(self, responses, request: Any = None, **kwargs) ->
+    def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
         """Parser responses and return number of request and response tokens.
         Only one response for non-stream, multiple responses for stream.
         """
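Editorial note: the new extra_args pass-through merges user-supplied key/value pairs into the request payload last, so they can add fields or override ones already set. A quick illustration of that merge; the payload values and the extra keys here are made up for the example, not evalscope defaults.

payload = {'model': 'qwen2.5', 'max_completion_tokens': 256, 'top_p': 0.9}
extra_args = {'repetition_penalty': 1.05, 'top_p': 0.95}

if extra_args is not None:
    payload.update(extra_args)  # same dict merge the plugin now performs

print(payload)
# {'model': 'qwen2.5', 'max_completion_tokens': 256, 'top_p': 0.95, 'repetition_penalty': 1.05}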
evalscope/perf/plugin/datasets/__init__.py
CHANGED
@@ -3,4 +3,5 @@ from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
 from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
 from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
 from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
 from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/openqa.py
CHANGED
@@ -1,5 +1,5 @@
 import json
-import
+import os
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
@@ -18,16 +18,11 @@ class OpenqaDatasetPlugin(DatasetPluginBase):

     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
-
-
-
-
-
-                'open_qa.jsonl',
-                '--local_dir',
-                './data',
-            ])
-            self.query_parameters.dataset_path = './data/open_qa.jsonl'
+            from modelscope import dataset_snapshot_download
+
+            file_name = 'open_qa.jsonl'
+            local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+            self.query_parameters.dataset_path = os.path.join(local_path, file_name)

         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
evalscope/perf/plugin/datasets/random_dataset.py
ADDED
@@ -0,0 +1,51 @@
+import numpy as np
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('random')
+class RandomDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.'  # noqa: E501
+
+        from modelscope import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
+        self.prefix_length = self.query_parameters.prefix_length
+        self.prefix_ids = self.get_random_inputs(self.prefix_length)
+        self.template_len = self.get_template_len()
+        self.number = self.query_parameters.number or 1
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+        max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+
+        assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
+        assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
+
+        # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1  # noqa: E501
+        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+        for i in range(self.number):
+            prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
+            prompt = self.tokenizer.decode(
+                self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
+            yield [{'role': 'user', 'content': prompt}]
+
+    def get_random_inputs(self, length: int) -> List[int]:
+        if length <= 0:
+            return []
+        input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+        return input_ids
+
+    def get_template_len(self):
+        empty_message = [{'role': 'user', 'content': ''}]
+        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+        return len(template)
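Editorial note: the new random dataset plugin synthesizes prompts of controlled token length by drawing a target length and a vocabulary offset per sample and building a wrapped run of token ids, following the vLLM benchmark_serving approach cited in the code. The sketch below reproduces only that index arithmetic; the vocabulary size, length bounds, and sample count are stand-ins, and the real plugin decodes the ids with the tokenizer instead of printing them.

import numpy as np

VOCAB_SIZE = 32_000          # stand-in for tokenizer.vocab_size
MIN_LEN, MAX_LEN = 16, 64    # stand-in for min/max prompt length after the chat template
NUMBER = 3                   # how many prompts to generate

rng = np.random.default_rng(0)
input_lens = rng.integers(MIN_LEN, MAX_LEN + 1, size=NUMBER)
offsets = rng.integers(0, VOCAB_SIZE, size=NUMBER)

for i in range(NUMBER):
    # Same construction as the plugin: a shifted arange wrapped at the vocab size.
    prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % VOCAB_SIZE
    # The plugin passes these ids to tokenizer.decode(); here we just show that
    # the prompt length is controlled per sample.
    print(f'sample {i}: {len(prompt_ids)} token ids, first five {prompt_ids[:5].tolist()}')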
evalscope/perf/plugin/datasets/speed_benchmark.py
CHANGED
@@ -3,6 +3,9 @@ from typing import Dict, Iterator, List, Tuple
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()


 @register_dataset('speed_benchmark')
@@ -18,6 +21,14 @@ class SpeedBenchmarkDatasetPlugin(DatasetPluginBase):
     def __init__(self, query_parameters: Arguments):
         super().__init__(query_parameters)

+        url = self.query_parameters.url
+        if url.endswith('v1/chat/completions'):
+            logger.warning(
+                'The API URL is not set correctly for `speed_benchmark`. Using `v1/completions` instead of `v1/chat/completions` by system.'  # noqa
+            )
+            url = url.replace('v1/chat/completions', 'v1/completions')
+            self.query_parameters.url = url
+
     def build_messages(self) -> Iterator[List[Dict]]:
         for input_len in self.INPUT_LENGTH:
             for _ in range(self.REPEAT):
evalscope/perf/utils/db_util.py
CHANGED
@@ -2,6 +2,7 @@ import base64
 import json
 import os
 import pickle
+import re
 import sqlite3
 import sys
 from datetime import datetime
@@ -91,6 +92,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
     output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    # Filter illegal characters
+    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
     if not os.path.exists(output_path):
         os.makedirs(output_path, exist_ok=True)
     logger.info(f'Save the result to: {output_path}')
@@ -191,12 +194,12 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     return results


-def summary_result(args: Arguments, metrics: BenchmarkMetrics,
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

     data = metrics.create_message()
-    data.update({'Expected number of requests':
+    data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
     write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))

     # Print summary in a table
evalscope/run.py
CHANGED
@@ -39,9 +39,11 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

     if task_cfg.eval_backend != EvalBackend.NATIVE:
-
+        result = run_non_native_backend(task_cfg, outputs)
     else:
-
+        result = evaluate_model(task_cfg, outputs)
+
+    return result


 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -117,6 +119,16 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         res_dict = evaluator.eval()
         eval_results[evaluator.dataset_name] = res_dict

+    # Clean up
+    if base_model is not None:
+        import gc
+        import torch
+
+        del base_model
+        del evaluators
+        torch.cuda.empty_cache()
+        gc.collect()
+
     return eval_results


evalscope/version.py
CHANGED