evalscope 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +10 -0
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +4 -2
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +8 -4
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
- evalscope/benchmarks/tool_bench/utils.py +202 -0
- evalscope/benchmarks/utils.py +3 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/collections/evaluator.py +76 -26
- evalscope/config.py +46 -15
- evalscope/evaluator/evaluator.py +48 -14
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +3 -3
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +15 -19
- evalscope/perf/arguments.py +14 -5
- evalscope/perf/benchmark.py +4 -9
- evalscope/perf/main.py +69 -17
- evalscope/perf/utils/benchmark_util.py +33 -15
- evalscope/perf/utils/db_util.py +32 -20
- evalscope/perf/utils/log_utils.py +1 -1
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/app.py +47 -34
- evalscope/report/utils.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/METADATA +49 -25
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/RECORD +48 -38
- tests/aigc/test_t2i.py +4 -4
- tests/cli/test_all.py +3 -0
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +37 -14
- tests/perf/test_perf.py +27 -2
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/models/adapters/chat_adapter.py
CHANGED

@@ -1,7 +1,7 @@
import os
import time
import torch
- from typing import Any, Dict, List, Tuple, Union
+ from typing import Any, Dict, List, Optional, Tuple, Union

from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
from evalscope.utils.logger import get_logger

@@ -58,19 +58,15 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
return generation_config

def _model_generate(self,
-
- system_prompts: List[str] = None,
+ formatted_prompts: List[str],
infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
"""
Args:
-
- system_prompts: The system prompts.
+ formatted_prompts: The formatted prompts.
infer_cfg: The inference configuration.
Returns:
The prediction results.
"""
- if system_prompts is None:
- system_prompts = []
if infer_cfg is None:
infer_cfg = {}

@@ -92,27 +88,6 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
self.generation_config.update(**infer_cfg)
fix_do_sample_warning(self.generation_config)

- # For chat model, use the chat template to format the input
- if self.tokenizer.chat_template is not None:
- formatted_prompts = []
- for i, query in enumerate(queries):
- messages = [ChatMessage(role='user', content=query)]
- if i < len(system_prompts) and system_prompts[i]:
- messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
- # whether thinking is needed
- enable_thinking = infer_cfg.get('enable_thinking', None)
- if enable_thinking is not None:
- prompts = self.tokenizer.apply_chat_template(
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking)
- else:
- prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- formatted_prompts.append(prompts)
- else:
- # For base model, use the queries as the input
- formatted_prompts = queries
-
- logger.debug(f'formatted_prompts: {formatted_prompts}')
-
# Get input ids
inputs = self.tokenizer(
formatted_prompts, return_tensors='pt', padding=True, truncation=True,

@@ -136,26 +111,68 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

return responses, input_lengths

-
- def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
+ def _prepare_inputs(self, inputs: List[dict], infer_cfg: dict = {}) -> List[str]:
"""
+ Prepare the inputs for the model.
Args:
inputs: The input data.
infer_cfg: The inference configuration.
Returns:
- The
+ The prepared inputs and system prompts.
"""
-
- # Process inputs
queries = []
system_prompts = []
+ message_list = []

for input_item in inputs:
queries.append(input_item['data'][0])
system_prompts.append(input_item.get('system_prompt', None))
+ if input_item.get('messages', None):
+ message_list.append(input_item.get('messages', None))
+
+ # For non chat model, use the original queries as the input
+ if self.tokenizer.chat_template is None:
+ return queries
+
+ # For chat model, use the messages as the input
+ # if message_list is None, use the queries as the input
+ if len(message_list) == 0:
+ for i, query in enumerate(queries):
+ messages = [ChatMessage(role='user', content=query)]
+ if i < len(system_prompts) and system_prompts[i]:
+ messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
+ message_list.append(messages)
+
+ # Format the messages
+ formatted_prompts = []
+ for messages in message_list:
+ # apply chat template
+ chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+ if chat_template_kwargs is not None:
+ prompts = self.tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
+ else:
+ prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ formatted_prompts.append(prompts)
+
+ logger.debug(f'formatted_prompts: {formatted_prompts}')
+ return formatted_prompts
+
+ @torch.no_grad()
+ def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = {}) -> List[dict]:
+ """
+ Args:
+ inputs: The input data.
+ infer_cfg: The inference configuration.
+ Returns:
+ The prediction results.
+ """
+
+ # Process inputs
+ formatted_prompts = self._prepare_inputs(inputs, infer_cfg)

# Run inference
- responses, input_lengths = self._model_generate(
+ responses, input_lengths = self._model_generate(formatted_prompts, infer_cfg)

# Process outputs
results = []
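The refactor above moves prompt formatting into a new _prepare_inputs helper: each input item may carry a ready-made 'messages' list, and any 'chat_template_kwargs' found in infer_cfg are forwarded to tokenizer.apply_chat_template. A minimal sketch of the new input shape, assuming an already-constructed ChatGenerationModelAdapter instance named adapter; the field values and generation settings are illustrative, not taken from this diff:

    # Hypothetical payload sketch: 'messages' (optional) takes precedence over
    # 'data'/'system_prompt'; chat-template options travel in infer_cfg['chat_template_kwargs'].
    inputs = [{
        'data': ['What is the capital of France?'],
        'system_prompt': 'You are a helpful assistant.',
        'messages': [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': 'What is the capital of France?'},
        ],
    }]
    infer_cfg = {
        'max_new_tokens': 128,  # illustrative generation setting
        'chat_template_kwargs': {'enable_thinking': False},  # e.g. toggle thinking mode
    }
    # results = adapter.predict(inputs, infer_cfg=infer_cfg)  # adapter: assumed ChatGenerationModelAdapter instance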
evalscope/models/adapters/server_adapter.py
CHANGED

@@ -43,7 +43,7 @@ class ServerModelAdapter(BaseModelAdapter):
sig = signature(self.client.chat.completions.create)
return list(sig.parameters.keys())

- def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
+ def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
"""
Model prediction func.

@@ -65,23 +65,26 @@ class ServerModelAdapter(BaseModelAdapter):

def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
"""Process a single input item."""
-
-
- query = '\n'.join(''.join(item) for item in data)
- system_prompt = input_item.get('system_prompt', None)
+ if input_item.get('messages', None):
+ content = input_item['messages']
else:
-
- system_prompt = input_item.get('system_prompt', None)
-
- content = self.make_request_content(query, system_prompt)
+ content = self.make_request_content(input_item)
request_json = self.make_request(content, infer_cfg)
response = self.send_request(request_json)
return response

- def make_request_content(self,
+ def make_request_content(self, input_item: dict) -> list:
"""
Make request content for OpenAI API.
"""
+ data: list = input_item['data']
+ if isinstance(data[0], tuple): # for truthful_qa and hellaswag
+ query = '\n'.join(''.join(item) for item in data)
+ system_prompt = input_item.get('system_prompt', None)
+ else:
+ query = data[0]
+ system_prompt = input_item.get('system_prompt', None)
+
messages = []
if system_prompt:
messages.append({'role': 'system', 'content': system_prompt})

@@ -90,16 +93,9 @@ class ServerModelAdapter(BaseModelAdapter):

return messages

- def make_request(self, content: list, infer_cfg: dict
+ def make_request(self, content: list, infer_cfg: dict) -> dict:
"""Make request to remote API."""
# Format request JSON according to OpenAI API format
- from evalscope.config import DEFAULT_GENERATION_CONFIG
- if infer_cfg == DEFAULT_GENERATION_CONFIG:
- infer_cfg = {
- 'max_tokens': 2048,
- 'temperature': 0.0,
- }
-
request_json = {'model': self.model_id, 'messages': content, **infer_cfg}

if self.timeout:

@@ -137,7 +133,7 @@ class ServerModelAdapter(BaseModelAdapter):
return response.model_dump(exclude_unset=True)
except Exception as e:
logger.error(f'Error when calling remote API: {str(e)}')
- raise
+ raise e

def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
collected_chunks = []
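With this change, process_single_input forwards a ready-made 'messages' list as-is and otherwise builds the request content from 'data' and 'system_prompt' via make_request_content(input_item). A rough sketch of the two input shapes implied by the diff (field values are illustrative):

    # 1) Pre-built OpenAI-style messages are forwarded unchanged:
    item_with_messages = {
        'messages': [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': 'Summarize the plot of Hamlet.'},
        ],
    }

    # 2) Otherwise make_request_content() derives the messages from 'data' and
    #    'system_prompt'; a list of tuples in 'data' (e.g. truthful_qa / hellaswag)
    #    is joined into a single query string.
    item_with_data = {
        'data': ['Summarize the plot of Hamlet.'],
        'system_prompt': 'You are a helpful assistant.',
    }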
evalscope/perf/arguments.py
CHANGED
@@ -3,7 +3,7 @@ import json
import os
import sys
from dataclasses import dataclass, field
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Union

from evalscope.constants import DEFAULT_WORK_DIR

@@ -27,8 +27,8 @@ class Arguments:
no_test_connection: bool = False # Test the connection before starting the benchmark

# Performance and parallelism
- number: int = 1000 # Number of requests to be made
- parallel: int = 1 # Number of parallel requests
+ number: Union[int, List[int]] = 1000 # Number of requests to be made
+ parallel: Union[int, List[int]] = 1 # Number of parallel requests
rate: int = -1 # Rate limit for requests (default: -1, no limit)

# Logging and debugging

@@ -98,6 +98,15 @@ class Arguments:
if self.apply_chat_template is None:
self.apply_chat_template = self.url.strip('/').endswith('chat/completions')

+ # Set number and parallel to lists if they are integers
+ if isinstance(self.number, int):
+ self.number = [self.number]
+ if isinstance(self.parallel, int):
+ self.parallel = [self.parallel]
+ assert len(self.number) == len(
+ self.parallel
+ ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}' # noqa: E501
+
def __str__(self):
return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

@@ -143,8 +152,8 @@ def add_argument(parser: argparse.ArgumentParser):
parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark') # noqa: E501

# Performance and parallelism
- parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
- parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
+ parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
+ parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1') # noqa: E501
parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')

# Logging and debugging
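Because --number and --parallel now use nargs='+', a sweep is expressed by passing equal-length lists to the two flags, and the new assert in Arguments.__post_init__ enforces that pairing. A minimal sketch of how these options parse (the parser options are copied from the diff above; the sample command line is illustrative):

    import argparse

    # Sketch of the new nargs='+' behaviour for the perf flags.
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--number', type=int, default=1000, nargs='+')
    parser.add_argument('--parallel', type=int, default=1, nargs='+')

    ns = parser.parse_args(['--parallel', '1', '5', '10', '--number', '10', '50', '100'])
    print(ns.parallel, ns.number)  # [1, 5, 10] [10, 50, 100]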
evalscope/perf/benchmark.py
CHANGED
@@ -9,7 +9,7 @@ import threading
import time
from http import HTTPStatus
from tqdm import tqdm
- from typing import AsyncGenerator, List
+ from typing import AsyncGenerator, Dict, List, Tuple

from evalscope.perf.arguments import Arguments
from evalscope.perf.http_client import AioHttpClient, test_connection

@@ -18,7 +18,6 @@ from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
from evalscope.perf.utils.local_server import start_app
- from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
from evalscope.utils.logger import get_logger

logger = get_logger()

@@ -116,11 +115,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:

result_db_path = get_result_db_path(args)

- if args.wandb_api_key:
- init_wandb(args)
- if args.swanlab_api_key:
- init_swanlab(args)
-
collected_benchmark_data = []

with tqdm(desc='Processing', total=args.number) as pbar:

@@ -180,7 +174,7 @@ async def connect_test(args: Arguments) -> bool:


@exception_handler
- async def benchmark(args: Arguments) ->
+ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
if platform.system() != 'Windows':
loop = asyncio.get_running_loop()
add_signal_handlers(loop)

@@ -205,4 +199,5 @@ async def benchmark(args: Arguments) -> None:
data_process_completed_event.set()

metrics, result_db_path = await statistic_benchmark_metric_task
- summary_result(args, metrics, result_db_path)
+ metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+ return metrics_result, percentile_result
evalscope/perf/main.py
CHANGED
@@ -1,32 +1,32 @@
import asyncio
+ import copy
import os
import platform
+ import time
from argparse import Namespace

- from evalscope.perf.
- from evalscope.perf.benchmark import benchmark
- from evalscope.perf.utils.db_util import get_output_path
- from evalscope.perf.utils.handler import add_signal_handlers
+ from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
from evalscope.utils.logger import configure_logging, get_logger
from evalscope.utils.utils import seed_everything
+ from .arguments import Arguments, parse_args
+ from .benchmark import benchmark
+ from .utils.db_util import get_output_path
+ from .utils.handler import add_signal_handlers
+ from .utils.rich_display import print_summary

logger = get_logger()


- def
- if isinstance(args,
- args =
-
- args =
-
- if args.seed is not None:
- seed_everything(args.seed)
+ def run_one_benchmark(args: Arguments, output_path: str = None):
+ if isinstance(args.parallel, list):
+ args.parallel = args.parallel[0]
+ if isinstance(args.number, list):
+ args.number = args.number[0]

# Setup logger and output
- args.outputs_dir =
- configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
+ args.outputs_dir = output_path

- logger.info('Starting benchmark
+ logger.info('Starting benchmark with args: ')
logger.info(args)

if platform.system() == 'Windows':

@@ -36,9 +36,61 @@ def run_perf_benchmark(args):
if platform.system() != 'Windows':
add_signal_handlers(loop)

- loop.run_until_complete(benchmark(args))
+ return loop.run_until_complete(benchmark(args))
+
+
+ def run_multi_benchmark(args: Arguments, output_path: str = None):
+ results = []
+ number_list = copy.deepcopy(args.number)
+ parallel_list = copy.deepcopy(args.parallel)
+ for i, (number, parallel) in enumerate(zip(number_list, parallel_list)):
+ args.number = number
+ args.parallel = parallel
+ # Set up output path for each run
+ cur_output_path = os.path.join(output_path, f'parallel_{parallel}_number_{number}')
+ os.makedirs(cur_output_path, exist_ok=True)
+ # Start the benchmark
+ metrics_result = run_one_benchmark(args, output_path=cur_output_path)
+ # Save the results
+ results.append(metrics_result)
+ # Sleep between runs to avoid overwhelming the server
+ if i < len(number_list) - 1:
+ logger.info('Sleeping for 5 seconds before the next run...')
+ time.sleep(5)
+ # Analyze results
+ print_summary(results, args.model_id)
+ return results
+
+
+ def run_perf_benchmark(args):
+ # Check if args is a dictionary or Namespace
+ if isinstance(args, dict):
+ args = Arguments(**args)
+ elif isinstance(args, Namespace):
+ args = Arguments.from_args(args)
+
+ if args.seed is not None:
+ seed_everything(args.seed)
+
+ # Initialize output directory
+ output_path = get_output_path(args)
+ configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
+
+ # Initialize wandb and swanlab
+ if args.wandb_api_key:
+ init_wandb(args)
+ if args.swanlab_api_key:
+ init_swanlab(args)
+
+ # Start benchmark
+ if len(args.number) == 1:
+ return run_one_benchmark(args, output_path=output_path)
+ else:
+ return run_multi_benchmark(args, output_path=output_path)


if __name__ == '__main__':
args = Arguments.from_args(parse_args())
- run_perf_benchmark(args)
+ metrics_result, percentile_result = run_perf_benchmark(args)
+ print(metrics_result)
+ print(percentile_result)
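run_perf_benchmark now normalizes dict or Namespace input, sets up logging and the wandb/swanlab trackers once, and dispatches to run_one_benchmark or run_multi_benchmark depending on how many number/parallel pairs were given, returning the summary data. A hedged sketch of invoking it with a dict; the 'parallel' and 'number' keys come from this diff, while 'url', 'model', 'api' and 'dataset' are assumed Arguments fields used here only for illustration:

    from evalscope.perf.main import run_perf_benchmark

    # Hypothetical task config for a concurrency sweep (placeholder endpoint and model).
    task_cfg = {
        'url': 'http://127.0.0.1:8801/v1/chat/completions',  # assumed field
        'model': 'qwen2.5',                                  # assumed field
        'api': 'openai',                                     # assumed field
        'dataset': 'openqa',                                 # assumed field
        'parallel': [1, 5, 10],
        'number': [10, 50, 100],
    }

    results = run_perf_benchmark(task_cfg)
    # Single pair: a (metrics_result, percentile_result) tuple; a sweep returns one
    # result per (number, parallel) pair and prints a rich summary at the end.
    print(results)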
evalscope/perf/utils/benchmark_util.py
CHANGED

@@ -51,6 +51,24 @@ class BenchmarkData:
self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)


+ class Metrics:
+ TIME_TAKEN_FOR_TESTS = 'Time taken for tests (s)'
+ NUMBER_OF_CONCURRENCY = 'Number of concurrency'
+ TOTAL_REQUESTS = 'Total requests'
+ SUCCEED_REQUESTS = 'Succeed requests'
+ FAILED_REQUESTS = 'Failed requests'
+ OUTPUT_TOKEN_THROUGHPUT = 'Output token throughput (tok/s)'
+ TOTAL_TOKEN_THROUGHPUT = 'Total token throughput (tok/s)'
+ REQUEST_THROUGHPUT = 'Request throughput (req/s)'
+ AVERAGE_LATENCY = 'Average latency (s)'
+ AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
+ AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+ AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
+ AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
+ AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
+ AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
+
+
@dataclass
class BenchmarkMetrics:
concurrency: int = 0

@@ -125,20 +143,20 @@ class BenchmarkMetrics:

def create_message(self, default_ndigits=4):
message = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+ Metrics.TIME_TAKEN_FOR_TESTS: round(self.total_time, default_ndigits),
+ Metrics.NUMBER_OF_CONCURRENCY: self.concurrency,
+ Metrics.TOTAL_REQUESTS: int(self.n_total_queries),
+ Metrics.SUCCEED_REQUESTS: self.n_succeed_queries,
+ Metrics.FAILED_REQUESTS: self.n_failed_queries,
+ Metrics.OUTPUT_TOKEN_THROUGHPUT: round(self.avg_output_token_per_seconds, default_ndigits),
+ Metrics.TOTAL_TOKEN_THROUGHPUT: round(self.avg_total_token_per_seconds, default_ndigits),
+ Metrics.REQUEST_THROUGHPUT: round(self.qps, default_ndigits),
+ Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
+ Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
+ Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+ Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
+ Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
+ Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
+ Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
}
return message
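The new Metrics constants become the human-readable keys of the summary dict that summary_result writes to benchmark_summary.json (see the db_util diff below). A small sketch of reading that file back after a run; the directory is a placeholder, only the file name and key strings come from the diffs:

    import json

    # Hypothetical path to a finished run's summary file.
    summary_path = 'outputs/benchmark_summary.json'

    with open(summary_path, encoding='utf-8') as f:
        summary = json.load(f)

    # Keys follow the Metrics constants introduced above, e.g.:
    print(summary.get('Output token throughput (tok/s)'))
    print(summary.get('Average time to first token (s)'))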
evalscope/perf/utils/db_util.py
CHANGED
@@ -7,7 +7,7 @@ import sqlite3
import sys
from datetime import datetime
from tabulate import tabulate
- from typing import Dict, List
+ from typing import Dict, List, Tuple

from evalscope.perf.arguments import Arguments
from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics

@@ -111,6 +111,18 @@ def get_result_db_path(args: Arguments):
return result_db_path


+ class PercentileMetrics:
+ TTFT = 'TTFT (s)'
+ ITL = 'ITL (s)'
+ TPOT = 'TPOT (s)'
+ LATENCY = 'Latency (s)'
+ INPUT_TOKENS = 'Input tokens'
+ OUTPUT_TOKENS = 'Output tokens'
+ OUTPUT_THROUGHPUT = 'Output (tok/s)'
+ TOTAL_THROUGHPUT = 'Total (tok/s)'
+ PERCENTILES = 'Percentiles'
+
+
def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
"""
Calculate the percentiles for a specific list of data.

@@ -157,10 +169,6 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
with sqlite3.connect(result_db_path) as con:
rows = con.execute(query_sql).fetchall()

- if len(rows) < len(percentiles):
- logger.info('Too little data to calculate quantiles!')
- return {}
-
# Define index variables for columns
CHUNK_TIMES_INDEX = 1
LATENCY_INDEX = 4

@@ -175,24 +183,25 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))

metrics = {
-
-
+ PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+ PercentileMetrics.ITL:
inter_token_latencies_all,
-
+ PercentileMetrics.TPOT:
[(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
for row in rows],
-
-
-
-
+ PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
+ PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
+ PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+ PercentileMetrics.OUTPUT_THROUGHPUT:
[(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
for row in rows],
-
-
+ PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
+ / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+ for row in rows]
}

# Calculate percentiles for each metric
- results = {
+ results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
for metric_name, data in metrics.items():
metric_percentiles = calculate_percentiles(data, percentiles)
results[metric_name] = [metric_percentiles[p] for p in percentiles]

@@ -200,16 +209,15 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
return results


- def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
+ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
result_path = os.path.dirname(result_db_path)
write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

-
-
- write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
+ metrics_result = metrics.create_message()
+ write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

# Print summary in a table
- table = tabulate(list(
+ table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
logger.info('\nBenchmarking summary:\n' + table)

# Get percentile results

@@ -223,6 +231,10 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
if args.dataset.startswith('speed_benchmark'):
speed_benchmark_result(result_db_path)

+ logger.info(f'Save the summary to: {result_path}')
+
+ return metrics_result, percentile_result
+

def speed_benchmark_result(result_db_path: str):
query_sql = """
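get_percentile_results now keys its output by the PercentileMetrics labels, adding a 'Percentiles' column with the quantile labels and one aligned list per metric; summary_result tabulates this and returns it alongside the metrics message. An illustrative sketch of that shape; the keys come from PercentileMetrics above, while the quantile levels and numbers are made up:

    # Made-up example of the percentile_result returned by summary_result().
    percentile_result = {
        'Percentiles': ['50%', '90%', '99%'],
        'TTFT (s)': [0.21, 0.41, 0.74],
        'TPOT (s)': [0.03, 0.05, 0.08],
        'Latency (s)': [2.1, 3.3, 5.1],
        # ... plus 'ITL (s)', 'Input tokens', 'Output tokens', 'Output (tok/s)', 'Total (tok/s)'
    }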
evalscope/perf/utils/log_utils.py
CHANGED

@@ -35,7 +35,7 @@ def init_swanlab(args: Arguments) -> None:
name = args.name if args.name else f'{args.model_id}_{current_time}'
swanlab.config.update({'framework': '📏evalscope'})
swanlab.init(
- project='perf_benchmark',
+ project=os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
name=name,
config=args.to_dict(),
mode='local' if args.swanlab_api_key == 'local' else None)