evalscope 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +10 -0
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +4 -2
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
- evalscope/benchmarks/tool_bench/utils.py +202 -0
- evalscope/benchmarks/utils.py +3 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/collections/evaluator.py +76 -26
- evalscope/config.py +46 -15
- evalscope/evaluator/evaluator.py +43 -15
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +3 -3
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +15 -19
- evalscope/perf/arguments.py +14 -5
- evalscope/perf/benchmark.py +0 -6
- evalscope/perf/main.py +65 -15
- evalscope/perf/utils/benchmark_util.py +33 -15
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/log_utils.py +1 -1
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/app.py +47 -34
- evalscope/report/utils.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA +45 -21
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD +46 -36
- tests/cli/test_all.py +3 -0
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +28 -12
- tests/perf/test_perf.py +23 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/models/adapters/server_adapter.py
CHANGED

@@ -43,7 +43,7 @@ class ServerModelAdapter(BaseModelAdapter):
         sig = signature(self.client.chat.completions.create)
         return list(sig.parameters.keys())
 
-    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
         """
         Model prediction func.
 
@@ -65,23 +65,26 @@ class ServerModelAdapter(BaseModelAdapter):
 
     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
         """Process a single input item."""
-        data: list = input_item['data']
-        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
-            query = '\n'.join(''.join(item) for item in data)
-            system_prompt = input_item.get('system_prompt', None)
+        if input_item.get('messages', None):
+            content = input_item['messages']
         else:
-            query = data[0]
-            system_prompt = input_item.get('system_prompt', None)
-
-        content = self.make_request_content(query, system_prompt)
+            content = self.make_request_content(input_item)
         request_json = self.make_request(content, infer_cfg)
         response = self.send_request(request_json)
         return response
 
-    def make_request_content(self,
+    def make_request_content(self, input_item: dict) -> list:
         """
         Make request content for OpenAI API.
         """
+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
+        else:
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)
+
         messages = []
         if system_prompt:
             messages.append({'role': 'system', 'content': system_prompt})
@@ -90,16 +93,9 @@ class ServerModelAdapter(BaseModelAdapter):
 
         return messages
 
-    def make_request(self, content: list, infer_cfg: dict
+    def make_request(self, content: list, infer_cfg: dict) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
-        from evalscope.config import DEFAULT_GENERATION_CONFIG
-        if infer_cfg == DEFAULT_GENERATION_CONFIG:
-            infer_cfg = {
-                'max_tokens': 2048,
-                'temperature': 0.0,
-            }
-
         request_json = {'model': self.model_id, 'messages': content, **infer_cfg}
 
         if self.timeout:
@@ -137,7 +133,7 @@ class ServerModelAdapter(BaseModelAdapter):
             return response.model_dump(exclude_unset=True)
         except Exception as e:
             logger.error(f'Error when calling remote API: {str(e)}')
-            raise
+            raise e
 
     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
         collected_chunks = []
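Taken together, these ServerModelAdapter changes let a single input item carry either a prebuilt OpenAI-style `messages` list (passed through untouched by `process_single_input`) or the legacy `data`/`system_prompt` fields (converted by `make_request_content`). A rough usage sketch follows; the constructor arguments shown for ServerModelAdapter are assumptions, not taken from this diff:

# Hedged usage sketch: ServerModelAdapter's constructor arguments are assumed here.
from evalscope.models.adapters.server_adapter import ServerModelAdapter

adapter = ServerModelAdapter(
    api_url='http://127.0.0.1:8801/v1/chat/completions',  # assumed parameter name/value
    model_id='qwen2.5')                                    # assumed parameter name/value

# New path: a ready-made message list is used as-is by process_single_input.
chat_item = {'messages': [{'role': 'user', 'content': 'What is the capital of France?'}]}

# Legacy path: make_request_content builds the messages from 'data' and 'system_prompt'.
plain_item = {'data': ['What is the capital of France?'], 'system_prompt': 'Answer briefly.'}

responses = adapter.predict([chat_item, plain_item], infer_cfg={'max_tokens': 128, 'temperature': 0.0})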
evalscope/perf/arguments.py
CHANGED
@@ -3,7 +3,7 @@ import json
 import os
 import sys
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_WORK_DIR
 
@@ -27,8 +27,8 @@ class Arguments:
     no_test_connection: bool = False  # Test the connection before starting the benchmark
 
     # Performance and parallelism
-    number: int = 1000  # Number of requests to be made
-    parallel: int = 1  # Number of parallel requests
+    number: Union[int, List[int]] = 1000  # Number of requests to be made
+    parallel: Union[int, List[int]] = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
 
     # Logging and debugging
@@ -98,6 +98,15 @@ class Arguments:
         if self.apply_chat_template is None:
             self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
 
+        # Set number and parallel to lists if they are integers
+        if isinstance(self.number, int):
+            self.number = [self.number]
+        if isinstance(self.parallel, int):
+            self.parallel = [self.parallel]
+        assert len(self.number) == len(
+            self.parallel
+        ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
+
     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
 
@@ -143,8 +152,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501
 
     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
-    parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
+    parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
+    parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1')  # noqa: E501
    parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
 
    # Logging and debugging
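The `__post_init__` hook above normalizes scalar `number`/`parallel` values into single-element lists and insists that explicit lists have matching lengths, so one config object can describe a whole concurrency sweep. A small illustrative sketch (the `model` and `url` field names and values are assumptions about parts of the dataclass not shown in these hunks; any omitted fields are assumed to have defaults):

# Illustrative sketch; 'model' and 'url' field names/values are assumed.
from evalscope.perf.arguments import Arguments

single = Arguments(model='qwen2.5', url='http://127.0.0.1:8801/v1/chat/completions',
                   number=100, parallel=4)
print(single.number, single.parallel)  # [100] [4] after normalization

sweep = Arguments(model='qwen2.5', url='http://127.0.0.1:8801/v1/chat/completions',
                  number=[10, 50, 100], parallel=[1, 5, 10])  # lengths must match or the assert fires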
evalscope/perf/benchmark.py
CHANGED
@@ -18,7 +18,6 @@ from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
 from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
 from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
 from evalscope.perf.utils.local_server import start_app
-from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -116,11 +115,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:
 
     result_db_path = get_result_db_path(args)
 
-    if args.wandb_api_key:
-        init_wandb(args)
-    if args.swanlab_api_key:
-        init_swanlab(args)
-
     collected_benchmark_data = []
 
     with tqdm(desc='Processing', total=args.number) as pbar:
evalscope/perf/main.py
CHANGED
@@ -1,32 +1,32 @@
 import asyncio
+import copy
 import os
 import platform
+import time
 from argparse import Namespace
 
-from evalscope.perf.arguments import Arguments, parse_args
-from evalscope.perf.benchmark import benchmark
-from evalscope.perf.utils.db_util import get_output_path
-from evalscope.perf.utils.handler import add_signal_handlers
+from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything
+from .arguments import Arguments, parse_args
+from .benchmark import benchmark
+from .utils.db_util import get_output_path
+from .utils.handler import add_signal_handlers
+from .utils.rich_display import print_summary
 
 logger = get_logger()
 
 
-def run_perf_benchmark(args):
-    if isinstance(args, dict):
-        args = Arguments(**args)
-    elif isinstance(args, Namespace):
-        args = Arguments.from_args(args)
-
-    if args.seed is not None:
-        seed_everything(args.seed)
+def run_one_benchmark(args: Arguments, output_path: str = None):
+    if isinstance(args.parallel, list):
+        args.parallel = args.parallel[0]
+    if isinstance(args.number, list):
+        args.number = args.number[0]
 
     # Setup logger and output
-    args.outputs_dir = get_output_path(args)
-    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
+    args.outputs_dir = output_path
 
-    logger.info('Starting benchmark
+    logger.info('Starting benchmark with args: ')
     logger.info(args)
 
     if platform.system() == 'Windows':
@@ -39,6 +39,56 @@ def run_perf_benchmark(args):
     return loop.run_until_complete(benchmark(args))
 
 
+def run_multi_benchmark(args: Arguments, output_path: str = None):
+    results = []
+    number_list = copy.deepcopy(args.number)
+    parallel_list = copy.deepcopy(args.parallel)
+    for i, (number, parallel) in enumerate(zip(number_list, parallel_list)):
+        args.number = number
+        args.parallel = parallel
+        # Set up output path for each run
+        cur_output_path = os.path.join(output_path, f'parallel_{parallel}_number_{number}')
+        os.makedirs(cur_output_path, exist_ok=True)
+        # Start the benchmark
+        metrics_result = run_one_benchmark(args, output_path=cur_output_path)
+        # Save the results
+        results.append(metrics_result)
+        # Sleep between runs to avoid overwhelming the server
+        if i < len(number_list) - 1:
+            logger.info('Sleeping for 5 seconds before the next run...')
+            time.sleep(5)
+    # Analyze results
+    print_summary(results, args.model_id)
+    return results
+
+
+def run_perf_benchmark(args):
+    # Check if args is a dictionary or Namespace
+    if isinstance(args, dict):
+        args = Arguments(**args)
+    elif isinstance(args, Namespace):
+        args = Arguments.from_args(args)
+
+    if args.seed is not None:
+        seed_everything(args.seed)
+
+    # Initialize output directory
+    output_path = get_output_path(args)
+    configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
+
+    # Initialize wandb and swanlab
+    if args.wandb_api_key:
+        init_wandb(args)
+    if args.swanlab_api_key:
+        init_swanlab(args)
+
+    # Start benchmark
+    if len(args.number) == 1:
+        return run_one_benchmark(args, output_path=output_path)
+    else:
+        return run_multi_benchmark(args, output_path=output_path)
+
+
 if __name__ == '__main__':
     args = Arguments.from_args(parse_args())
     metrics_result, percentile_result = run_perf_benchmark(args)
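With this split, `run_perf_benchmark` handles config normalization, logging, and tracker setup once, then dispatches to `run_one_benchmark` for a single run or `run_multi_benchmark` for a sweep over (number, parallel) pairs, finishing with `print_summary`. A hedged usage sketch (dict keys other than `number` and `parallel` mirror Arguments fields and are assumed here):

# Hedged sketch: keys map onto evalscope.perf.arguments.Arguments; some names/values are assumed.
from evalscope.perf.main import run_perf_benchmark

task_cfg = {
    'url': 'http://127.0.0.1:8801/v1/chat/completions',  # assumed endpoint
    'model': 'qwen2.5',                                  # assumed field name/value
    'api': 'openai',                                     # assumed field name/value
    'dataset': 'openqa',                                 # assumed field name/value
    'number': [10, 50, 100],                             # one run per (number, parallel) pair
    'parallel': [1, 5, 10],
}

results = run_perf_benchmark(task_cfg)  # a sweep returns the per-run results collected by run_multi_benchmark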
evalscope/perf/utils/benchmark_util.py
CHANGED

@@ -51,6 +51,24 @@ class BenchmarkData:
         self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
 
 
+class Metrics:
+    TIME_TAKEN_FOR_TESTS = 'Time taken for tests (s)'
+    NUMBER_OF_CONCURRENCY = 'Number of concurrency'
+    TOTAL_REQUESTS = 'Total requests'
+    SUCCEED_REQUESTS = 'Succeed requests'
+    FAILED_REQUESTS = 'Failed requests'
+    OUTPUT_TOKEN_THROUGHPUT = 'Output token throughput (tok/s)'
+    TOTAL_TOKEN_THROUGHPUT = 'Total token throughput (tok/s)'
+    REQUEST_THROUGHPUT = 'Request throughput (req/s)'
+    AVERAGE_LATENCY = 'Average latency (s)'
+    AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
+    AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+    AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
+    AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
+    AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
+    AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
+
+
 @dataclass
 class BenchmarkMetrics:
     concurrency: int = 0
@@ -125,20 +143,20 @@ class BenchmarkMetrics:
 
     def create_message(self, default_ndigits=4):
         message = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            Metrics.TIME_TAKEN_FOR_TESTS: round(self.total_time, default_ndigits),
+            Metrics.NUMBER_OF_CONCURRENCY: self.concurrency,
+            Metrics.TOTAL_REQUESTS: int(self.n_total_queries),
+            Metrics.SUCCEED_REQUESTS: self.n_succeed_queries,
+            Metrics.FAILED_REQUESTS: self.n_failed_queries,
+            Metrics.OUTPUT_TOKEN_THROUGHPUT: round(self.avg_output_token_per_seconds, default_ndigits),
+            Metrics.TOTAL_TOKEN_THROUGHPUT: round(self.avg_total_token_per_seconds, default_ndigits),
+            Metrics.REQUEST_THROUGHPUT: round(self.qps, default_ndigits),
+            Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
+            Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
+            Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+            Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
+            Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
+            Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
+            Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
         }
         return message
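Because the summary keys now live in `Metrics`, downstream code can read a saved report without re-typing the display strings. A small hedged sketch (the output directory below is illustrative; `summary_result` in db_util.py writes `benchmark_summary.json` into each run's result path):

import json

from evalscope.perf.utils.benchmark_util import Metrics

# Path is illustrative; summary_result() writes benchmark_summary.json under each run's output dir.
with open('outputs/parallel_5_number_50/benchmark_summary.json') as f:
    summary = json.load(f)

print(summary[Metrics.REQUEST_THROUGHPUT], 'req/s')
print(summary[Metrics.AVERAGE_TIME_TO_FIRST_TOKEN], 's to first token')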
evalscope/perf/utils/db_util.py
CHANGED
@@ -111,6 +111,18 @@ def get_result_db_path(args: Arguments):
     return result_db_path
 
 
+class PercentileMetrics:
+    TTFT = 'TTFT (s)'
+    ITL = 'ITL (s)'
+    TPOT = 'TPOT (s)'
+    LATENCY = 'Latency (s)'
+    INPUT_TOKENS = 'Input tokens'
+    OUTPUT_TOKENS = 'Output tokens'
+    OUTPUT_THROUGHPUT = 'Output (tok/s)'
+    TOTAL_THROUGHPUT = 'Total (tok/s)'
+    PERCENTILES = 'Percentiles'
+
+
 def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
     """
     Calculate the percentiles for a specific list of data.
@@ -157,10 +169,6 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()
 
-    if len(rows) < len(percentiles):
-        logger.info('Too little data to calculate quantiles!')
-        return {}
-
     # Define index variables for columns
     CHUNK_TIMES_INDEX = 1
     LATENCY_INDEX = 4
@@ -175,24 +183,25 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
         inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
 
     metrics = {
-
-
+        PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        PercentileMetrics.ITL:
         inter_token_latencies_all,
-
+        PercentileMetrics.TPOT:
         [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
         for row in rows],
-
-
-
-
+        PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
+        PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
+        PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        PercentileMetrics.OUTPUT_THROUGHPUT:
         [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
         for row in rows],
-
-
+        PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
+                                              / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+                                             for row in rows]
     }
 
     # Calculate percentiles for each metric
-    results = {
+    results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
     for metric_name, data in metrics.items():
         metric_percentiles = calculate_percentiles(data, percentiles)
         results[metric_name] = [metric_percentiles[p] for p in percentiles]
@@ -205,7 +214,6 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
 
     metrics_result = metrics.create_message()
-    metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
     write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))
 
     # Print summary in a table
@@ -223,6 +231,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)
 
+    logger.info(f'Save the summary to: {result_path}')
+
     return metrics_result, percentile_result
 
 
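For reference, the per-metric percentile computation that feeds this table can be written in a few lines with numpy; the helper below is a sketch assuming plain linear-interpolation percentiles with NaN filtering, not necessarily the exact behavior of `calculate_percentiles`:

from typing import Dict, List

import numpy as np


def calculate_percentiles_sketch(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    """Map each requested percentile (e.g. 99) to its value in `data`, ignoring NaNs. Illustrative only."""
    clean = np.asarray([x for x in data if not np.isnan(x)], dtype=float)
    if clean.size == 0:
        return {p: float('nan') for p in percentiles}
    values = np.percentile(clean, percentiles)
    return {p: float(v) for p, v in zip(percentiles, values)}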
evalscope/perf/utils/log_utils.py
CHANGED

@@ -35,7 +35,7 @@ def init_swanlab(args: Arguments) -> None:
     name = args.name if args.name else f'{args.model_id}_{current_time}'
     swanlab.config.update({'framework': '📏evalscope'})
     swanlab.init(
-        project='perf_benchmark',
+        project=os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
         name=name,
         config=args.to_dict(),
         mode='local' if args.swanlab_api_key == 'local' else None)
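The swanlab project name now falls back to 'perf_benchmark' only when SWANLAB_PROJ_NAME is unset, so a project can be chosen per environment without touching the config:

import os

# Optional: choose the swanlab project before starting a tracked run; value is illustrative.
os.environ.setdefault('SWANLAB_PROJ_NAME', 'llm-perf-sweeps')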
evalscope/perf/utils/rich_display.py
ADDED

@@ -0,0 +1,186 @@
+# the following code is largely adapted from https://github.com/lework/llm-benchmark
+
+import numpy as np
+from rich.console import Console
+from rich.panel import Panel
+from rich.style import Style
+from rich.table import Table
+from rich.text import Text
+
+from evalscope.utils.logger import get_logger
+from .benchmark_util import Metrics
+from .db_util import PercentileMetrics
+
+logger = get_logger()
+
+
+def analyze_results(all_results):
+    """Analyze all test results and generate a summary report"""
+    summary = []
+    total_tokens = 0
+    total_time = 0
+
+    for result in all_results:
+        total_metrics = result[0]
+        percentile_metrics = result[1]
+        percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
+        try:
+            concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
+            rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
+            avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
+            p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
+            avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
+            avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
+            p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
+            success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+                            / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
+            avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
+            p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]
+
+            # Ensure all values are valid numbers
+            if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
+                logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
+                continue
+
+            summary.append([
+                concurrency,
+                f'{rps:.2f}' if rps is not None else 'N/A',
+                f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
+                f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
+                f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
+                f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
+                f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
+                f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
+                f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
+                f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
+            ])
+
+            total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
+                Metrics.SUCCEED_REQUESTS, 0)
+            total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
+        except Exception as e:
+            logger.warning(
+                f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}")
+            continue
+
+    if not summary:
+        logger.warning('Error: No valid test result data')
+        return [], 0, 0
+
+    return summary, total_tokens, total_time
+
+
+def print_summary(all_results, model_name):
+    """Print test results summary"""
+    summary, total_tokens, total_time = analyze_results(all_results)
+
+    if not summary:
+        logger.warning('No available test result data to display')
+        return
+
+    console = Console(width=100)  # Set fixed width
+
+    # Create title panel
+    title = Text('Performance Test Summary Report', style='bold')
+    console.print(Panel(title, width=60))
+
+    # Print basic information
+    basic_info = Table(show_header=False, width=60)
+    basic_info.add_column('Name', style='cyan', width=25)
+    basic_info.add_column('Value', style='green', width=35)
+
+    basic_info.add_row('Model', model_name)
+    basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
+    basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
+    basic_info.add_row('Avg Output Rate', f'{total_tokens/total_time:.2f} tokens/sec')
+
+    console.print('\nBasic Information:')
+    console.print(basic_info)
+
+    # Create detailed performance metrics table
+    table = Table(
+        title='Detailed Performance Metrics',
+        show_header=True,
+        header_style='bold cyan',
+        border_style='blue',
+        width=100,  # Set total table width
+        pad_edge=False,  # Reduce edge padding
+        min_width=60,  # Minimum width
+    )
+
+    # Add columns (set fixed column widths)
+    table.add_column('Conc.', justify='right', style='cyan')
+    table.add_column('RPS', justify='right')
+    table.add_column('Avg Lat.(s)', justify='right')
+    table.add_column('P99 Lat.(s)', justify='right')
+    table.add_column('Gen. toks/s', justify='right')
+    table.add_column('Avg TTFT(s)', justify='right')
+    table.add_column('P99 TTFT(s)', justify='right')
+    table.add_column('Avg TPOT(s)', justify='right')
+    table.add_column('P99 TPOT(s)', justify='right')
+    table.add_column('Success Rate', justify='right', style='green')
+
+    # Add data rows
+    for row in summary:
+        try:
+            # Set row style based on success rate
+            success_rate = float(row[6].rstrip('%'))
+            row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
+
+            table.add_row(
+                str(row[0]),  # Concurrency
+                f'{float(row[1]):.2f}',  # RPS
+                f'{float(row[2]):.3f}',  # Average Latency
+                f'{float(row[3]):.3f}',  # P99 Latency
+                f'{float(row[4]):.2f}',  # Average TPS
+                f'{float(row[5]):.3f}',  # First Token Latency
+                f'{float(row[7]):.3f}',  # P99 TTFT
+                f'{float(row[8]):.3f}',  # Average TPOT
+                f'{float(row[9]):.3f}',  # P99 TPOT
+                row[6],  # Success Rate
+                style=row_style)
+        except ValueError as e:
+            console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
+            continue
+
+    console.print('\n')
+    console.print(table)
+
+    # Calculate and display best performance configuration
+    try:
+        best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
+        best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])
+
+        perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
+        perf_info.add_column('Metric', style='cyan', width=20)
+        perf_info.add_column('Value', style='green', width=40)
+
+        perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
+        perf_info.add_row('Lowest Latency',
+                          f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
+
+        console.print('\n')
+        console.print(perf_info)
+
+        # Performance recommendations
+        recommendations = []
+        if best_rps_idx == len(summary) - 1:
+            recommendations.append(
+                'The system seems not to have reached its performance bottleneck, try higher concurrency')
+        elif best_rps_idx == 0:
+            recommendations.append('Consider lowering concurrency, current load may be too high')
+        else:
+            recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')
+
+        success_rate = float(summary[-1][6][:-1])
+        if success_rate < 95:
+            recommendations.append(
+                'Success rate is low at high concurrency, check system resources or reduce concurrency')
+
+        recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
+        console.print(recommend_text)
+        for rec in recommendations:
+            console.print(f'• {rec}', style='yellow')
+
+    except Exception as e:
+        console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')