evalscope 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (46)
  1. evalscope/arguments.py +10 -0
  2. evalscope/backend/rag_eval/utils/llm.py +1 -1
  3. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  4. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  5. evalscope/benchmarks/data_adapter.py +4 -2
  6. evalscope/benchmarks/drop/__init__.py +0 -0
  7. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  8. evalscope/benchmarks/drop/utils.py +59 -0
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  10. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  11. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  12. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
  13. evalscope/benchmarks/tool_bench/utils.py +202 -0
  14. evalscope/benchmarks/utils.py +3 -2
  15. evalscope/benchmarks/winogrande/__init__.py +0 -0
  16. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  17. evalscope/collections/evaluator.py +76 -26
  18. evalscope/config.py +46 -15
  19. evalscope/evaluator/evaluator.py +43 -15
  20. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  21. evalscope/metrics/llm_judge.py +3 -3
  22. evalscope/metrics/rouge_metric.py +11 -13
  23. evalscope/models/adapters/chat_adapter.py +51 -34
  24. evalscope/models/adapters/server_adapter.py +15 -19
  25. evalscope/perf/arguments.py +14 -5
  26. evalscope/perf/benchmark.py +0 -6
  27. evalscope/perf/main.py +65 -15
  28. evalscope/perf/utils/benchmark_util.py +33 -15
  29. evalscope/perf/utils/db_util.py +25 -15
  30. evalscope/perf/utils/log_utils.py +1 -1
  31. evalscope/perf/utils/rich_display.py +186 -0
  32. evalscope/report/app.py +47 -34
  33. evalscope/report/utils.py +1 -1
  34. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  35. evalscope/utils/deprecation_utils.py +42 -0
  36. evalscope/version.py +2 -2
  37. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA +45 -21
  38. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD +46 -36
  39. tests/cli/test_all.py +3 -0
  40. tests/cli/test_collection.py +2 -1
  41. tests/cli/test_run.py +28 -12
  42. tests/perf/test_perf.py +23 -0
  43. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
  44. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
  45. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
  46. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/models/adapters/server_adapter.py CHANGED
@@ -43,7 +43,7 @@ class ServerModelAdapter(BaseModelAdapter):
         sig = signature(self.client.chat.completions.create)
         return list(sig.parameters.keys())
 
-    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
         """
         Model prediction func.
 
@@ -65,23 +65,26 @@ class ServerModelAdapter(BaseModelAdapter):
 
     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
         """Process a single input item."""
-        data: list = input_item['data']
-        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
-            query = '\n'.join(''.join(item) for item in data)
-            system_prompt = input_item.get('system_prompt', None)
+        if input_item.get('messages', None):
+            content = input_item['messages']
         else:
-            query = data[0]
-            system_prompt = input_item.get('system_prompt', None)
-
-        content = self.make_request_content(query, system_prompt)
+            content = self.make_request_content(input_item)
         request_json = self.make_request(content, infer_cfg)
         response = self.send_request(request_json)
         return response
 
-    def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> list:
+    def make_request_content(self, input_item: dict) -> list:
         """
         Make request content for OpenAI API.
         """
+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
+        else:
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)
+
         messages = []
         if system_prompt:
             messages.append({'role': 'system', 'content': system_prompt})
@@ -90,16 +93,9 @@ class ServerModelAdapter(BaseModelAdapter):
 
         return messages
 
-    def make_request(self, content: list, infer_cfg: dict = {}) -> dict:
+    def make_request(self, content: list, infer_cfg: dict) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
-        from evalscope.config import DEFAULT_GENERATION_CONFIG
-        if infer_cfg == DEFAULT_GENERATION_CONFIG:
-            infer_cfg = {
-                'max_tokens': 2048,
-                'temperature': 0.0,
-            }
-
         request_json = {'model': self.model_id, 'messages': content, **infer_cfg}
 
         if self.timeout:
@@ -137,7 +133,7 @@ class ServerModelAdapter(BaseModelAdapter):
                 return response.model_dump(exclude_unset=True)
             except Exception as e:
                 logger.error(f'Error when calling remote API: {str(e)}')
-                raise
+                raise e
 
     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
         collected_chunks = []
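
Note on the refactor above: process_single_input now forwards pre-built chat messages unchanged and otherwise hands the whole input_item to make_request_content, which extracts the prompt fields itself. A minimal sketch of the two input shapes implied by the diff (all values are hypothetical):

    # Shape 1: ready-made chat messages are passed through as the request content.
    item_with_messages = {
        'messages': [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': 'What is the capital of France?'},
        ]
    }

    # Shape 2: raw 'data' plus an optional 'system_prompt', assembled into
    # messages by make_request_content.
    item_with_data = {
        'data': ['What is the capital of France?'],
        'system_prompt': 'You are a helpful assistant.',
    }
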
evalscope/perf/arguments.py CHANGED
@@ -3,7 +3,7 @@ import json
 import os
 import sys
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_WORK_DIR
 
@@ -27,8 +27,8 @@ class Arguments:
     no_test_connection: bool = False  # Test the connection before starting the benchmark
 
     # Performance and parallelism
-    number: int = 1000  # Number of requests to be made
-    parallel: int = 1  # Number of parallel requests
+    number: Union[int, List[int]] = 1000  # Number of requests to be made
+    parallel: Union[int, List[int]] = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
 
     # Logging and debugging
@@ -98,6 +98,15 @@ class Arguments:
         if self.apply_chat_template is None:
             self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
 
+        # Set number and parallel to lists if they are integers
+        if isinstance(self.number, int):
+            self.number = [self.number]
+        if isinstance(self.parallel, int):
+            self.parallel = [self.parallel]
+        assert len(self.number) == len(
+            self.parallel
+        ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
+
     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
 
@@ -143,8 +152,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501
 
     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
-    parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
+    parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
+    parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1')  # noqa: E501
    parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
 
     # Logging and debugging
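
With nargs='+', both --number and --parallel now accept one value or several, and __post_init__ wraps plain integers into one-element lists before checking that the two lists have equal length. A hedged sketch of argument combinations under the new validation (only the two fields in question; other required Arguments settings are omitted):

    # Normalized to number=[1000], parallel=[1]: a single run, as before.
    cfg_single = {'number': 1000, 'parallel': 1}

    # Three runs, pairing number[i] with parallel[i].
    cfg_sweep = {'number': [10, 20, 40], 'parallel': [1, 2, 4]}

    # Fails the length assertion in __post_init__ (2 numbers vs. 1 parallel value).
    cfg_mismatch = {'number': [10, 20], 'parallel': [1]}
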
evalscope/perf/benchmark.py CHANGED
@@ -18,7 +18,6 @@ from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
 from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
 from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
 from evalscope.perf.utils.local_server import start_app
-from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -116,11 +115,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:
 
     result_db_path = get_result_db_path(args)
 
-    if args.wandb_api_key:
-        init_wandb(args)
-    if args.swanlab_api_key:
-        init_swanlab(args)
-
     collected_benchmark_data = []
 
     with tqdm(desc='Processing', total=args.number) as pbar:
evalscope/perf/main.py CHANGED
@@ -1,32 +1,32 @@
 import asyncio
+import copy
 import os
 import platform
+import time
 from argparse import Namespace
 
-from evalscope.perf.arguments import Arguments, parse_args
-from evalscope.perf.benchmark import benchmark
-from evalscope.perf.utils.db_util import get_output_path
-from evalscope.perf.utils.handler import add_signal_handlers
+from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything
+from .arguments import Arguments, parse_args
+from .benchmark import benchmark
+from .utils.db_util import get_output_path
+from .utils.handler import add_signal_handlers
+from .utils.rich_display import print_summary
 
 logger = get_logger()
 
 
-def run_perf_benchmark(args):
-    if isinstance(args, dict):
-        args = Arguments(**args)
-    elif isinstance(args, Namespace):
-        args = Arguments.from_args(args)
-
-    if args.seed is not None:
-        seed_everything(args.seed)
+def run_one_benchmark(args: Arguments, output_path: str = None):
+    if isinstance(args.parallel, list):
+        args.parallel = args.parallel[0]
+    if isinstance(args.number, list):
+        args.number = args.number[0]
 
     # Setup logger and output
-    args.outputs_dir = get_output_path(args)
-    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
+    args.outputs_dir = output_path
 
-    logger.info('Starting benchmark...')
+    logger.info('Starting benchmark with args: ')
     logger.info(args)
 
     if platform.system() == 'Windows':
@@ -39,6 +39,56 @@ def run_perf_benchmark(args):
     return loop.run_until_complete(benchmark(args))
 
 
+def run_multi_benchmark(args: Arguments, output_path: str = None):
+    results = []
+    number_list = copy.deepcopy(args.number)
+    parallel_list = copy.deepcopy(args.parallel)
+    for i, (number, parallel) in enumerate(zip(number_list, parallel_list)):
+        args.number = number
+        args.parallel = parallel
+        # Set up output path for each run
+        cur_output_path = os.path.join(output_path, f'parallel_{parallel}_number_{number}')
+        os.makedirs(cur_output_path, exist_ok=True)
+        # Start the benchmark
+        metrics_result = run_one_benchmark(args, output_path=cur_output_path)
+        # Save the results
+        results.append(metrics_result)
+        # Sleep between runs to avoid overwhelming the server
+        if i < len(number_list) - 1:
+            logger.info('Sleeping for 5 seconds before the next run...')
+            time.sleep(5)
+    # Analyze results
+    print_summary(results, args.model_id)
+    return results
+
+
+def run_perf_benchmark(args):
+    # Check if args is a dictionary or Namespace
+    if isinstance(args, dict):
+        args = Arguments(**args)
+    elif isinstance(args, Namespace):
+        args = Arguments.from_args(args)
+
+    if args.seed is not None:
+        seed_everything(args.seed)
+
+    # Initialize output directory
+    output_path = get_output_path(args)
+    configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
+
+    # Initialize wandb and swanlab
+    if args.wandb_api_key:
+        init_wandb(args)
+    if args.swanlab_api_key:
+        init_swanlab(args)
+
+    # Start benchmark
+    if len(args.number) == 1:
+        return run_one_benchmark(args, output_path=output_path)
+    else:
+        return run_multi_benchmark(args, output_path=output_path)
+
+
 if __name__ == '__main__':
     args = Arguments.from_args(parse_args())
     metrics_result, percentile_result = run_perf_benchmark(args)
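
For orientation: run_perf_benchmark still accepts a dict, an argparse Namespace, or an Arguments instance; a single (number, parallel) pair runs once, while several pairs go through run_multi_benchmark, which writes each run to its own parallel_{p}_number_{n} subdirectory and prints the rich summary at the end. A hedged usage sketch, assuming the existing url and model fields of Arguments (endpoint and model name are placeholders):

    from evalscope.perf.main import run_perf_benchmark

    task_cfg = {
        'url': 'http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
        'model': 'my-model',                                 # placeholder model name
        'parallel': [1, 2, 4],
        'number': [10, 20, 40],
    }
    # One pair returns (metrics_result, percentile_result); several pairs return
    # a list with one such tuple per run, summarized via print_summary first.
    results = run_perf_benchmark(task_cfg)
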
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -51,6 +51,24 @@ class BenchmarkData:
         self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
 
 
+class Metrics:
+    TIME_TAKEN_FOR_TESTS = 'Time taken for tests (s)'
+    NUMBER_OF_CONCURRENCY = 'Number of concurrency'
+    TOTAL_REQUESTS = 'Total requests'
+    SUCCEED_REQUESTS = 'Succeed requests'
+    FAILED_REQUESTS = 'Failed requests'
+    OUTPUT_TOKEN_THROUGHPUT = 'Output token throughput (tok/s)'
+    TOTAL_TOKEN_THROUGHPUT = 'Total token throughput (tok/s)'
+    REQUEST_THROUGHPUT = 'Request throughput (req/s)'
+    AVERAGE_LATENCY = 'Average latency (s)'
+    AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
+    AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+    AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
+    AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
+    AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
+    AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
+
+
 @dataclass
 class BenchmarkMetrics:
     concurrency: int = 0
@@ -125,20 +143,20 @@ class BenchmarkMetrics:
 
     def create_message(self, default_ndigits=4):
         message = {
-            'Time taken for tests (s)': round(self.total_time, default_ndigits),
-            'Number of concurrency': self.concurrency,
-            'Total requests': int(self.n_total_queries),
-            'Succeed requests': self.n_succeed_queries,
-            'Failed requests': self.n_failed_queries,
-            'Output token throughput (tok/s)': round(self.avg_output_token_per_seconds, default_ndigits),
-            'Total token throughput (tok/s)': round(self.avg_total_token_per_seconds, default_ndigits),
-            'Request throughput (req/s)': round(self.qps, default_ndigits),
-            'Average latency (s)': round(self.avg_latency, default_ndigits),
-            'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
-            'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
-            'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
-            'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
+            Metrics.TIME_TAKEN_FOR_TESTS: round(self.total_time, default_ndigits),
+            Metrics.NUMBER_OF_CONCURRENCY: self.concurrency,
+            Metrics.TOTAL_REQUESTS: int(self.n_total_queries),
+            Metrics.SUCCEED_REQUESTS: self.n_succeed_queries,
+            Metrics.FAILED_REQUESTS: self.n_failed_queries,
+            Metrics.OUTPUT_TOKEN_THROUGHPUT: round(self.avg_output_token_per_seconds, default_ndigits),
+            Metrics.TOTAL_TOKEN_THROUGHPUT: round(self.avg_total_token_per_seconds, default_ndigits),
+            Metrics.REQUEST_THROUGHPUT: round(self.qps, default_ndigits),
+            Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
+            Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
+            Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+            Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
+            Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
+            Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
+            Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
         }
         return message
evalscope/perf/utils/db_util.py CHANGED
@@ -111,6 +111,18 @@ def get_result_db_path(args: Arguments):
     return result_db_path
 
 
+class PercentileMetrics:
+    TTFT = 'TTFT (s)'
+    ITL = 'ITL (s)'
+    TPOT = 'TPOT (s)'
+    LATENCY = 'Latency (s)'
+    INPUT_TOKENS = 'Input tokens'
+    OUTPUT_TOKENS = 'Output tokens'
+    OUTPUT_THROUGHPUT = 'Output (tok/s)'
+    TOTAL_THROUGHPUT = 'Total (tok/s)'
+    PERCENTILES = 'Percentiles'
+
+
 def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
     """
     Calculate the percentiles for a specific list of data.
@@ -157,10 +169,6 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()
 
-    if len(rows) < len(percentiles):
-        logger.info('Too little data to calculate quantiles!')
-        return {}
-
     # Define index variables for columns
     CHUNK_TIMES_INDEX = 1
     LATENCY_INDEX = 4
@@ -175,24 +183,25 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
         inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
 
     metrics = {
-        'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
-        'ITL (s)':
+        PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        PercentileMetrics.ITL:
         inter_token_latencies_all,
-        'TPOT (s)':
+        PercentileMetrics.TPOT:
         [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
          for row in rows],
-        'Latency (s)': [row[LATENCY_INDEX] for row in rows],
-        'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
-        'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
-        'Output throughput(tok/s)':
+        PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
+        PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
+        PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        PercentileMetrics.OUTPUT_THROUGHPUT:
         [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
          for row in rows],
-        'Total throughput(tok/s)': [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
-                                     / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan') for row in rows]
+        PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
+                                              / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+                                             for row in rows]
     }
 
     # Calculate percentiles for each metric
-    results = {'Percentile': [f'{p}%' for p in percentiles]}
+    results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
     for metric_name, data in metrics.items():
         metric_percentiles = calculate_percentiles(data, percentiles)
         results[metric_name] = [metric_percentiles[p] for p in percentiles]
@@ -205,7 +214,6 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
 
     metrics_result = metrics.create_message()
-    metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
     write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))
 
     # Print summary in a table
@@ -223,6 +231,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)
 
+    logger.info(f'Save the summary to: {result_path}')
+
     return metrics_result, percentile_result
 
 
evalscope/perf/utils/log_utils.py CHANGED
@@ -35,7 +35,7 @@ def init_swanlab(args: Arguments) -> None:
     name = args.name if args.name else f'{args.model_id}_{current_time}'
     swanlab.config.update({'framework': '📏evalscope'})
     swanlab.init(
-        project='perf_benchmark',
+        project=os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
         name=name,
         config=args.to_dict(),
         mode='local' if args.swanlab_api_key == 'local' else None)
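
The change above makes the SwanLab project name configurable through an environment variable, falling back to 'perf_benchmark' when unset. A one-line sketch of overriding it before starting a run (the project name is hypothetical):

    import os

    os.environ['SWANLAB_PROJ_NAME'] = 'my_perf_project'  # read by init_swanlab via os.getenv
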
evalscope/perf/utils/rich_display.py ADDED
@@ -0,0 +1,186 @@
+# the following code is largely adapted from https://github.com/lework/llm-benchmark
+
+import numpy as np
+from rich.console import Console
+from rich.panel import Panel
+from rich.style import Style
+from rich.table import Table
+from rich.text import Text
+
+from evalscope.utils.logger import get_logger
+from .benchmark_util import Metrics
+from .db_util import PercentileMetrics
+
+logger = get_logger()
+
+
+def analyze_results(all_results):
+    """Analyze all test results and generate a summary report"""
+    summary = []
+    total_tokens = 0
+    total_time = 0
+
+    for result in all_results:
+        total_metrics = result[0]
+        percentile_metrics = result[1]
+        percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
+        try:
+            concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
+            rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
+            avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
+            p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
+            avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
+            avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
+            p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
+            success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+                            / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
+            avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
+            p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]
+
+            # Ensure all values are valid numbers
+            if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
+                logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
+                continue
+
+            summary.append([
+                concurrency,
+                f'{rps:.2f}' if rps is not None else 'N/A',
+                f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
+                f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
+                f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
+                f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
+                f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
+                f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
+                f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
+                f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
+            ])
+
+            total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
+                Metrics.SUCCEED_REQUESTS, 0)
+            total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
+        except Exception as e:
+            logger.warning(
+                f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}")
+            continue
+
+    if not summary:
+        logger.warning('Error: No valid test result data')
+        return [], 0, 0
+
+    return summary, total_tokens, total_time
+
+
+def print_summary(all_results, model_name):
+    """Print test results summary"""
+    summary, total_tokens, total_time = analyze_results(all_results)
+
+    if not summary:
+        logger.warning('No available test result data to display')
+        return
+
+    console = Console(width=100)  # Set fixed width
+
+    # Create title panel
+    title = Text('Performance Test Summary Report', style='bold')
+    console.print(Panel(title, width=60))
+
+    # Print basic information
+    basic_info = Table(show_header=False, width=60)
+    basic_info.add_column('Name', style='cyan', width=25)
+    basic_info.add_column('Value', style='green', width=35)
+
+    basic_info.add_row('Model', model_name)
+    basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
+    basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
+    basic_info.add_row('Avg Output Rate', f'{total_tokens/total_time:.2f} tokens/sec')
+
+    console.print('\nBasic Information:')
+    console.print(basic_info)
+
+    # Create detailed performance metrics table
+    table = Table(
+        title='Detailed Performance Metrics',
+        show_header=True,
+        header_style='bold cyan',
+        border_style='blue',
+        width=100,  # Set total table width
+        pad_edge=False,  # Reduce edge padding
+        min_width=60,  # Minimum width
+    )
+
+    # Add columns (set fixed column widths)
+    table.add_column('Conc.', justify='right', style='cyan')
+    table.add_column('RPS', justify='right')
+    table.add_column('Avg Lat.(s)', justify='right')
+    table.add_column('P99 Lat.(s)', justify='right')
+    table.add_column('Gen. toks/s', justify='right')
+    table.add_column('Avg TTFT(s)', justify='right')
+    table.add_column('P99 TTFT(s)', justify='right')
+    table.add_column('Avg TPOT(s)', justify='right')
+    table.add_column('P99 TPOT(s)', justify='right')
+    table.add_column('Success Rate', justify='right', style='green')
+
+    # Add data rows
+    for row in summary:
+        try:
+            # Set row style based on success rate
+            success_rate = float(row[6].rstrip('%'))
+            row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
+
+            table.add_row(
+                str(row[0]),  # Concurrency
+                f'{float(row[1]):.2f}',  # RPS
+                f'{float(row[2]):.3f}',  # Average Latency
+                f'{float(row[3]):.3f}',  # P99 Latency
+                f'{float(row[4]):.2f}',  # Average TPS
+                f'{float(row[5]):.3f}',  # First Token Latency
+                f'{float(row[7]):.3f}',  # P99 TTFT
+                f'{float(row[8]):.3f}',  # Average TPOT
+                f'{float(row[9]):.3f}',  # P99 TPOT
+                row[6],  # Success Rate
+                style=row_style)
+        except ValueError as e:
+            console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
+            continue
+
+    console.print('\n')
+    console.print(table)
+
+    # Calculate and display best performance configuration
+    try:
+        best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
+        best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])
+
+        perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
+        perf_info.add_column('Metric', style='cyan', width=20)
+        perf_info.add_column('Value', style='green', width=40)
+
+        perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
+        perf_info.add_row('Lowest Latency',
+                          f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
+
+        console.print('\n')
+        console.print(perf_info)
+
+        # Performance recommendations
+        recommendations = []
+        if best_rps_idx == len(summary) - 1:
+            recommendations.append(
+                'The system seems not to have reached its performance bottleneck, try higher concurrency')
+        elif best_rps_idx == 0:
+            recommendations.append('Consider lowering concurrency, current load may be too high')
+        else:
+            recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')
+
+        success_rate = float(summary[-1][6][:-1])
+        if success_rate < 95:
+            recommendations.append(
+                'Success rate is low at high concurrency, check system resources or reduce concurrency')
+
+        recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
+        console.print(recommend_text)
+        for rec in recommendations:
+            console.print(f'• {rec}', style='yellow')
+
+    except Exception as e:
+        console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
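
As wired up in run_multi_benchmark above, print_summary consumes the list of per-run results, each a (metrics, percentiles) tuple as returned by run_one_benchmark. A hedged call sketch (the model name is a placeholder):

    from evalscope.perf.utils.rich_display import print_summary

    # 'results' is the list returned by run_multi_benchmark: one
    # (metrics_dict, percentile_dict) tuple per (number, parallel) pair.
    print_summary(results, 'my-model')
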