evalscope 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (48)
  1. evalscope/arguments.py +10 -0
  2. evalscope/backend/rag_eval/utils/llm.py +1 -1
  3. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  5. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  6. evalscope/benchmarks/data_adapter.py +4 -2
  7. evalscope/benchmarks/drop/__init__.py +0 -0
  8. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  9. evalscope/benchmarks/drop/utils.py +59 -0
  10. evalscope/benchmarks/general_qa/general_qa_adapter.py +8 -4
  11. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  12. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  13. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
  14. evalscope/benchmarks/tool_bench/utils.py +202 -0
  15. evalscope/benchmarks/utils.py +3 -2
  16. evalscope/benchmarks/winogrande/__init__.py +0 -0
  17. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  18. evalscope/collections/evaluator.py +76 -26
  19. evalscope/config.py +46 -15
  20. evalscope/evaluator/evaluator.py +48 -14
  21. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  22. evalscope/metrics/llm_judge.py +3 -3
  23. evalscope/metrics/rouge_metric.py +11 -13
  24. evalscope/models/adapters/chat_adapter.py +51 -34
  25. evalscope/models/adapters/server_adapter.py +15 -19
  26. evalscope/perf/arguments.py +14 -5
  27. evalscope/perf/benchmark.py +4 -9
  28. evalscope/perf/main.py +69 -17
  29. evalscope/perf/utils/benchmark_util.py +33 -15
  30. evalscope/perf/utils/db_util.py +32 -20
  31. evalscope/perf/utils/log_utils.py +1 -1
  32. evalscope/perf/utils/rich_display.py +186 -0
  33. evalscope/report/app.py +47 -34
  34. evalscope/report/utils.py +1 -1
  35. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  36. evalscope/utils/deprecation_utils.py +42 -0
  37. evalscope/version.py +2 -2
  38. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/METADATA +49 -25
  39. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/RECORD +48 -38
  40. tests/aigc/test_t2i.py +4 -4
  41. tests/cli/test_all.py +3 -0
  42. tests/cli/test_collection.py +2 -1
  43. tests/cli/test_run.py +37 -14
  44. tests/perf/test_perf.py +27 -2
  45. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
  46. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
  47. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
  48. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/models/adapters/chat_adapter.py CHANGED
@@ -1,7 +1,7 @@
  import os
  import time
  import torch
- from typing import Any, Dict, List, Tuple, Union
+ from typing import Any, Dict, List, Optional, Tuple, Union

  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
  from evalscope.utils.logger import get_logger
@@ -58,19 +58,15 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
  return generation_config

  def _model_generate(self,
- queries: List[str],
- system_prompts: List[str] = None,
+ formatted_prompts: List[str],
  infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
  """
  Args:
- queries: The input queries.
- system_prompts: The system prompts.
+ formatted_prompts: The formatted prompts.
  infer_cfg: The inference configuration.
  Returns:
  The prediction results.
  """
- if system_prompts is None:
- system_prompts = []
  if infer_cfg is None:
  infer_cfg = {}

@@ -92,27 +88,6 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
  self.generation_config.update(**infer_cfg)
  fix_do_sample_warning(self.generation_config)

- # For chat model, use the chat template to format the input
- if self.tokenizer.chat_template is not None:
- formatted_prompts = []
- for i, query in enumerate(queries):
- messages = [ChatMessage(role='user', content=query)]
- if i < len(system_prompts) and system_prompts[i]:
- messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
- # whether thinking is needed
- enable_thinking = infer_cfg.get('enable_thinking', None)
- if enable_thinking is not None:
- prompts = self.tokenizer.apply_chat_template(
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking)
- else:
- prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- formatted_prompts.append(prompts)
- else:
- # For base model, use the queries as the input
- formatted_prompts = queries
-
- logger.debug(f'formatted_prompts: {formatted_prompts}')
-
  # Get input ids
  inputs = self.tokenizer(
  formatted_prompts, return_tensors='pt', padding=True, truncation=True,
@@ -136,26 +111,68 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

  return responses, input_lengths

- @torch.no_grad()
- def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
+ def _prepare_inputs(self, inputs: List[dict], infer_cfg: dict = {}) -> List[str]:
  """
+ Prepare the inputs for the model.
  Args:
  inputs: The input data.
  infer_cfg: The inference configuration.
  Returns:
- The prediction results.
+ The prepared inputs and system prompts.
  """
-
- # Process inputs
  queries = []
  system_prompts = []
+ message_list = []

  for input_item in inputs:
  queries.append(input_item['data'][0])
  system_prompts.append(input_item.get('system_prompt', None))
+ if input_item.get('messages', None):
+ message_list.append(input_item.get('messages', None))
+
+ # For non chat model, use the original queries as the input
+ if self.tokenizer.chat_template is None:
+ return queries
+
+ # For chat model, use the messages as the input
+ # if message_list is None, use the queries as the input
+ if len(message_list) == 0:
+ for i, query in enumerate(queries):
+ messages = [ChatMessage(role='user', content=query)]
+ if i < len(system_prompts) and system_prompts[i]:
+ messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
+ message_list.append(messages)
+
+ # Format the messages
+ formatted_prompts = []
+ for messages in message_list:
+ # apply chat template
+ chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+ if chat_template_kwargs is not None:
+ prompts = self.tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
+ else:
+ prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ formatted_prompts.append(prompts)
+
+ logger.debug(f'formatted_prompts: {formatted_prompts}')
+ return formatted_prompts
+
+ @torch.no_grad()
+ def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = {}) -> List[dict]:
+ """
+ Args:
+ inputs: The input data.
+ infer_cfg: The inference configuration.
+ Returns:
+ The prediction results.
+ """
+
+ # Process inputs
+ formatted_prompts = self._prepare_inputs(inputs, infer_cfg)

  # Run inference
- responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)
+ responses, input_lengths = self._model_generate(formatted_prompts, infer_cfg)

  # Process outputs
  results = []
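The refactored _prepare_inputs above replaces the hard-coded enable_thinking handling with a generic chat_template_kwargs dict that is forwarded verbatim to tokenizer.apply_chat_template. A minimal standalone sketch of that flow (not evalscope code; the model name and kwargs are placeholder examples):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B')  # placeholder chat model
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'What is 2 + 2?'},
]
# Any extra kwargs under 'chat_template_kwargs' are forwarded to apply_chat_template as-is.
infer_cfg = {'chat_template_kwargs': {'enable_thinking': False}}

chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
if chat_template_kwargs is not None:
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
else:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)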
evalscope/models/adapters/server_adapter.py CHANGED
@@ -43,7 +43,7 @@ class ServerModelAdapter(BaseModelAdapter):
  sig = signature(self.client.chat.completions.create)
  return list(sig.parameters.keys())

- def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
+ def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
  """
  Model prediction func.

@@ -65,23 +65,26 @@ class ServerModelAdapter(BaseModelAdapter):

  def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
  """Process a single input item."""
- data: list = input_item['data']
- if isinstance(data[0], tuple): # for truthful_qa and hellaswag
- query = '\n'.join(''.join(item) for item in data)
- system_prompt = input_item.get('system_prompt', None)
+ if input_item.get('messages', None):
+ content = input_item['messages']
  else:
- query = data[0]
- system_prompt = input_item.get('system_prompt', None)
-
- content = self.make_request_content(query, system_prompt)
+ content = self.make_request_content(input_item)
  request_json = self.make_request(content, infer_cfg)
  response = self.send_request(request_json)
  return response

- def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> list:
+ def make_request_content(self, input_item: dict) -> list:
  """
  Make request content for OpenAI API.
  """
+ data: list = input_item['data']
+ if isinstance(data[0], tuple): # for truthful_qa and hellaswag
+ query = '\n'.join(''.join(item) for item in data)
+ system_prompt = input_item.get('system_prompt', None)
+ else:
+ query = data[0]
+ system_prompt = input_item.get('system_prompt', None)
+
  messages = []
  if system_prompt:
  messages.append({'role': 'system', 'content': system_prompt})
@@ -90,16 +93,9 @@ class ServerModelAdapter(BaseModelAdapter):

  return messages

- def make_request(self, content: list, infer_cfg: dict = {}) -> dict:
+ def make_request(self, content: list, infer_cfg: dict) -> dict:
  """Make request to remote API."""
  # Format request JSON according to OpenAI API format
- from evalscope.config import DEFAULT_GENERATION_CONFIG
- if infer_cfg == DEFAULT_GENERATION_CONFIG:
- infer_cfg = {
- 'max_tokens': 2048,
- 'temperature': 0.0,
- }
-
  request_json = {'model': self.model_id, 'messages': content, **infer_cfg}

  if self.timeout:
@@ -137,7 +133,7 @@ class ServerModelAdapter(BaseModelAdapter):
  return response.model_dump(exclude_unset=True)
  except Exception as e:
  logger.error(f'Error when calling remote API: {str(e)}')
- raise
+ raise e

  def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
  collected_chunks = []
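With the change above, process_single_input forwards an item's pre-built OpenAI-style messages untouched and only falls back to make_request_content for the older data/system_prompt shape. A sketch of the two input shapes (field names come from the diff; the contents are invented for illustration):

# Item that already carries a chat history: sent to the API as-is.
item_with_messages = {
    'messages': [
        {'role': 'system', 'content': 'You are a concise assistant.'},
        {'role': 'user', 'content': 'Name three prime numbers.'},
    ],
}

# Legacy item: make_request_content builds the messages from 'data' and 'system_prompt'.
item_with_data = {
    'data': ['Name three prime numbers.'],
    'system_prompt': 'You are a concise assistant.',
}

In both cases make_request then merges the messages with infer_cfg into the request body, without the old fallback to hard-coded max_tokens/temperature defaults.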
evalscope/perf/arguments.py CHANGED
@@ -3,7 +3,7 @@ import json
  import os
  import sys
  from dataclasses import dataclass, field
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Union

  from evalscope.constants import DEFAULT_WORK_DIR

@@ -27,8 +27,8 @@ class Arguments:
  no_test_connection: bool = False # Test the connection before starting the benchmark

  # Performance and parallelism
- number: int = 1000 # Number of requests to be made
- parallel: int = 1 # Number of parallel requests
+ number: Union[int, List[int]] = 1000 # Number of requests to be made
+ parallel: Union[int, List[int]] = 1 # Number of parallel requests
  rate: int = -1 # Rate limit for requests (default: -1, no limit)

  # Logging and debugging
@@ -98,6 +98,15 @@ class Arguments:
  if self.apply_chat_template is None:
  self.apply_chat_template = self.url.strip('/').endswith('chat/completions')

+ # Set number and parallel to lists if they are integers
+ if isinstance(self.number, int):
+ self.number = [self.number]
+ if isinstance(self.parallel, int):
+ self.parallel = [self.parallel]
+ assert len(self.number) == len(
+ self.parallel
+ ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}' # noqa: E501
+
  def __str__(self):
  return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

@@ -143,8 +152,8 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark') # noqa: E501

  # Performance and parallelism
- parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
- parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
+ parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
+ parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1') # noqa: E501
  parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')

  # Logging and debugging
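Because --number and --parallel now take nargs='+', a command line such as --number 100 200 --parallel 1 2 is accepted, and __post_init__ normalizes scalars into one-element lists while requiring both lists to have equal length. A standalone sketch of that normalization (illustrative only, not the evalscope class itself):

from typing import List, Union

def normalize(number: Union[int, List[int]], parallel: Union[int, List[int]]):
    # Ints are promoted to one-element lists, mirroring the __post_init__ logic above.
    if isinstance(number, int):
        number = [number]
    if isinstance(parallel, int):
        parallel = [parallel]
    assert len(number) == len(parallel), (
        f'The length of number and parallel should be the same, '
        f'but got number: {number} and parallel: {parallel}')
    return number, parallel

print(normalize(1000, 1))             # ([1000], [1])
print(normalize([100, 200], [1, 2]))  # ([100, 200], [1, 2])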
evalscope/perf/benchmark.py CHANGED
@@ -9,7 +9,7 @@ import threading
  import time
  from http import HTTPStatus
  from tqdm import tqdm
- from typing import AsyncGenerator, List
+ from typing import AsyncGenerator, Dict, List, Tuple

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -18,7 +18,6 @@ from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
  from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
  from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
  from evalscope.perf.utils.local_server import start_app
- from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -116,11 +115,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:

  result_db_path = get_result_db_path(args)

- if args.wandb_api_key:
- init_wandb(args)
- if args.swanlab_api_key:
- init_swanlab(args)
-
  collected_benchmark_data = []

  with tqdm(desc='Processing', total=args.number) as pbar:
@@ -180,7 +174,7 @@ async def connect_test(args: Arguments) -> bool:


  @exception_handler
- async def benchmark(args: Arguments) -> None:
+ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
  if platform.system() != 'Windows':
  loop = asyncio.get_running_loop()
  add_signal_handlers(loop)
@@ -205,4 +199,5 @@ async def benchmark(args: Arguments) -> None:
  data_process_completed_event.set()

  metrics, result_db_path = await statistic_benchmark_metric_task
- summary_result(args, metrics, result_db_path)
+ metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+ return metrics_result, percentile_result
evalscope/perf/main.py CHANGED
@@ -1,32 +1,32 @@
  import asyncio
+ import copy
  import os
  import platform
+ import time
  from argparse import Namespace

- from evalscope.perf.arguments import Arguments, parse_args
- from evalscope.perf.benchmark import benchmark
- from evalscope.perf.utils.db_util import get_output_path
- from evalscope.perf.utils.handler import add_signal_handlers
+ from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
  from evalscope.utils.logger import configure_logging, get_logger
  from evalscope.utils.utils import seed_everything
+ from .arguments import Arguments, parse_args
+ from .benchmark import benchmark
+ from .utils.db_util import get_output_path
+ from .utils.handler import add_signal_handlers
+ from .utils.rich_display import print_summary

  logger = get_logger()


- def run_perf_benchmark(args):
- if isinstance(args, dict):
- args = Arguments(**args)
- elif isinstance(args, Namespace):
- args = Arguments.from_args(args)
-
- if args.seed is not None:
- seed_everything(args.seed)
+ def run_one_benchmark(args: Arguments, output_path: str = None):
+ if isinstance(args.parallel, list):
+ args.parallel = args.parallel[0]
+ if isinstance(args.number, list):
+ args.number = args.number[0]

  # Setup logger and output
- args.outputs_dir = get_output_path(args)
- configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
+ args.outputs_dir = output_path

- logger.info('Starting benchmark...')
+ logger.info('Starting benchmark with args: ')
  logger.info(args)

  if platform.system() == 'Windows':
@@ -36,9 +36,61 @@ def run_perf_benchmark(args):
  if platform.system() != 'Windows':
  add_signal_handlers(loop)

- loop.run_until_complete(benchmark(args))
+ return loop.run_until_complete(benchmark(args))
+
+
+ def run_multi_benchmark(args: Arguments, output_path: str = None):
+ results = []
+ number_list = copy.deepcopy(args.number)
+ parallel_list = copy.deepcopy(args.parallel)
+ for i, (number, parallel) in enumerate(zip(number_list, parallel_list)):
+ args.number = number
+ args.parallel = parallel
+ # Set up output path for each run
+ cur_output_path = os.path.join(output_path, f'parallel_{parallel}_number_{number}')
+ os.makedirs(cur_output_path, exist_ok=True)
+ # Start the benchmark
+ metrics_result = run_one_benchmark(args, output_path=cur_output_path)
+ # Save the results
+ results.append(metrics_result)
+ # Sleep between runs to avoid overwhelming the server
+ if i < len(number_list) - 1:
+ logger.info('Sleeping for 5 seconds before the next run...')
+ time.sleep(5)
+ # Analyze results
+ print_summary(results, args.model_id)
+ return results
+
+
+ def run_perf_benchmark(args):
+ # Check if args is a dictionary or Namespace
+ if isinstance(args, dict):
+ args = Arguments(**args)
+ elif isinstance(args, Namespace):
+ args = Arguments.from_args(args)
+
+ if args.seed is not None:
+ seed_everything(args.seed)
+
+ # Initialize output directory
+ output_path = get_output_path(args)
+ configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
+
+ # Initialize wandb and swanlab
+ if args.wandb_api_key:
+ init_wandb(args)
+ if args.swanlab_api_key:
+ init_swanlab(args)
+
+ # Start benchmark
+ if len(args.number) == 1:
+ return run_one_benchmark(args, output_path=output_path)
+ else:
+ return run_multi_benchmark(args, output_path=output_path)


  if __name__ == '__main__':
  args = Arguments.from_args(parse_args())
- run_perf_benchmark(args)
+ metrics_result, percentile_result = run_perf_benchmark(args)
+ print(metrics_result)
+ print(percentile_result)
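A hedged usage sketch of the reworked entry point: run_perf_benchmark still accepts a plain dict (converted to Arguments), returns (metrics_result, percentile_result) for a single run, and dispatches to run_multi_benchmark (one sub-directory and one result per number/parallel pair) when lists are given. The url, api, and model values below are placeholders and not part of this diff:

from evalscope.perf.main import run_perf_benchmark

task_cfg = {
    'url': 'http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    'api': 'openai',                                      # placeholder API format
    'model': 'my-model',                                  # placeholder model name
    'number': 100,   # scalar values mean a single run
    'parallel': 4,
}
metrics_result, percentile_result = run_perf_benchmark(task_cfg)
print(metrics_result)

# Passing lists pairs them up and runs several benchmarks back to back,
# e.g. number=[100, 200] with parallel=[1, 2] returns a list of per-run results.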
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -51,6 +51,24 @@ class BenchmarkData:
  self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)


+ class Metrics:
+ TIME_TAKEN_FOR_TESTS = 'Time taken for tests (s)'
+ NUMBER_OF_CONCURRENCY = 'Number of concurrency'
+ TOTAL_REQUESTS = 'Total requests'
+ SUCCEED_REQUESTS = 'Succeed requests'
+ FAILED_REQUESTS = 'Failed requests'
+ OUTPUT_TOKEN_THROUGHPUT = 'Output token throughput (tok/s)'
+ TOTAL_TOKEN_THROUGHPUT = 'Total token throughput (tok/s)'
+ REQUEST_THROUGHPUT = 'Request throughput (req/s)'
+ AVERAGE_LATENCY = 'Average latency (s)'
+ AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
+ AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+ AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
+ AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
+ AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
+ AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
+
+
  @dataclass
  class BenchmarkMetrics:
  concurrency: int = 0
@@ -125,20 +143,20 @@ class BenchmarkMetrics:

  def create_message(self, default_ndigits=4):
  message = {
- 'Time taken for tests (s)': round(self.total_time, default_ndigits),
- 'Number of concurrency': self.concurrency,
- 'Total requests': int(self.n_total_queries),
- 'Succeed requests': self.n_succeed_queries,
- 'Failed requests': self.n_failed_queries,
- 'Output token throughput (tok/s)': round(self.avg_output_token_per_seconds, default_ndigits),
- 'Total token throughput (tok/s)': round(self.avg_total_token_per_seconds, default_ndigits),
- 'Request throughput (req/s)': round(self.qps, default_ndigits),
- 'Average latency (s)': round(self.avg_latency, default_ndigits),
- 'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
- 'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
- 'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
- 'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
- 'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
- 'Average package per request': round(self.n_avg_chunks, default_ndigits),
+ Metrics.TIME_TAKEN_FOR_TESTS: round(self.total_time, default_ndigits),
+ Metrics.NUMBER_OF_CONCURRENCY: self.concurrency,
+ Metrics.TOTAL_REQUESTS: int(self.n_total_queries),
+ Metrics.SUCCEED_REQUESTS: self.n_succeed_queries,
+ Metrics.FAILED_REQUESTS: self.n_failed_queries,
+ Metrics.OUTPUT_TOKEN_THROUGHPUT: round(self.avg_output_token_per_seconds, default_ndigits),
+ Metrics.TOTAL_TOKEN_THROUGHPUT: round(self.avg_total_token_per_seconds, default_ndigits),
+ Metrics.REQUEST_THROUGHPUT: round(self.qps, default_ndigits),
+ Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
+ Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
+ Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+ Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
+ Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
+ Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
+ Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
  }
  return message
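The new Metrics constants make the summary keys addressable without repeating the display strings; a small illustrative lookup (the numbers are invented):

from evalscope.perf.utils.benchmark_util import Metrics

# metrics_result stands in for the dict produced by BenchmarkMetrics.create_message()
metrics_result = {
    Metrics.REQUEST_THROUGHPUT: 12.3456,
    Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.2134,
}
print(metrics_result[Metrics.REQUEST_THROUGHPUT])  # 12.3456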
evalscope/perf/utils/db_util.py CHANGED
@@ -7,7 +7,7 @@ import sqlite3
  import sys
  from datetime import datetime
  from tabulate import tabulate
- from typing import Dict, List
+ from typing import Dict, List, Tuple

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -111,6 +111,18 @@ def get_result_db_path(args: Arguments):
  return result_db_path


+ class PercentileMetrics:
+ TTFT = 'TTFT (s)'
+ ITL = 'ITL (s)'
+ TPOT = 'TPOT (s)'
+ LATENCY = 'Latency (s)'
+ INPUT_TOKENS = 'Input tokens'
+ OUTPUT_TOKENS = 'Output tokens'
+ OUTPUT_THROUGHPUT = 'Output (tok/s)'
+ TOTAL_THROUGHPUT = 'Total (tok/s)'
+ PERCENTILES = 'Percentiles'
+
+
  def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
  """
  Calculate the percentiles for a specific list of data.
@@ -157,10 +169,6 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
  with sqlite3.connect(result_db_path) as con:
  rows = con.execute(query_sql).fetchall()

- if len(rows) < len(percentiles):
- logger.info('Too little data to calculate quantiles!')
- return {}
-
  # Define index variables for columns
  CHUNK_TIMES_INDEX = 1
  LATENCY_INDEX = 4
@@ -175,24 +183,25 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
  inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))

  metrics = {
- 'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
- 'ITL (s)':
+ PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+ PercentileMetrics.ITL:
  inter_token_latencies_all,
- 'TPOT (s)':
+ PercentileMetrics.TPOT:
  [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
  for row in rows],
- 'Latency (s)': [row[LATENCY_INDEX] for row in rows],
- 'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
- 'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
- 'Output throughput(tok/s)':
+ PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
+ PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
+ PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+ PercentileMetrics.OUTPUT_THROUGHPUT:
  [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
  for row in rows],
- 'Total throughput(tok/s)': [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
- / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan') for row in rows]
+ PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
+ / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+ for row in rows]
  }

  # Calculate percentiles for each metric
- results = {'Percentile': [f'{p}%' for p in percentiles]}
+ results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
  for metric_name, data in metrics.items():
  metric_percentiles = calculate_percentiles(data, percentiles)
  results[metric_name] = [metric_percentiles[p] for p in percentiles]
@@ -200,16 +209,15 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
  return results


- def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
+ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
  result_path = os.path.dirname(result_db_path)
  write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

- data = metrics.create_message()
- data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
- write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
+ metrics_result = metrics.create_message()
+ write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

  # Print summary in a table
- table = tabulate(list(data.items()), headers=['Key', 'Value'], tablefmt='grid')
+ table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
  logger.info('\nBenchmarking summary:\n' + table)

  # Get percentile results
@@ -223,6 +231,10 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
  if args.dataset.startswith('speed_benchmark'):
  speed_benchmark_result(result_db_path)

+ logger.info(f'Save the summary to: {result_path}')
+
+ return metrics_result, percentile_result
+

  def speed_benchmark_result(result_db_path: str):
  query_sql = """
evalscope/perf/utils/log_utils.py CHANGED
@@ -35,7 +35,7 @@ def init_swanlab(args: Arguments) -> None:
  name = args.name if args.name else f'{args.model_id}_{current_time}'
  swanlab.config.update({'framework': '📏evalscope'})
  swanlab.init(
- project='perf_benchmark',
+ project=os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
  name=name,
  config=args.to_dict(),
  mode='local' if args.swanlab_api_key == 'local' else None)
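The SwanLab project name can now be overridden through the environment; a minimal sketch (the project name is an arbitrary example):

import os

# Set before the perf run so init_swanlab picks it up; otherwise it falls back to 'perf_benchmark'.
os.environ['SWANLAB_PROJ_NAME'] = 'my_perf_project'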