evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +67 -59
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +12 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  7. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  8. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  9. evalscope/backend/rag_eval/utils/embedding.py +75 -35
  10. evalscope/backend/rag_eval/utils/llm.py +1 -1
  11. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  12. evalscope/benchmarks/benchmark.py +1 -0
  13. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  14. evalscope/benchmarks/data_adapter.py +101 -18
  15. evalscope/benchmarks/docmath/__init__.py +0 -0
  16. evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
  17. evalscope/benchmarks/docmath/utils.py +220 -0
  18. evalscope/benchmarks/drop/__init__.py +0 -0
  19. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  20. evalscope/benchmarks/drop/utils.py +59 -0
  21. evalscope/benchmarks/frames/__init__.py +0 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +90 -0
  23. evalscope/benchmarks/frames/utils.py +37 -0
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  25. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
  27. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  28. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  29. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  30. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
  31. evalscope/benchmarks/tool_bench/utils.py +203 -0
  32. evalscope/benchmarks/utils.py +28 -2
  33. evalscope/benchmarks/winogrande/__init__.py +0 -0
  34. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  35. evalscope/cli/start_app.py +2 -2
  36. evalscope/collections/__init__.py +35 -3
  37. evalscope/collections/evaluator.py +94 -32
  38. evalscope/config.py +54 -17
  39. evalscope/evaluator/evaluator.py +80 -41
  40. evalscope/metrics/__init__.py +3 -1
  41. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  42. evalscope/metrics/llm_judge.py +15 -8
  43. evalscope/metrics/math_parser.py +1 -1
  44. evalscope/metrics/rouge_metric.py +11 -13
  45. evalscope/models/adapters/chat_adapter.py +51 -34
  46. evalscope/models/adapters/server_adapter.py +17 -25
  47. evalscope/perf/arguments.py +16 -7
  48. evalscope/perf/benchmark.py +0 -15
  49. evalscope/perf/main.py +72 -15
  50. evalscope/perf/plugin/datasets/custom.py +15 -0
  51. evalscope/perf/utils/benchmark_util.py +34 -16
  52. evalscope/perf/utils/db_util.py +25 -15
  53. evalscope/perf/utils/local_server.py +1 -0
  54. evalscope/perf/utils/log_utils.py +12 -5
  55. evalscope/perf/utils/rich_display.py +186 -0
  56. evalscope/report/__init__.py +36 -4
  57. evalscope/report/combinator.py +8 -0
  58. evalscope/report/generator.py +33 -9
  59. evalscope/report/utils.py +61 -4
  60. evalscope/run.py +12 -0
  61. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  62. evalscope/utils/deprecation_utils.py +42 -0
  63. evalscope/utils/logger.py +1 -1
  64. evalscope/utils/utils.py +12 -0
  65. evalscope/version.py +2 -2
  66. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
  67. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
  68. tests/aigc/test_t2i.py +40 -3
  69. tests/cli/test_all.py +39 -32
  70. tests/cli/test_collection.py +8 -6
  71. tests/cli/test_run.py +43 -17
  72. tests/perf/test_perf.py +23 -0
  73. tests/rag/test_mteb.py +5 -5
  74. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  75. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
  76. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
  77. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
  78. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0

evalscope/models/adapters/chat_adapter.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import time
 import torch
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
@@ -58,19 +58,15 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         return generation_config

     def _model_generate(self,
-                        queries: List[str],
-                        system_prompts: List[str] = None,
+                        formatted_prompts: List[str],
                         infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
         """
         Args:
-            queries: The input queries.
-            system_prompts: The system prompts.
+            formatted_prompts: The formatted prompts.
             infer_cfg: The inference configuration.
         Returns:
             The prediction results.
         """
-        if system_prompts is None:
-            system_prompts = []
         if infer_cfg is None:
             infer_cfg = {}

@@ -92,27 +88,6 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         self.generation_config.update(**infer_cfg)
         fix_do_sample_warning(self.generation_config)

-        # For chat model, use the chat template to format the input
-        if self.tokenizer.chat_template is not None:
-            formatted_prompts = []
-            for i, query in enumerate(queries):
-                messages = [ChatMessage(role='user', content=query)]
-                if i < len(system_prompts) and system_prompts[i]:
-                    messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
-                # whether thinking is needed
-                chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
-                if chat_template_kwargs is not None:
-                    prompts = self.tokenizer.apply_chat_template(
-                        messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
-                else:
-                    prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-                formatted_prompts.append(prompts)
-        else:
-            # For base model, use the queries as the input
-            formatted_prompts = queries
-
-        logger.debug(f'formatted_prompts: {formatted_prompts}')
-
         # Get input ids
         inputs = self.tokenizer(
             formatted_prompts, return_tensors='pt', padding=True, truncation=True,
@@ -136,26 +111,68 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

         return responses, input_lengths

-    @torch.no_grad()
-    def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
+    def _prepare_inputs(self, inputs: List[dict], infer_cfg: dict = {}) -> List[str]:
         """
+        Prepare the inputs for the model.
         Args:
             inputs: The input data.
             infer_cfg: The inference configuration.
         Returns:
-            The prediction results.
+            The prepared inputs and system prompts.
         """
-
-        # Process inputs
         queries = []
         system_prompts = []
+        message_list = []

         for input_item in inputs:
             queries.append(input_item['data'][0])
             system_prompts.append(input_item.get('system_prompt', None))
+            if input_item.get('messages', None):
+                message_list.append(input_item.get('messages', None))
+
+        # For non chat model, use the original queries as the input
+        if self.tokenizer.chat_template is None:
+            return queries
+
+        # For chat model, use the messages as the input
+        # if message_list is None, use the queries as the input
+        if len(message_list) == 0:
+            for i, query in enumerate(queries):
+                messages = [ChatMessage(role='user', content=query)]
+                if i < len(system_prompts) and system_prompts[i]:
+                    messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
+                message_list.append(messages)
+
+        # Format the messages
+        formatted_prompts = []
+        for messages in message_list:
+            # apply chat template
+            chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+            if chat_template_kwargs is not None:
+                prompts = self.tokenizer.apply_chat_template(
+                    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
+            else:
+                prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            formatted_prompts.append(prompts)
+
+        logger.debug(f'formatted_prompts: {formatted_prompts}')
+        return formatted_prompts
+
+    @torch.no_grad()
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = {}) -> List[dict]:
+        """
+        Args:
+            inputs: The input data.
+            infer_cfg: The inference configuration.
+        Returns:
+            The prediction results.
+        """
+
+        # Process inputs
+        formatted_prompts = self._prepare_inputs(inputs, infer_cfg)

         # Run inference
-        responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)
+        responses, input_lengths = self._model_generate(formatted_prompts, infer_cfg)

         # Process outputs
         results = []
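
Note on the new input contract: _prepare_inputs accepts either a plain query (optionally with a system_prompt, wrapped into ChatMessage objects when the tokenizer has a chat template) or a ready-made messages list per input item. A minimal sketch of the two shapes, with placeholder values and with the element type of 'messages' assumed here to be OpenAI-style role/content dicts:

    # Hypothetical input items for ChatGenerationModelAdapter.predict()
    inputs = [
        # Plain query plus optional system prompt.
        {'data': ['What is the capital of France?'],
         'system_prompt': 'You are a helpful assistant.'},
        # Pre-built conversation, passed straight to apply_chat_template.
        {'data': ['What is the capital of France?'],
         'messages': [
             {'role': 'system', 'content': 'You are a helpful assistant.'},
             {'role': 'user', 'content': 'What is the capital of France?'},
         ]},
    ]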

evalscope/models/adapters/server_adapter.py CHANGED
@@ -1,11 +1,11 @@
 import openai
 from collections import defaultdict
-from inspect import signature
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union

 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import get_supported_params
 from .base_adapter import BaseModelAdapter

 logger = get_logger()
@@ -31,7 +31,7 @@ class ServerModelAdapter(BaseModelAdapter):
             api_key=api_key,
             base_url=self.api_url,
         )
-        self.supported_params = self._get_supported_params()
+        self.supported_params = get_supported_params(self.client.chat.completions.create)

         self.seed = kwargs.get('seed', None)
         self.timeout = kwargs.get('timeout', 60)
@@ -39,11 +39,7 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-    def _get_supported_params(self):
-        sig = signature(self.client.chat.completions.create)
-        return list(sig.parameters.keys())
-
-    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
         """
         Model prediction func.

@@ -65,23 +61,26 @@ class ServerModelAdapter(BaseModelAdapter):

     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
         """Process a single input item."""
-        data: list = input_item['data']
-        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
-            query = '\n'.join(''.join(item) for item in data)
-            system_prompt = input_item.get('system_prompt', None)
+        if input_item.get('messages', None):
+            content = input_item['messages']
         else:
-            query = data[0]
-            system_prompt = input_item.get('system_prompt', None)
-
-        content = self.make_request_content(query, system_prompt)
+            content = self.make_request_content(input_item)
         request_json = self.make_request(content, infer_cfg)
         response = self.send_request(request_json)
         return response

-    def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> list:
+    def make_request_content(self, input_item: dict) -> list:
         """
         Make request content for OpenAI API.
         """
+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
+        else:
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)
+
         messages = []
         if system_prompt:
             messages.append({'role': 'system', 'content': system_prompt})
@@ -90,16 +89,9 @@ class ServerModelAdapter(BaseModelAdapter):

         return messages

-    def make_request(self, content: list, infer_cfg: dict = {}) -> dict:
+    def make_request(self, content: list, infer_cfg: dict) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
-        from evalscope.config import DEFAULT_GENERATION_CONFIG
-        if infer_cfg == DEFAULT_GENERATION_CONFIG:
-            infer_cfg = {
-                'max_tokens': 2048,
-                'temperature': 0.0,
-            }
-
         request_json = {'model': self.model_id, 'messages': content, **infer_cfg}

         if self.timeout:
@@ -137,7 +129,7 @@ class ServerModelAdapter(BaseModelAdapter):
             return response.model_dump(exclude_unset=True)
         except Exception as e:
             logger.error(f'Error when calling remote API: {str(e)}')
-            raise
+            raise e

     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
         collected_chunks = []
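
The removed _get_supported_params helper is superseded by evalscope.utils.utils.get_supported_params (see entry 64 in the file list). Judging from the method it replaces, a minimal sketch of what such a helper presumably boils down to; illustrative only, not the packaged implementation:

    from inspect import signature
    from typing import Callable, List

    def get_supported_params(func: Callable) -> List[str]:
        # Collect the parameter names accepted by `func`; the adapter stores
        # these as self.supported_params for the OpenAI chat.completions call.
        return list(signature(func).parameters.keys())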

evalscope/perf/arguments.py CHANGED
@@ -3,7 +3,7 @@ import json
 import os
 import sys
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union

 from evalscope.constants import DEFAULT_WORK_DIR

@@ -27,8 +27,8 @@ class Arguments:
     no_test_connection: bool = False  # Test the connection before starting the benchmark

     # Performance and parallelism
-    number: int = 1000  # Number of requests to be made
-    parallel: int = 1  # Number of parallel requests
+    number: Union[int, List[int]] = 1000  # Number of requests to be made
+    parallel: Union[int, List[int]] = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)

     # Logging and debugging
@@ -60,8 +60,8 @@ class Arguments:
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
     n_choices: Optional[int] = None  # Number of response choices
    seed: Optional[int] = 0  # Random seed for reproducibility
-    stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
-    stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
+    stop: Optional[List[str]] = None  # Stop sequences for the response
+    stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
     stream: Optional[bool] = True  # Whether to stream the response
     temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
@@ -98,6 +98,15 @@ class Arguments:
         if self.apply_chat_template is None:
             self.apply_chat_template = self.url.strip('/').endswith('chat/completions')

+        # Set number and parallel to lists if they are integers
+        if isinstance(self.number, int):
+            self.number = [self.number]
+        if isinstance(self.parallel, int):
+            self.parallel = [self.parallel]
+        assert len(self.number) == len(
+            self.parallel
+        ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
+
     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

@@ -143,8 +152,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501

     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
-    parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
+    parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
+    parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1')  # noqa: E501
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')

     # Logging and debugging
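
With nargs='+' on --number and --parallel plus the normalization in __post_init__, a single Arguments instance can describe several paired runs. A small sketch of the new behavior (model and url are placeholders, other fields keep their defaults):

    from evalscope.perf.arguments import Arguments

    # A single int is normalized to a one-element list in __post_init__.
    single = Arguments(model='my-model', url='http://127.0.0.1:8000/v1/chat/completions',
                       number=100, parallel=4)
    assert single.number == [100] and single.parallel == [4]

    # Paired lists: number[i] requests are issued at concurrency parallel[i];
    # mismatched lengths trip the new assertion in __post_init__.
    sweep = Arguments(model='my-model', url='http://127.0.0.1:8000/v1/chat/completions',
                      number=[50, 100], parallel=[1, 2])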

evalscope/perf/benchmark.py CHANGED
@@ -1,11 +1,8 @@
 import asyncio
-import copy
 import json
 import numpy as np
-import os
 import platform
 import sqlite3
-import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
@@ -17,8 +14,6 @@ from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
 from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
 from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
-from evalscope.perf.utils.local_server import start_app
-from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -116,11 +111,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:

     result_db_path = get_result_db_path(args)

-    if args.wandb_api_key:
-        init_wandb(args)
-    if args.swanlab_api_key:
-        init_swanlab(args)
-
     collected_benchmark_data = []

     with tqdm(desc='Processing', total=args.number) as pbar:
@@ -170,11 +160,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:

 @exception_handler
 async def connect_test(args: Arguments) -> bool:
-    if args.api.startswith('local'):
-        # start local server
-        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
-        server.start()
-
     if (not args.no_test_connection) and (not await test_connection(args)):
         raise TimeoutError('Test connection failed')

evalscope/perf/main.py CHANGED
@@ -1,32 +1,34 @@
 import asyncio
+import copy
 import os
 import platform
+import threading
+import time
 from argparse import Namespace

-from evalscope.perf.arguments import Arguments, parse_args
-from evalscope.perf.benchmark import benchmark
-from evalscope.perf.utils.db_util import get_output_path
-from evalscope.perf.utils.handler import add_signal_handlers
+from evalscope.perf.utils.local_server import start_app
+from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything
+from .arguments import Arguments, parse_args
+from .benchmark import benchmark
+from .utils.db_util import get_output_path
+from .utils.handler import add_signal_handlers
+from .utils.rich_display import print_summary

 logger = get_logger()


-def run_perf_benchmark(args):
-    if isinstance(args, dict):
-        args = Arguments(**args)
-    elif isinstance(args, Namespace):
-        args = Arguments.from_args(args)
-
-    if args.seed is not None:
-        seed_everything(args.seed)
+def run_one_benchmark(args: Arguments, output_path: str = None):
+    if isinstance(args.parallel, list):
+        args.parallel = args.parallel[0]
+    if isinstance(args.number, list):
+        args.number = args.number[0]

     # Setup logger and output
-    args.outputs_dir = get_output_path(args)
-    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
+    args.outputs_dir = output_path

-    logger.info('Starting benchmark...')
+    logger.info('Starting benchmark with args: ')
     logger.info(args)

     if platform.system() == 'Windows':
@@ -39,6 +41,61 @@ def run_perf_benchmark(args):
     return loop.run_until_complete(benchmark(args))


+def run_multi_benchmark(args: Arguments, output_path: str = None):
+    results = []
+    number_list = copy.deepcopy(args.number)
+    parallel_list = copy.deepcopy(args.parallel)
+    for i, (number, parallel) in enumerate(zip(number_list, parallel_list)):
+        args.number = number
+        args.parallel = parallel
+        # Set up output path for each run
+        cur_output_path = os.path.join(output_path, f'parallel_{parallel}_number_{number}')
+        os.makedirs(cur_output_path, exist_ok=True)
+        # Start the benchmark
+        metrics_result = run_one_benchmark(args, output_path=cur_output_path)
+        # Save the results
+        results.append(metrics_result)
+        # Sleep between runs to avoid overwhelming the server
+        if i < len(number_list) - 1:
+            logger.info('Sleeping for 5 seconds before the next run...')
+            time.sleep(5)
+    # Analyze results
+    print_summary(results, args.model_id)
+    return results
+
+
+def run_perf_benchmark(args):
+    # Check if args is a dictionary or Namespace
+    if isinstance(args, dict):
+        args = Arguments(**args)
+    elif isinstance(args, Namespace):
+        args = Arguments.from_args(args)
+
+    if args.seed is not None:
+        seed_everything(args.seed)
+
+    # Initialize output directory
+    output_path = get_output_path(args)
+    configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
+
+    # Initialize wandb and swanlab
+    if args.wandb_api_key:
+        init_wandb(args)
+    if args.swanlab_api_key:
+        init_swanlab(args)
+
+    # Initialize local server if needed
+    if args.api.startswith('local'):
+        # start local server
+        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
+        server.start()
+    # Start benchmark
+    if len(args.number) == 1:
+        return run_one_benchmark(args, output_path=output_path)
+    else:
+        return run_multi_benchmark(args, output_path=output_path)
+
+
 if __name__ == '__main__':
     args = Arguments.from_args(parse_args())
     metrics_result, percentile_result = run_perf_benchmark(args)
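
run_perf_benchmark still accepts a dict or an argparse Namespace as well as an Arguments instance, and now dispatches to run_multi_benchmark when more than one (number, parallel) pair is given; each run writes into a parallel_{p}_number_{n} subdirectory of the output path and the combined results go through print_summary. A hypothetical multi-run invocation (model and url are placeholders):

    from evalscope.perf.main import run_perf_benchmark

    results = run_perf_benchmark({
        'model': 'my-model',
        'url': 'http://127.0.0.1:8000/v1/chat/completions',
        'number': [10, 50, 100],
        'parallel': [1, 5, 10],
    })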

evalscope/perf/plugin/datasets/custom.py CHANGED
@@ -22,3 +22,18 @@ class CustomDatasetPlugin(DatasetPluginBase):
             yield [{'role': 'user', 'content': prompt}]
         else:
             yield prompt
+
+
+if __name__ == '__main__':
+    from evalscope.perf.arguments import Arguments
+    from evalscope.perf.main import run_perf_benchmark
+
+    args = Arguments(
+        model='qwen2.5-7b-instruct',
+        url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        dataset_path='outputs/perf_data.txt',
+        api_key='EMPTY',
+        dataset='custom',
+    )
+
+    run_perf_benchmark(args)

evalscope/perf/utils/benchmark_util.py CHANGED
@@ -38,7 +38,7 @@ class BenchmarkData:
             self.first_chunk_latency = self.query_latency
             self.n_chunks = 1
             self.n_chunks_time = self.query_latency
-            self.time_per_output_token = self.n_chunks_time / self.completion_tokens
+            self.time_per_output_token = self.n_chunks_time / self.n_chunks

     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
@@ -51,6 +51,24 @@
         self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)


+class Metrics:
+    TIME_TAKEN_FOR_TESTS = 'Time taken for tests (s)'
+    NUMBER_OF_CONCURRENCY = 'Number of concurrency'
+    TOTAL_REQUESTS = 'Total requests'
+    SUCCEED_REQUESTS = 'Succeed requests'
+    FAILED_REQUESTS = 'Failed requests'
+    OUTPUT_TOKEN_THROUGHPUT = 'Output token throughput (tok/s)'
+    TOTAL_TOKEN_THROUGHPUT = 'Total token throughput (tok/s)'
+    REQUEST_THROUGHPUT = 'Request throughput (req/s)'
+    AVERAGE_LATENCY = 'Average latency (s)'
+    AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
+    AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+    AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
+    AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
+    AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
+    AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
+
+
 @dataclass
 class BenchmarkMetrics:
     concurrency: int = 0
@@ -125,20 +143,20 @@ class BenchmarkMetrics:

     def create_message(self, default_ndigits=4):
         message = {
-            'Time taken for tests (s)': round(self.total_time, default_ndigits),
-            'Number of concurrency': self.concurrency,
-            'Total requests': int(self.n_total_queries),
-            'Succeed requests': self.n_succeed_queries,
-            'Failed requests': self.n_failed_queries,
-            'Output token throughput (tok/s)': round(self.avg_output_token_per_seconds, default_ndigits),
-            'Total token throughput (tok/s)': round(self.avg_total_token_per_seconds, default_ndigits),
-            'Request throughput (req/s)': round(self.qps, default_ndigits),
-            'Average latency (s)': round(self.avg_latency, default_ndigits),
-            'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
-            'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
-            'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
-            'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
+            Metrics.TIME_TAKEN_FOR_TESTS: round(self.total_time, default_ndigits),
+            Metrics.NUMBER_OF_CONCURRENCY: self.concurrency,
+            Metrics.TOTAL_REQUESTS: int(self.n_total_queries),
+            Metrics.SUCCEED_REQUESTS: self.n_succeed_queries,
+            Metrics.FAILED_REQUESTS: self.n_failed_queries,
+            Metrics.OUTPUT_TOKEN_THROUGHPUT: round(self.avg_output_token_per_seconds, default_ndigits),
+            Metrics.TOTAL_TOKEN_THROUGHPUT: round(self.avg_total_token_per_seconds, default_ndigits),
+            Metrics.REQUEST_THROUGHPUT: round(self.qps, default_ndigits),
+            Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
+            Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
+            Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+            Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
+            Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
+            Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
+            Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
         }
         return message
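
The new Metrics class centralizes the display strings used as keys by create_message() and in benchmark_summary.json, so downstream code can look values up by constant instead of a hard-coded label. A small hypothetical lookup:

    from evalscope.perf.utils.benchmark_util import Metrics

    # `summary` stands in for the dict returned by BenchmarkMetrics.create_message().
    summary = {Metrics.REQUEST_THROUGHPUT: 12.3456, Metrics.AVERAGE_LATENCY: 0.8123}
    print(summary[Metrics.REQUEST_THROUGHPUT])  # 12.3456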

evalscope/perf/utils/db_util.py CHANGED
@@ -111,6 +111,18 @@ def get_result_db_path(args: Arguments):
     return result_db_path


+class PercentileMetrics:
+    TTFT = 'TTFT (s)'
+    ITL = 'ITL (s)'
+    TPOT = 'TPOT (s)'
+    LATENCY = 'Latency (s)'
+    INPUT_TOKENS = 'Input tokens'
+    OUTPUT_TOKENS = 'Output tokens'
+    OUTPUT_THROUGHPUT = 'Output (tok/s)'
+    TOTAL_THROUGHPUT = 'Total (tok/s)'
+    PERCENTILES = 'Percentiles'
+
+
 def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
     """
     Calculate the percentiles for a specific list of data.
@@ -157,10 +169,6 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()

-    if len(rows) < len(percentiles):
-        logger.info('Too little data to calculate quantiles!')
-        return {}
-
     # Define index variables for columns
     CHUNK_TIMES_INDEX = 1
     LATENCY_INDEX = 4
@@ -175,24 +183,25 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
         inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))

     metrics = {
-        'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
-        'ITL (s)':
+        PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        PercentileMetrics.ITL:
         inter_token_latencies_all,
-        'TPOT (s)':
+        PercentileMetrics.TPOT:
         [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
         for row in rows],
-        'Latency (s)': [row[LATENCY_INDEX] for row in rows],
-        'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
-        'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
-        'Output throughput(tok/s)':
+        PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
+        PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
+        PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        PercentileMetrics.OUTPUT_THROUGHPUT:
         [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
         for row in rows],
-        'Total throughput(tok/s)': [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
-                                     / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan') for row in rows]
+        PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
+                                              / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+                                             for row in rows]
     }

     # Calculate percentiles for each metric
-    results = {'Percentile': [f'{p}%' for p in percentiles]}
+    results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
     for metric_name, data in metrics.items():
         metric_percentiles = calculate_percentiles(data, percentiles)
         results[metric_name] = [metric_percentiles[p] for p in percentiles]
@@ -205,7 +214,6 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

     metrics_result = metrics.create_message()
-    metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
     write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

     # Print summary in a table
@@ -223,6 +231,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)

+    logger.info(f'Save the summary to: {result_path}')
+
     return metrics_result, percentile_result

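
For context, calculate_percentiles (signature shown above) maps each requested percentile to its value over a list of per-request measurements; now that get_percentile_results no longer bails out on small row counts, it runs on whatever rows exist. A rough standalone equivalent, not the packaged implementation:

    import numpy as np
    from typing import Dict, List

    def calculate_percentiles_sketch(data: List[float], percentiles: List[int]) -> Dict[int, float]:
        # Drop NaN entries (failed requests produce float('nan') above),
        # then read each requested percentile off with numpy.
        values = np.array([x for x in data if not np.isnan(x)])
        return {p: float(np.percentile(values, p)) for p in percentiles}

    print(calculate_percentiles_sketch([0.2, 0.4, 0.6, 0.8, 1.0], [50, 90, 99]))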

evalscope/perf/utils/local_server.py CHANGED
@@ -96,6 +96,7 @@ def create_app(model, attn_implementation=None) -> FastAPI:


 def start_app(args: Arguments):
+    logger.info('Starting local server, please wait...')
     if args.api == 'local':
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

evalscope/perf/utils/log_utils.py CHANGED
@@ -34,8 +34,15 @@ def init_swanlab(args: Arguments) -> None:
     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
     name = args.name if args.name else f'{args.model_id}_{current_time}'
     swanlab.config.update({'framework': '📏evalscope'})
-    swanlab.init(
-        project='perf_benchmark',
-        name=name,
-        config=args.to_dict(),
-        mode='local' if args.swanlab_api_key == 'local' else None)
+    init_kwargs = {
+        'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
+        'name': name,
+        'config': args.to_dict(),
+        'mode': 'local' if args.swanlab_api_key == 'local' else None
+    }
+
+    workspace = os.getenv('SWANLAB_WORKSPACE')
+    if workspace:
+        init_kwargs['workspace'] = workspace
+
+    swanlab.init(**init_kwargs)
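
init_swanlab now reads two environment variables, so the swanlab project and workspace can be overridden without touching the arguments. A hypothetical setup (the variable names come from the diff, the values are placeholders):

    import os

    # Set before run_perf_benchmark() triggers init_swanlab().
    os.environ['SWANLAB_PROJ_NAME'] = 'my-perf-project'
    os.environ['SWANLAB_WORKSPACE'] = 'my-team'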