evalscope 0.13.2__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)
  1. evalscope/backend/rag_eval/__init__.py +1 -1
  2. evalscope/backend/rag_eval/backend_manager.py +21 -5
  3. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  4. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  5. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  6. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  7. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  8. evalscope/backend/rag_eval/utils/llm.py +4 -4
  9. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  10. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  11. evalscope/benchmarks/data_adapter.py +6 -2
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  13. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  14. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  15. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  16. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  17. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  18. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  19. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  20. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  21. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  22. evalscope/collections/evaluator.py +4 -2
  23. evalscope/config.py +1 -1
  24. evalscope/perf/arguments.py +24 -5
  25. evalscope/perf/benchmark.py +28 -42
  26. evalscope/perf/http_client.py +2 -3
  27. evalscope/perf/plugin/api/custom_api.py +1 -1
  28. evalscope/perf/plugin/api/openai_api.py +2 -2
  29. evalscope/perf/plugin/datasets/custom.py +4 -1
  30. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  31. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  32. evalscope/perf/plugin/datasets/openqa.py +4 -1
  33. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  34. evalscope/perf/utils/benchmark_util.py +12 -6
  35. evalscope/perf/utils/db_util.py +1 -1
  36. evalscope/perf/utils/log_utils.py +41 -0
  37. evalscope/report/app.py +11 -11
  38. evalscope/run.py +7 -0
  39. evalscope/summarizer.py +2 -1
  40. evalscope/utils/utils.py +36 -25
  41. evalscope/version.py +2 -2
  42. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA +20 -15
  43. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD +55 -54
  44. tests/cli/test_all.py +4 -4
  45. tests/cli/test_collection.py +2 -1
  46. tests/cli/test_run.py +9 -8
  47. tests/perf/test_perf.py +1 -2
  48. tests/rag/test_clip_benchmark.py +0 -1
  49. tests/rag/test_mteb.py +37 -8
  50. tests/rag/test_ragas.py +29 -26
  51. tests/vlm/test_vlmeval.py +37 -1
  52. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  53. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  54. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  55. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  56. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  57. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py ADDED
@@ -0,0 +1,79 @@
+ from typing import Any
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType, OutputType
+ from evalscope.metrics import exact_match
+ from evalscope.utils.utils import ResponseParser
+
+ SUBSET_LIST = ['default']
+
+
+ @Benchmark.register(
+     name='maritime_bench',
+     pretty_name='MaritimeBench',
+     dataset_id='HiDolphin/MaritimeBench',
+     model_adapter=OutputType.GENERATION,
+     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+     subset_list=SUBSET_LIST,
+     metric_list=['AverageAccuracy'],
+     eval_split='test',
+     prompt_template=
+     '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}',  # noqa: E501
+ )
+ class MaritimeBenchAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.choices = ['A', 'B', 'C', 'D']
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+         prefix = ''
+         query = prefix + input_d['question'] + '\n'
+         available_choices = []
+         for option in self.choices:
+             if option in input_d and input_d[option]:
+                 query += option + ':' + input_d[option] + '\n'
+                 available_choices.append(option)
+
+         full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+         return self.gen_prompt_data(full_prompt, choices=available_choices)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         """
+         Parse the raw input labels (gold).
+
+         Args:
+             input_d: input raw data. Depending on the dataset.
+
+         Returns:
+             The parsed input. e.g. gold answer ... Depending on the dataset.
+         """
+         return input_d['answer']
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the raw model prediction (pred).
+
+         Args:
+             pred: model prediction. Depending on the model.
+
+         Returns:
+             The parsed prediction. e.g. model answer... Depending on the model.
+         """
+
+         return ResponseParser.parse_bracketed_answer(result, options=self.choices)
+
+     def match(self, gold: Any, pred: Any) -> Any:
+         """
+         Match the gold answer with the predicted answer.
+
+         Args:
+             gold: The gold answer.
+             pred: The predicted answer.
+
+         Returns:
+             The result of the match.
+         """
+         return exact_match(gold=gold, pred=pred)
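
The adapter registers the benchmark under the name 'maritime_bench'. A minimal sketch of invoking it through evalscope's documented Python entry points follows; the model id and sample limit are placeholders and not part of this diff.

from evalscope import TaskConfig, run_task

# 'maritime_bench' is the name registered by @Benchmark.register above;
# the model id and limit are placeholders for a quick smoke test.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',
    datasets=['maritime_bench'],
    limit=5,
)
run_task(task_cfg=task_cfg)
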
evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED
@@ -145,7 +145,7 @@ SUBJECT_MAPPING = {
      train_split='train',
      eval_split='test',
      prompt_template=
-     'Answer the following multiple choice question about {subset_name}. There is only one correct answer. The last line of your response should be in the format "Answer: LETTER" (without quotes), where LETTER is one of A, B, C, D. \n{query}',
+     """Answer the following multiple choice question about {subset_name}. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{query}""",  # noqa: E501
  )
  class MMLUAdapter(DataAdapter):
 
@@ -224,9 +224,8 @@ class MMLUAdapter(DataAdapter):
 
          context: str = '\n'.join(few_shot_prompts) + '\n'
          context += self._generate_prompt(input_d=input_d, include_answer=False)
-         query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)
+         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
 
          return self.gen_prompt_data(full_prompt)
 
@@ -249,7 +248,7 @@ class MMLUAdapter(DataAdapter):
          if self.model_adapter == OutputType.MULTIPLE_CHOICE:
              return result
          else:
-             return ResponseParser.parse_first_option(result)
+             return ResponseParser.parse_first_option(result, options=self.choices)
 
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)
@@ -260,11 +259,10 @@ class MMLUAdapter(DataAdapter):
 
          example: str = input_d['input']
          for j in range(len(self.choices)):
-             example += '\n{}. {}'.format(self.choices[j], input_choices[j])
+             example += f'\n{self.choices[j]}) {input_choices[j]}'
 
-         example += '\nAnswer:'
          if include_answer:
-             example += ' {}\n\n'.format(input_d['target'])
+             example += f"\nAnswer: {input_d['target']}\n\n"
 
          return example
 
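
For reference, a toy rendering of the reworked few-shot formatting (options shown as 'A) ...' and the answer appended as 'Answer: X'); the question, choices and target below are invented, not from the dataset.

choices = ['A', 'B', 'C', 'D']
input_d = {'input': 'Which planet is known as the Red Planet?', 'target': 'B'}
input_choices = ['Venus', 'Mars', 'Jupiter', 'Mercury']

# Mirrors the updated _generate_prompt body with include_answer=True
example = input_d['input']
for j in range(len(choices)):
    example += f'\n{choices[j]}) {input_choices[j]}'
example += f"\nAnswer: {input_d['target']}\n\n"
print(example)
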
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py CHANGED
@@ -92,7 +92,7 @@ class MMLUProAdapter(DataAdapter):
          if self.model_adapter == OutputType.MULTIPLE_CHOICE:
              return result
          else:
-             return ResponseParser.parse_first_option(result)
+             return ResponseParser.parse_first_option(result, options=self.choices)
 
      def match(self, gold: str, pred: str) -> float:
          """
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py CHANGED
@@ -164,7 +164,7 @@ class MMLUReduxAdapter(DataAdapter):
          if self.model_adapter == OutputType.MULTIPLE_CHOICE:
              return result
          else:
-             return ResponseParser.parse_first_option(result)
+             return ResponseParser.parse_first_option(result, options=self.choices)
 
      def match(self, gold: str, pred: str) -> float:
          """
evalscope/benchmarks/musr/musr_adapter.py CHANGED
@@ -62,7 +62,7 @@ class MuSRAdapter(DataAdapter):
          if self.model_adapter == OutputType.MULTIPLE_CHOICE:
              return result
          else:
-             return ResponseParser.parse_first_option(result)
+             return ResponseParser.parse_first_option(result, options=self.choices)
 
      def match(self, gold: str, pred: str) -> float:
          """
evalscope/collections/evaluator.py CHANGED
@@ -65,7 +65,7 @@ class EvaluatorCollection:
          self.evaluators = self._initialize_evaluators()
 
      def load(self) -> tuple[list[DatasetEntry], str]:
-         dataset_name = os.path.basename(self.data_adapter.dataset_id).split('.')[0]
+         dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
          raw_dataset = self.data_adapter.load()
          # limit the dataset
          if self.task_cfg.limit:
@@ -174,6 +174,7 @@ class EvaluatorCollection:
          os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
          with open(report_file_path, 'w', encoding='utf-8') as f:
              json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+         return report
 
      def _filter_answer(self, pred_file_path):
          answer_dict = defaultdict(dict)
@@ -274,4 +275,5 @@ class EvaluatorCollection:
          answers = self.get_answers()
          reviews = self.get_reviews(answers)
          scores = self.get_scores(reviews)
-         self.get_report(scores)
+         report = self.get_report(scores)
+         return report
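
The switch to os.path.splitext only drops the real file extension instead of everything after the first dot, which matters for dataset paths whose stem contains dots. A small illustration with a hypothetical path:

import os

dataset_id = 'outputs/mixed_data.v1.jsonl'  # hypothetical collection file
os.path.basename(dataset_id).split('.')[0]         # 'mixed_data'     (old behaviour)
os.path.splitext(os.path.basename(dataset_id))[0]  # 'mixed_data.v1'  (new behaviour)
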
evalscope/config.py CHANGED
@@ -212,7 +212,7 @@ def parse_task_config(task_cfg) -> TaskConfig:
          logger.info('Args: Task config is provided with CommandLine type.')
          task_cfg = TaskConfig.from_args(task_cfg)
      elif isinstance(task_cfg, str):
-         extension = task_cfg.split('.')[-1]
+         extension = os.path.splitext(task_cfg)[-1]
          logger.info(f'Args: Task config is provided with {extension} file type.')
          if extension in ['yaml', 'yml']:
              task_cfg = TaskConfig.from_yaml(task_cfg)
evalscope/perf/arguments.py CHANGED
@@ -35,6 +35,7 @@ class Arguments:
      log_every_n_query: int = 10  # Log every N queries
      debug: bool = False  # Debug mode
      wandb_api_key: Optional[str] = None  # WandB API key for logging
+     swanlab_api_key: Optional[str] = None  # SwanLab API key for logging
      name: Optional[str] = None  # Name for the run
 
      # Output settings
@@ -46,6 +47,7 @@ class Arguments:
      prefix_length: int = 0  # Length of the prefix, only for random dataset
      prompt: Optional[str] = None  # The prompt text
      query_template: Optional[str] = None  # Template for the query
+     apply_chat_template: Optional[bool] = None  # Whether to apply chat template
 
      # Dataset settings
      dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -57,10 +59,10 @@ class Arguments:
      max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
      min_tokens: Optional[int] = None  # Minimum number of tokens in the response
      n_choices: Optional[int] = None  # Number of response choices
-     seed: Optional[int] = 42  # Random seed for reproducibility
+     seed: Optional[int] = 0  # Random seed for reproducibility
      stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
      stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
-     stream: Optional[bool] = False  # Whether to stream the response
+     stream: Optional[bool] = True  # Whether to stream the response
      temperature: float = 0.0  # Temperature setting for the response
      top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
      top_k: Optional[int] = None  # Top-k sampling setting for the response
@@ -76,12 +78,26 @@ class Arguments:
          return Arguments(**args_dict)
 
      def __post_init__(self):
+         # Set the default headers
          self.headers = self.headers or {}  # Default to empty dictionary
          if self.api_key:
              # Assuming the API key is used as a Bearer token
              self.headers['Authorization'] = f'Bearer {self.api_key}'
+
+         # Set the model ID based on the model name
          self.model_id = os.path.basename(self.model)
 
+         # Set the URL based on the dataset type
+         if self.api.startswith('local'):
+             if self.dataset.startswith('speed_benchmark'):
+                 self.url = f'http://127.0.0.1:{self.port}/v1/completions'
+             else:
+                 self.url = f'http://127.0.0.1:{self.port}/v1/chat/completions'
+
+         # Set the apply_chat_template flag based on the URL
+         if self.apply_chat_template is None:
+             self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
+
      def __str__(self):
          return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
 
@@ -135,7 +151,8 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
      parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
      parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
-     parser.add_argument('--name', type=str, help='The wandb db result name and result db name')
+     parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
+     parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')
 
      # Prompt settings
      parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
@@ -143,6 +160,8 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
      parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
      parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
+     parser.add_argument(
+         '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt')  # noqa: E501
 
      # Output settings
      parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
@@ -159,10 +178,10 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument(
          '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
      parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-     parser.add_argument('--seed', type=int, help='The random seed', default=42)
+     parser.add_argument('--seed', type=int, help='The random seed', default=0)
      parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
      parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-     parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=False)
+     parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
      parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
      parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
      parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
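
Taken together, the new defaults mean a perf run now streams by default, seeds with 0, and infers apply_chat_template from the target URL when it is left unset. A sketch of driving this from Python, assuming the run_perf_benchmark entry point and dict-style config from evalscope's perf docs; the endpoint, model name and request counts are placeholders.

from evalscope.perf.main import run_perf_benchmark

task_cfg = {
    'url': 'http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    'api': 'openai',
    'model': 'qwen2.5',          # placeholder model name
    'dataset': 'openqa',
    'number': 20,
    'parallel': 2,
    # stream now defaults to True and seed to 0; apply_chat_template is left
    # unset here, so __post_init__ infers it from the 'chat/completions' URL.
    'swanlab_api_key': None,     # set a key (or 'local') to enable SwanLab logging
}
run_perf_benchmark(task_cfg)
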
evalscope/perf/benchmark.py CHANGED
@@ -18,6 +18,7 @@ from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
  from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
  from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
  from evalscope.perf.utils.local_server import start_app
+ from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
@@ -56,7 +57,7 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
 
      if args.prompt:
          prompt = load_prompt(args.prompt)
-         messages = [{'role': 'user', 'content': prompt}]
+         messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
          generator = generate_requests_from_prompt(messages)
      elif args.dataset:
          generator = generate_requests_from_dataset()
@@ -81,6 +82,7 @@ async def send_request(
      client = AioHttpClient(args)
      async with client:
          benchmark_data = BenchmarkData(request=request)
+         benchmark_data.start_time = time.perf_counter()
          collected_messages = []
          try:
              async for is_error, state_code, response_data in client.post(request):
@@ -106,24 +108,18 @@
 
 
  @exception_handler
- async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue, args: Arguments):
+ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments):
      metrics = BenchmarkMetrics(concurrency=args.parallel)
 
      api_plugin_class = ApiRegistry(args.api)
      api_plugin = api_plugin_class(args.tokenizer_path)
 
      result_db_path = get_result_db_path(args)
-     # Initialize wandb
-     if args.wandb_api_key:
-         import datetime
-         import wandb
-         os.environ['WANDB_SILENT'] = 'true'
-         os.environ['WANDB_DIR'] = args.outputs_dir
 
-         wandb.login(key=args.wandb_api_key)
-         current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-         name = args.name if args.name else f'{args.model_id}_{current_time}'
-         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+     if args.wandb_api_key:
+         init_wandb(args)
+     if args.swanlab_api_key:
+         init_swanlab(args)
 
      collected_benchmark_data = []
 
@@ -146,9 +142,13 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
          # Create a message with the updated metrics
          message = metrics.create_message()
 
-         # Log the message to wandb if the api key is provided
+         # Log the message to wandb\swanlab if the api key is provided
          if args.wandb_api_key:
+             import wandb
              wandb.log(message)
+         if args.swanlab_api_key:
+             import swanlab
+             swanlab.log(message)
 
          # Log the message to the logger every n queries
          if int(metrics.n_total_queries) % args.log_every_n_query == 0:
@@ -169,17 +169,12 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
 
 
  @exception_handler
- async def start_server(args: Arguments) -> bool:
+ async def connect_test(args: Arguments) -> bool:
      if args.api.startswith('local'):
          # start local server
          server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
          server.start()
 
-     if args.dataset.startswith('speed_benchmark'):
-         args.url = f'http://127.0.0.1:{args.port}/v1/completions'
-     else:
-         args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
-
      if (not args.no_test_connection) and (not await test_connection(args)):
          raise TimeoutError('Test connection failed')
 
@@ -192,31 +187,22 @@ async def benchmark(args: Arguments) -> None:
 
      # init queue
      benchmark_data_queue = asyncio.Queue()
-
      # reset event
      data_process_completed_event.clear()
-
+     # test connection
+     await connect_test(args)
+     # start statistic benchmark metric
+     statistic_benchmark_metric_task = asyncio.create_task(statistic_benchmark_metric(benchmark_data_queue, args))
+     # start send request
      semaphore = asyncio.Semaphore(args.parallel)
+     send_request_tasks: List[asyncio.Task] = []
+     async for request in get_requests(args):
+         task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
+         send_request_tasks.append(task)
 
-     async def create_send_request_tasks():
-         tasks: List[asyncio.Task] = []
-         async for request in get_requests(args):
-             task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
-             tasks.append(task)
-         return tasks
-
-     async def run_tasks():
-         await start_server(args)
-
-         statistic_benchmark_metric_task = asyncio.create_task(
-             statistic_benchmark_metric_worker(benchmark_data_queue, args))
-         send_request_tasks = await create_send_request_tasks()
-
-         await asyncio.gather(*send_request_tasks, return_exceptions=True)
-         await benchmark_data_queue.join()
-         data_process_completed_event.set()
-
-         metrics, result_db_path = await statistic_benchmark_metric_task
-         summary_result(args, metrics, result_db_path)
+     await asyncio.gather(*send_request_tasks, return_exceptions=True)
+     await benchmark_data_queue.join()
+     data_process_completed_event.set()
 
-     await run_tasks()
+     metrics, result_db_path = await statistic_benchmark_metric_task
+     summary_result(args, metrics, result_db_path)
evalscope/perf/http_client.py CHANGED
@@ -24,7 +24,6 @@ class AioHttpClient:
          self.connect_timeout = args.connect_timeout
          self.client = aiohttp.ClientSession(
              timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
-             connector=aiohttp.TCPConnector(limit=1),
              trace_configs=[self._create_trace_config()] if args.debug else [])
 
      def _create_trace_config(self):
@@ -144,7 +143,7 @@ async def test_connection(args: Arguments) -> bool:
      async def attempt_connection():
          client = AioHttpClient(args)
          async with client:
-             if 'chat/completions' in args.url:
+             if args.apply_chat_template:
                  request = {
                      'messages': [{
                          'role': 'user',
@@ -164,7 +163,7 @@ async def test_connection(args: Arguments) -> bool:
          is_error, state_code, response_data = await asyncio.wait_for(
              attempt_connection(), timeout=args.connect_timeout)
          if not is_error:
-             logger.info('Connection successful.')
+             logger.info('Test connection successful.')
              return True
          logger.warning(f'Retrying... <{state_code}> {response_data}')
      except Exception as e:
evalscope/perf/plugin/api/custom_api.py CHANGED
@@ -24,7 +24,7 @@ class CustomPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
-             from transformers import AutoTokenizer
+             from modelscope import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -24,7 +24,7 @@ class OpenaiPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
-             from transformers import AutoTokenizer
+             from modelscope import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
@@ -70,7 +70,7 @@ class OpenaiPlugin(ApiPluginBase):
      def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
          payload['model'] = param.model
          if param.max_tokens is not None:
-             payload['max_completion_tokens'] = param.max_tokens
+             payload['max_tokens'] = param.max_tokens
          if param.min_tokens is not None:
              payload['min_tokens'] = param.min_tokens
          if param.frequency_penalty is not None:
evalscope/perf/plugin/datasets/custom.py CHANGED
@@ -18,4 +18,7 @@ class CustomDatasetPlugin(DatasetPluginBase):
              prompt = item.strip()
              if len(prompt) > self.query_parameters.min_prompt_length and len(
                      prompt) < self.query_parameters.max_prompt_length:
-                 yield [{'role': 'user', 'content': prompt}]
+                 if self.query_parameters.apply_chat_template:
+                     yield [{'role': 'user', 'content': prompt}]
+                 else:
+                     yield prompt
evalscope/perf/plugin/datasets/line_by_line.py CHANGED
@@ -19,4 +19,7 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
              prompt = item.strip()
              if len(prompt) > self.query_parameters.min_prompt_length and len(
                      prompt) < self.query_parameters.max_prompt_length:
-                 yield [{'role': 'user', 'content': prompt}]
+                 if self.query_parameters.apply_chat_template:
+                     yield [{'role': 'user', 'content': prompt}]
+                 else:
+                     yield prompt
evalscope/perf/plugin/datasets/longalpaca.py CHANGED
@@ -24,4 +24,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
              prompt = item['instruction'].strip()
              if len(prompt) > self.query_parameters.min_prompt_length and len(
                      prompt) < self.query_parameters.max_prompt_length:
-                 yield [{'role': 'user', 'content': prompt}]
+                 if self.query_parameters.apply_chat_template:
+                     yield [{'role': 'user', 'content': prompt}]
+                 else:
+                     yield prompt
evalscope/perf/plugin/datasets/openqa.py CHANGED
@@ -29,4 +29,7 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
              prompt = item['question'].strip()
              if (len(prompt) > self.query_parameters.min_prompt_length
                      and len(prompt) < self.query_parameters.max_prompt_length):
-                 yield [{'role': 'user', 'content': prompt}]
+                 if self.query_parameters.apply_chat_template:
+                     yield [{'role': 'user', 'content': prompt}]
+                 else:
+                     yield prompt
evalscope/perf/plugin/datasets/random_dataset.py CHANGED
@@ -23,8 +23,12 @@ class RandomDatasetPlugin(DatasetPluginBase):
          self.number = self.query_parameters.number or 1
 
      def build_messages(self) -> Iterator[List[Dict]]:
-         min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
-         max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+         if self.query_parameters.apply_chat_template:
+             min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+             max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+         else:
+             min_prompt_length = self.query_parameters.min_prompt_length
+             max_prompt_length = self.query_parameters.max_prompt_length + 1
 
          assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
          assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
@@ -34,10 +38,13 @@ class RandomDatasetPlugin(DatasetPluginBase):
          offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
 
          for i in range(self.number):
-             prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
-             prompt = self.tokenizer.decode(
-                 self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
-             yield [{'role': 'user', 'content': prompt}]
+             prompt_ids = ((offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size).tolist()
+             prompt = self.tokenizer.decode(self.prefix_ids + prompt_ids)
+
+             if self.query_parameters.apply_chat_template:
+                 yield [{'role': 'user', 'content': prompt}]
+             else:
+                 yield prompt
 
      def get_random_inputs(self, length: int) -> List[int]:
          if length <= 0:
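
All dataset plugins now yield either a chat messages list or a bare prompt string, matching the two OpenAI-style endpoints selected in Arguments.__post_init__. Illustratively (the model name and prompt are placeholders), the resulting request bodies differ as follows:

prompt = 'What is the capital of France?'

chat_body = {        # apply_chat_template=True -> /v1/chat/completions
    'model': 'qwen2.5',
    'messages': [{'role': 'user', 'content': prompt}],
}
completion_body = {  # apply_chat_template=False -> /v1/completions
    'model': 'qwen2.5',
    'prompt': prompt,
}
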
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -11,7 +11,7 @@ logger = get_logger()
  @dataclass
  class BenchmarkData:
      request: Any = None
-     start_time: float = field(default_factory=time.perf_counter)
+     start_time: float = 0.0
      completed_time: float = 0.0
      chunk_times: List[float] = field(default_factory=list)
      success: bool = False
@@ -73,7 +73,9 @@ class BenchmarkMetrics:
      avg_chunk_time: float = -1
      avg_prompt_tokens: float = -1
      avg_completion_tokens: float = -1
-     avg_token_per_seconds: float = -1
+     avg_input_token_per_seconds: float = -1
+     avg_output_token_per_seconds: float = -1
+     avg_total_token_per_seconds: float = -1
      avg_time_per_token: float = -1
      qps: float = -1
 
@@ -111,22 +113,26 @@
              self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
              self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
              self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
-             self.avg_token_per_seconds = self.n_total_completion_tokens / self.total_time
+             self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
+             self.avg_output_token_per_seconds = self.n_total_completion_tokens / self.total_time
+             self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
+                                                 + self.n_total_completion_tokens) / self.total_time
              self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
              self.qps = self.n_succeed_queries / self.total_time
          except ZeroDivisionError as e:
              logger.exception(e)
              return
 
-     def create_message(self, default_ndigits=3):
+     def create_message(self, default_ndigits=4):
          message = {
              'Time taken for tests (s)': round(self.total_time, default_ndigits),
              'Number of concurrency': self.concurrency,
              'Total requests': int(self.n_total_queries),
              'Succeed requests': self.n_succeed_queries,
              'Failed requests': self.n_failed_queries,
-             'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
-             'Average QPS': round(self.qps, default_ndigits),
+             'Output token throughput (tok/s)': round(self.avg_output_token_per_seconds, default_ndigits),
+             'Total token throughput (tok/s)': round(self.avg_total_token_per_seconds, default_ndigits),
+             'Request throughput (req/s)': round(self.qps, default_ndigits),
              'Average latency (s)': round(self.avg_latency, default_ndigits),
              'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
              'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
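
The single 'Throughput(average tokens/s)' figure is split into input, output and total token throughput. A worked example with invented totals shows how the new fields are computed:

n_total_prompt_tokens = 12000.0
n_total_completion_tokens = 6000.0
total_time = 30.0                  # wall-clock seconds for the whole run
total_first_chunk_latency = 15.0   # summed time-to-first-token over all requests
n_succeed_queries = 60

avg_input_token_per_seconds = n_total_prompt_tokens / total_first_chunk_latency   # 800.0
avg_output_token_per_seconds = n_total_completion_tokens / total_time             # 200.0
avg_total_token_per_seconds = (n_total_prompt_tokens
                               + n_total_completion_tokens) / total_time          # 600.0
qps = n_succeed_queries / total_time          # 2.0 -> 'Request throughput (req/s)'
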
evalscope/perf/utils/db_util.py CHANGED
@@ -175,7 +175,7 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
 
      metrics = {
          'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
-         'TPOT (s)':
+         'ITL (s)':
          inter_token_latencies_all,
          'Latency (s)': [row[LATENCY_INDEX] for row in rows],
          'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
evalscope/perf/utils/log_utils.py ADDED
@@ -0,0 +1,41 @@
+ import os
+
+ from evalscope.perf.arguments import Arguments
+
+
+ def init_wandb(args: Arguments) -> None:
+     """
+     Initialize WandB for logging.
+     """
+     # Initialize wandb if the api key is provided
+     import datetime
+     try:
+         import wandb
+     except ImportError:
+         raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
+     os.environ['WANDB_SILENT'] = 'true'
+     os.environ['WANDB_DIR'] = args.outputs_dir
+
+     wandb.login(key=args.wandb_api_key)
+     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+     name = args.name if args.name else f'{args.model_id}_{current_time}'
+     wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+
+ def init_swanlab(args: Arguments) -> None:
+     import datetime
+     try:
+         import swanlab
+     except ImportError:
+         raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
+     os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
+     if not args.swanlab_api_key == 'local':
+         swanlab.login(api_key=args.swanlab_api_key)
+     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+     name = args.name if args.name else f'{args.model_id}_{current_time}'
+     swanlab.config.update({'framework': '📏evalscope'})
+     swanlab.init(
+         project='perf_benchmark',
+         name=name,
+         config=args.to_dict(),
+         mode='local' if args.swanlab_api_key == 'local' else None)
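
The helper mirrors the wandb path but adds an offline option: passing the literal key 'local' skips login and runs SwanLab in local mode. A standalone sketch of that pattern (project name, run name and metric value are placeholders; swanlab must be installed):

import os
import swanlab

swanlab_api_key = 'local'                 # or a real SwanLab API key
os.environ['SWANLAB_LOG_DIR'] = './outputs'
if swanlab_api_key != 'local':
    swanlab.login(api_key=swanlab_api_key)
swanlab.init(
    project='perf_benchmark',
    name='demo_run',
    config={'framework': '📏evalscope'},
    mode='local' if swanlab_api_key == 'local' else None)
swanlab.log({'Output token throughput (tok/s)': 200.0})  # invented sample value
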