evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (72)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +8 -9
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  12. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  13. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  14. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  16. evalscope/benchmarks/arena_hard/utils.py +162 -0
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  18. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  19. evalscope/benchmarks/data_adapter.py +30 -2
  20. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  21. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
  22. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  23. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  24. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  25. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  26. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  27. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  30. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  31. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  32. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  33. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  34. evalscope/collections/evaluator.py +4 -2
  35. evalscope/config.py +2 -2
  36. evalscope/metrics/llm_judge.py +1 -1
  37. evalscope/models/chat_adapter.py +32 -11
  38. evalscope/perf/arguments.py +30 -9
  39. evalscope/perf/benchmark.py +57 -103
  40. evalscope/perf/http_client.py +2 -3
  41. evalscope/perf/plugin/api/custom_api.py +1 -1
  42. evalscope/perf/plugin/api/openai_api.py +4 -2
  43. evalscope/perf/plugin/datasets/custom.py +4 -1
  44. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  45. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  46. evalscope/perf/plugin/datasets/openqa.py +4 -1
  47. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  48. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  49. evalscope/perf/utils/benchmark_util.py +12 -6
  50. evalscope/perf/utils/db_util.py +3 -3
  51. evalscope/perf/utils/log_utils.py +41 -0
  52. evalscope/report/app.py +11 -11
  53. evalscope/run.py +7 -0
  54. evalscope/summarizer.py +2 -1
  55. evalscope/utils/utils.py +36 -25
  56. evalscope/version.py +2 -2
  57. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
  58. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
  59. tests/cli/test_all.py +36 -27
  60. tests/cli/test_collection.py +2 -1
  61. tests/cli/test_run.py +38 -20
  62. tests/perf/test_perf.py +1 -2
  63. tests/rag/test_clip_benchmark.py +0 -1
  64. tests/rag/test_mteb.py +37 -8
  65. tests/rag/test_ragas.py +33 -27
  66. tests/vlm/test_vlmeval.py +37 -1
  67. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  68. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  69. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  70. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  71. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  72. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/perf/benchmark.py CHANGED

@@ -9,7 +9,7 @@ import threading
  import time
  from http import HTTPStatus
  from tqdm import tqdm
- from typing import List
+ from typing import AsyncGenerator, List

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -18,95 +18,73 @@ from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
  from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
  from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
  from evalscope.perf.utils.local_server import start_app
+ from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
  from evalscope.utils.logger import get_logger

  logger = get_logger()
- query_send_completed_event = asyncio.Event()
+
  data_process_completed_event = asyncio.Event()


  @exception_handler
- async def dispatch_requests_worker(request_queue: asyncio.Queue, args: Arguments):
+ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
      query_generator_class = ApiRegistry(args.api)
      query_generator = query_generator_class(args.tokenizer_path)

      def load_prompt(prompt_path_or_text):
-         """Load the prompt from a file or directly from the input text."""
          if prompt_path_or_text.startswith('@'):
              with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
                  return file.read()
          return prompt_path_or_text

-     async def dispatch_request(request):
-         """Dispatch a single request with optional rate limiting."""
-         await request_queue.put(request)
-         if args.rate != -1:
-             interval = np.random.exponential(1.0 / args.rate)
-             await asyncio.sleep(interval)
-
-     async def dispatch_requests_from_prompt(messages):
-         """Generate and dispatch requests based on the given prompt."""
+     async def generate_requests_from_prompt(messages):
          request = query_generator.build_request(messages, args)
-         if args.number is None:
-             await dispatch_request(request)
-             return 1
          for _ in range(args.number):
-             await dispatch_request(request)
-         return args.number
+             yield request

-     async def dispatch_requests_from_dataset():
-         """Generate and dispatch requests based on the dataset."""
-         total_query_count = 0
+     async def generate_requests_from_dataset():
          message_generator_class = DatasetRegistry(args.dataset)
          message_generator = message_generator_class(args)

+         count = 0
          for messages in message_generator:
              request = query_generator.build_request(messages, args)
-             if request is None:
-                 continue
-             await dispatch_request(request)
-             total_query_count += 1
-             if args.number and total_query_count >= args.number:
-                 break
-
-         return total_query_count
+             if request is not None:
+                 yield request
+                 count += 1
+                 if args.number and count >= args.number:
+                     break

-     # Load prompt or dataset and dispatch requests accordingly
      if args.prompt:
          prompt = load_prompt(args.prompt)
-         messages = [{'role': 'user', 'content': prompt}]
-         total_queries = await dispatch_requests_from_prompt(messages)
+         messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
+         generator = generate_requests_from_prompt(messages)
      elif args.dataset:
-         total_queries = await dispatch_requests_from_dataset()
+         generator = generate_requests_from_dataset()
      else:
          raise Exception('Either prompt or dataset is required!')

-     return total_queries
+     async for request in generator:
+         yield request
+         if args.rate != -1:
+             interval = np.random.exponential(1.0 / args.rate)
+             await asyncio.sleep(interval)


  @exception_handler
- async def send_requests_worker(
-     task_id,
-     request_queue: asyncio.Queue,
+ async def send_request(
+     semaphore: asyncio.Semaphore,
+     request: dict,
      benchmark_data_queue: asyncio.Queue,
      args: Arguments,
  ):
-     client = AioHttpClient(args)
-     async with client:
-         while not (query_send_completed_event.is_set() and request_queue.empty()):
-             try:
-                 # Attempt to get a request from the queue with a timeout
-                 request = await asyncio.wait_for(request_queue.get(), timeout=0.0001)
-                 request_queue.task_done()
-             except asyncio.TimeoutError:
-                 # If timeout, continue to the next iteration
-                 continue
-
-             # Initialize benchmark data for the current request
+     async with semaphore:
+         client = AioHttpClient(args)
+         async with client:
              benchmark_data = BenchmarkData(request=request)
+             benchmark_data.start_time = time.perf_counter()
              collected_messages = []
              try:
-                 # Send the request and process the response
                  async for is_error, state_code, response_data in client.post(request):
                      if is_error or state_code != HTTPStatus.OK:
                          logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
@@ -124,35 +102,28 @@ async def send_requests_worker(
                  logger.exception(e)
                  logger.error(f'Request query: {request} exception')
              finally:
-                 # Record completion time and collected messages
                  benchmark_data.completed_time = time.perf_counter()
                  benchmark_data.response_messages = collected_messages
                  await benchmark_data_queue.put(benchmark_data)


  @exception_handler
- async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue, args: Arguments):
+ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments):
      metrics = BenchmarkMetrics(concurrency=args.parallel)

      api_plugin_class = ApiRegistry(args.api)
      api_plugin = api_plugin_class(args.tokenizer_path)

      result_db_path = get_result_db_path(args)
-     # Initialize wandb
-     if args.wandb_api_key:
-         import datetime
-         import wandb
-         os.environ['WANDB_SILENT'] = 'true'
-         os.environ['WANDB_DIR'] = args.outputs_dir

-         wandb.login(key=args.wandb_api_key)
-         current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-         name = args.name if args.name else f'{args.model_id}_{current_time}'
-         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+     if args.wandb_api_key:
+         init_wandb(args)
+     if args.swanlab_api_key:
+         init_swanlab(args)

      collected_benchmark_data = []

-     with tqdm(desc='Processing') as pbar:
+     with tqdm(desc='Processing', total=args.number) as pbar:
          while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
              try:
                  # Attempt to get benchmark data from the queue with a timeout
@@ -171,9 +142,13 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
              # Create a message with the updated metrics
              message = metrics.create_message()

-             # Log the message to wandb if the api key is provided
+             # Log the message to wandb\swanlab if the api key is provided
              if args.wandb_api_key:
+                 import wandb
                  wandb.log(message)
+             if args.swanlab_api_key:
+                 import swanlab
+                 swanlab.log(message)

              # Log the message to the logger every n queries
              if int(metrics.n_total_queries) % args.log_every_n_query == 0:
@@ -194,17 +169,12 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,


  @exception_handler
- async def start_server(args: Arguments) -> bool:
+ async def connect_test(args: Arguments) -> bool:
      if args.api.startswith('local'):
          # start local server
          server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
          server.start()

-         if args.dataset.startswith('speed_benchmark'):
-             args.url = f'http://127.0.0.1:{args.port}/v1/completions'
-         else:
-             args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
-
      if (not args.no_test_connection) and (not await test_connection(args)):
          raise TimeoutError('Test connection failed')

@@ -216,39 +186,23 @@ async def benchmark(args: Arguments) -> None:
      add_signal_handlers(loop)

      # init queue
-     request_queue = asyncio.Queue()
      benchmark_data_queue = asyncio.Queue()
-
      # reset event
-     query_send_completed_event.clear()
      data_process_completed_event.clear()
-
-     async def create_send_request_tasks():
-         tasks: List[asyncio.Task] = []
-         for idx in range(args.parallel):
-             task = asyncio.create_task(send_requests_worker(idx, request_queue, benchmark_data_queue, args))
-             tasks.append(task)
-         return tasks
-
-     async def run_tasks():
-         await start_server(args)
-
-         dispatch_task = asyncio.create_task(dispatch_requests_worker(request_queue, args))
-         statistic_benchmark_metric_task = asyncio.create_task(
-             statistic_benchmark_metric_worker(benchmark_data_queue, args))
-         send_request_tasks = await create_send_request_tasks()
-
-         expected_number_of_queries = await dispatch_task
-         await request_queue.join()
-         query_send_completed_event.set()
-
-         await asyncio.gather(*send_request_tasks, return_exceptions=True)
-         await benchmark_data_queue.join()
-         data_process_completed_event.set()
-
-         metrics, result_db_path = await statistic_benchmark_metric_task
-         summary_result(args, metrics, expected_number_of_queries, result_db_path)
-
-         await asyncio.sleep(0.250)
-
-     await run_tasks()
+     # test connection
+     await connect_test(args)
+     # start statistic benchmark metric
+     statistic_benchmark_metric_task = asyncio.create_task(statistic_benchmark_metric(benchmark_data_queue, args))
+     # start send request
+     semaphore = asyncio.Semaphore(args.parallel)
+     send_request_tasks: List[asyncio.Task] = []
+     async for request in get_requests(args):
+         task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
+         send_request_tasks.append(task)
+
+     await asyncio.gather(*send_request_tasks, return_exceptions=True)
+     await benchmark_data_queue.join()
+     data_process_completed_event.set()
+
+     metrics, result_db_path = await statistic_benchmark_metric_task
+     summary_result(args, metrics, result_db_path)
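The hunks above replace the fixed pool of `send_requests_worker` coroutines and their shared `request_queue` with one task per request: `get_requests` is now an async generator that paces requests, and each `send_request` task waits on an `asyncio.Semaphore(args.parallel)` before opening its own client. A minimal standalone sketch of that pattern, assuming a sleep in place of the real `AioHttpClient` round trip; the names below are illustrative, not evalscope APIs:

import asyncio
import random


async def generate_requests(number: int, rate: float = -1):
    """Yield dummy request payloads, optionally spaced by an exponential (Poisson) gap."""
    for i in range(number):
        yield {'id': i}
        if rate != -1:
            await asyncio.sleep(random.expovariate(rate))


async def send_one(semaphore: asyncio.Semaphore, request: dict, results: asyncio.Queue):
    """The semaphore caps how many requests are in flight at any moment."""
    async with semaphore:
        await asyncio.sleep(0.01)  # stand-in for the actual HTTP call
        await results.put(request['id'])


async def main(parallel: int = 4, number: int = 10):
    semaphore = asyncio.Semaphore(parallel)
    results: asyncio.Queue = asyncio.Queue()
    tasks = [
        asyncio.create_task(send_one(semaphore, request, results))
        async for request in generate_requests(number)
    ]
    await asyncio.gather(*tasks)
    print(sorted(results.get_nowait() for _ in range(results.qsize())))


if __name__ == '__main__':
    asyncio.run(main())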
evalscope/perf/http_client.py CHANGED

@@ -24,7 +24,6 @@ class AioHttpClient:
          self.connect_timeout = args.connect_timeout
          self.client = aiohttp.ClientSession(
              timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
-             connector=aiohttp.TCPConnector(limit=1),
              trace_configs=[self._create_trace_config()] if args.debug else [])

      def _create_trace_config(self):
@@ -144,7 +143,7 @@ async def test_connection(args: Arguments) -> bool:
      async def attempt_connection():
          client = AioHttpClient(args)
          async with client:
-             if 'chat/completions' in args.url:
+             if args.apply_chat_template:
                  request = {
                      'messages': [{
                          'role': 'user',
@@ -164,7 +163,7 @@ async def test_connection(args: Arguments) -> bool:
              is_error, state_code, response_data = await asyncio.wait_for(
                  attempt_connection(), timeout=args.connect_timeout)
              if not is_error:
-                 logger.info('Connection successful.')
+                 logger.info('Test connection successful.')
                  return True
              logger.warning(f'Retrying... <{state_code}> {response_data}')
          except Exception as e:
evalscope/perf/plugin/api/custom_api.py CHANGED

@@ -24,7 +24,7 @@ class CustomPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
-             from transformers import AutoTokenizer
+             from modelscope import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
evalscope/perf/plugin/api/openai_api.py CHANGED

@@ -24,7 +24,7 @@ class OpenaiPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
-             from transformers import AutoTokenizer
+             from modelscope import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
@@ -94,9 +94,11 @@ class OpenaiPlugin(ApiPluginBase):
              payload['top_p'] = param.top_p
          if param.top_k is not None:
              payload['top_k'] = param.top_k
+         if param.extra_args is not None:
+             payload.update(param.extra_args)
          return payload

-     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
+     def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
          """Parser responses and return number of request and response tokens.
          Only one response for non-stream, multiple responses for stream.
          """
evalscope/perf/plugin/datasets/custom.py CHANGED

@@ -18,4 +18,7 @@ class CustomDatasetPlugin(DatasetPluginBase):
              prompt = item.strip()
              if len(prompt) > self.query_parameters.min_prompt_length and len(
                      prompt) < self.query_parameters.max_prompt_length:
-                 yield [{'role': 'user', 'content': prompt}]
+                 if self.query_parameters.apply_chat_template:
+                     yield [{'role': 'user', 'content': prompt}]
+                 else:
+                     yield prompt
evalscope/perf/plugin/datasets/line_by_line.py CHANGED

@@ -19,4 +19,7 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
              prompt = item.strip()
              if len(prompt) > self.query_parameters.min_prompt_length and len(
                      prompt) < self.query_parameters.max_prompt_length:
-                 yield [{'role': 'user', 'content': prompt}]
+                 if self.query_parameters.apply_chat_template:
+                     yield [{'role': 'user', 'content': prompt}]
+                 else:
+                     yield prompt
evalscope/perf/plugin/datasets/longalpaca.py CHANGED

@@ -24,4 +24,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
              prompt = item['instruction'].strip()
              if len(prompt) > self.query_parameters.min_prompt_length and len(
                      prompt) < self.query_parameters.max_prompt_length:
-                 yield [{'role': 'user', 'content': prompt}]
+                 if self.query_parameters.apply_chat_template:
+                     yield [{'role': 'user', 'content': prompt}]
+                 else:
+                     yield prompt
evalscope/perf/plugin/datasets/openqa.py CHANGED

@@ -29,4 +29,7 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
              prompt = item['question'].strip()
              if (len(prompt) > self.query_parameters.min_prompt_length
                      and len(prompt) < self.query_parameters.max_prompt_length):
-                 yield [{'role': 'user', 'content': prompt}]
+                 if self.query_parameters.apply_chat_template:
+                     yield [{'role': 'user', 'content': prompt}]
+                 else:
+                     yield prompt
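All four dataset plugins above gain the same branch: when `apply_chat_template` is disabled they yield the raw prompt string instead of a chat-style message list. A tiny illustration of the two shapes, with a made-up prompt:

# Hypothetical example of the two payload shapes a plugin can now yield.
prompt = 'Summarize the plot of Moby-Dick in one sentence.'

for apply_chat_template in (True, False):
    item = [{'role': 'user', 'content': prompt}] if apply_chat_template else prompt
    print(item)
# First print: chat-style messages for a /chat/completions-style endpoint.
# Second print: a plain string, suitable for a /completions-style endpoint.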
evalscope/perf/plugin/datasets/random_dataset.py CHANGED

@@ -23,8 +23,12 @@ class RandomDatasetPlugin(DatasetPluginBase):
          self.number = self.query_parameters.number or 1

      def build_messages(self) -> Iterator[List[Dict]]:
-         min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
-         max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+         if self.query_parameters.apply_chat_template:
+             min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+             max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+         else:
+             min_prompt_length = self.query_parameters.min_prompt_length
+             max_prompt_length = self.query_parameters.max_prompt_length + 1

          assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
          assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
@@ -34,10 +38,13 @@ class RandomDatasetPlugin(DatasetPluginBase):
          offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)

          for i in range(self.number):
-             prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
-             prompt = self.tokenizer.decode(
-                 self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
-             yield [{'role': 'user', 'content': prompt}]
+             prompt_ids = ((offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size).tolist()
+             prompt = self.tokenizer.decode(self.prefix_ids + prompt_ids)
+
+             if self.query_parameters.apply_chat_template:
+                 yield [{'role': 'user', 'content': prompt}]
+             else:
+                 yield prompt

      def get_random_inputs(self, length: int) -> List[int]:
          if length <= 0:
evalscope/perf/plugin/datasets/speed_benchmark.py CHANGED

@@ -3,6 +3,9 @@ from typing import Dict, Iterator, List, Tuple
  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.datasets.base import DatasetPluginBase
  from evalscope.perf.plugin.registry import register_dataset
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()


  @register_dataset('speed_benchmark')
@@ -18,6 +21,14 @@ class SpeedBenchmarkDatasetPlugin(DatasetPluginBase):
      def __init__(self, query_parameters: Arguments):
          super().__init__(query_parameters)

+         url = self.query_parameters.url
+         if url.endswith('v1/chat/completions'):
+             logger.warning(
+                 'The API URL is not set correctly for `speed_benchmark`. Using `v1/completions` instead of `v1/chat/completions` by system.'  # noqa
+             )
+             url = url.replace('v1/chat/completions', 'v1/completions')
+         self.query_parameters.url = url
+
      def build_messages(self) -> Iterator[List[Dict]]:
          for input_len in self.INPUT_LENGTH:
              for _ in range(self.REPEAT):
evalscope/perf/utils/benchmark_util.py CHANGED

@@ -11,7 +11,7 @@ logger = get_logger()
  @dataclass
  class BenchmarkData:
      request: Any = None
-     start_time: float = field(default_factory=time.perf_counter)
+     start_time: float = 0.0
      completed_time: float = 0.0
      chunk_times: List[float] = field(default_factory=list)
      success: bool = False
@@ -73,7 +73,9 @@ class BenchmarkMetrics:
      avg_chunk_time: float = -1
      avg_prompt_tokens: float = -1
      avg_completion_tokens: float = -1
-     avg_token_per_seconds: float = -1
+     avg_input_token_per_seconds: float = -1
+     avg_output_token_per_seconds: float = -1
+     avg_total_token_per_seconds: float = -1
      avg_time_per_token: float = -1
      qps: float = -1

@@ -111,22 +113,26 @@ class BenchmarkMetrics:
              self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
              self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
              self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
-             self.avg_token_per_seconds = self.n_total_completion_tokens / self.total_time
+             self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
+             self.avg_output_token_per_seconds = self.n_total_completion_tokens / self.total_time
+             self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
+                                                 + self.n_total_completion_tokens) / self.total_time
              self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
              self.qps = self.n_succeed_queries / self.total_time
          except ZeroDivisionError as e:
              logger.exception(e)
              return

-     def create_message(self, default_ndigits=3):
+     def create_message(self, default_ndigits=4):
          message = {
              'Time taken for tests (s)': round(self.total_time, default_ndigits),
              'Number of concurrency': self.concurrency,
              'Total requests': int(self.n_total_queries),
              'Succeed requests': self.n_succeed_queries,
              'Failed requests': self.n_failed_queries,
-             'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
-             'Average QPS': round(self.qps, default_ndigits),
+             'Output token throughput (tok/s)': round(self.avg_output_token_per_seconds, default_ndigits),
+             'Total token throughput (tok/s)': round(self.avg_total_token_per_seconds, default_ndigits),
+             'Request throughput (req/s)': round(self.qps, default_ndigits),
              'Average latency (s)': round(self.avg_latency, default_ndigits),
              'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
              'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
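The former single `Throughput(average tokens/s)` figure is split into input, output, and total token throughput, and `Average QPS` is now reported as request throughput. A small worked example of the new formulas, using made-up counters:

# Made-up counters, applied to the formulas in the hunk above.
n_total_prompt_tokens = 12000
n_total_completion_tokens = 4800
total_time = 60.0                   # wall-clock seconds for the whole run
total_first_chunk_latency = 24.0    # summed time-to-first-token across requests

avg_input_token_per_seconds = n_total_prompt_tokens / total_first_chunk_latency        # 500.0
avg_output_token_per_seconds = n_total_completion_tokens / total_time                  # 80.0
avg_total_token_per_seconds = (n_total_prompt_tokens + n_total_completion_tokens) / total_time  # 280.0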
evalscope/perf/utils/db_util.py CHANGED

@@ -175,7 +175,7 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:

      metrics = {
          'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
-         'TPOT (s)':
+         'ITL (s)':
          inter_token_latencies_all,
          'Latency (s)': [row[LATENCY_INDEX] for row in rows],
          'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
@@ -194,12 +194,12 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
      return results


- def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
+ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
      result_path = os.path.dirname(result_db_path)
      write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

      data = metrics.create_message()
-     data.update({'Expected number of requests': expected_number_of_queries, 'Result DB path': result_db_path})
+     data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
      write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))

      # Print summary in a table
evalscope/perf/utils/log_utils.py ADDED

@@ -0,0 +1,41 @@
+ import os
+
+ from evalscope.perf.arguments import Arguments
+
+
+ def init_wandb(args: Arguments) -> None:
+     """
+     Initialize WandB for logging.
+     """
+     # Initialize wandb if the api key is provided
+     import datetime
+     try:
+         import wandb
+     except ImportError:
+         raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
+     os.environ['WANDB_SILENT'] = 'true'
+     os.environ['WANDB_DIR'] = args.outputs_dir
+
+     wandb.login(key=args.wandb_api_key)
+     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+     name = args.name if args.name else f'{args.model_id}_{current_time}'
+     wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+
+ def init_swanlab(args: Arguments) -> None:
+     import datetime
+     try:
+         import swanlab
+     except ImportError:
+         raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
+     os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
+     if not args.swanlab_api_key == 'local':
+         swanlab.login(api_key=args.swanlab_api_key)
+     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+     name = args.name if args.name else f'{args.model_id}_{current_time}'
+     swanlab.config.update({'framework': '📏evalscope'})
+     swanlab.init(
+         project='perf_benchmark',
+         name=name,
+         config=args.to_dict(),
+         mode='local' if args.swanlab_api_key == 'local' else None)
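For context, the benchmark.py hunks earlier wire these helpers in: whichever tracker has an API key configured is initialized once before the metrics loop, and every metrics message is then forwarded to it. A condensed, hedged sketch of that flow (the helper functions below are illustrative wrappers, not the literal evalscope code):

from evalscope.perf.utils.log_utils import init_swanlab, init_wandb


def init_trackers(args):
    # Mirrors the initialization branch in statistic_benchmark_metric.
    if args.wandb_api_key:
        init_wandb(args)
    if args.swanlab_api_key:
        init_swanlab(args)


def log_metrics(args, message: dict):
    # Mirrors the per-iteration logging branch; imports stay lazy so the
    # trackers remain optional dependencies.
    if args.wandb_api_key:
        import wandb
        wandb.log(message)
    if args.swanlab_api_key:
        import swanlab
        swanlab.log(message)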
evalscope/report/app.py CHANGED
@@ -44,7 +44,7 @@ def scan_for_report_folders(root_path):
              continue
          datasets = []
          for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
-             datasets.append(os.path.basename(dataset_item).split('.')[0])
+             datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
          datasets = DATASET_TOKEN.join(datasets)
          reports.append(
              f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
@@ -253,17 +253,17 @@ def process_model_prediction(item: Any):


  def normalize_score(score):
-     if isinstance(score, bool):
-         return 1.0 if score else 0.0
-     elif isinstance(score, dict):
-         for key in score:
-             return float(score[key])
-         return 0.0
-     else:
-         try:
-             return float(score)
-         except (ValueError, TypeError):
+     try:
+         if isinstance(score, bool):
+             return 1.0 if score else 0.0
+         elif isinstance(score, dict):
+             for key in score:
+                 return float(score[key])
              return 0.0
+         else:
+             return float(score)
+     except (ValueError, TypeError):
+         return 0.0


  def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
evalscope/run.py CHANGED
@@ -58,10 +58,17 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):

      outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

+     # Unify the output directory structure
      if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
          task_cfg.eval_config['time_str'] = run_time
      elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
          task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+     elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+         from evalscope.backend.rag_eval import Tools
+         if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+             task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+         elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+             task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
      return outputs


evalscope/summarizer.py CHANGED
@@ -105,7 +105,8 @@ class Summarizer:
                  summary_res: dict = csv_to_list(summary_file_path)[0]
              elif summary_file_path.endswith('json'):
                  summary_res: dict = json_to_dict(summary_file_path)
-             file_name = os.path.basename(summary_file_path).split('.')[0]
+             base_name = os.path.basename(summary_file_path)
+             file_name = os.path.splitext(base_name)[0]
              final_res_list.append({file_name: summary_res})

          elif eval_backend == EvalBackend.THIRD_PARTY: