evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (100)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  4. evalscope/api/benchmark/benchmark.py +14 -0
  5. evalscope/api/dataset/dataset.py +21 -0
  6. evalscope/api/dataset/loader.py +6 -2
  7. evalscope/api/mixin/sandbox_mixin.py +32 -54
  8. evalscope/api/model/generate_config.py +6 -0
  9. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  10. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  11. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  13. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  16. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  17. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  18. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  20. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  21. evalscope/benchmarks/math_verse/__init__.py +0 -0
  22. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  23. evalscope/benchmarks/math_vision/__init__.py +0 -0
  24. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  25. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  27. evalscope/benchmarks/ner/__init__.py +0 -0
  28. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  29. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  30. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  31. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  32. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  33. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  34. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  35. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  36. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  37. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  38. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  39. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  40. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  41. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  42. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  43. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  44. evalscope/benchmarks/ocr_bench_v2/utils.py +1 -0
  45. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  46. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  47. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  48. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  49. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  50. evalscope/benchmarks/poly_math/__init__.py +0 -0
  51. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  52. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  53. evalscope/benchmarks/pope/__init__.py +0 -0
  54. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  55. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  56. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  57. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  58. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  59. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  60. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  61. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  62. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  63. evalscope/benchmarks/zerobench/__init__.py +0 -0
  64. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  65. evalscope/constants.py +4 -0
  66. evalscope/evaluator/evaluator.py +72 -79
  67. evalscope/metrics/math_parser.py +14 -0
  68. evalscope/metrics/metric.py +1 -1
  69. evalscope/models/utils/openai.py +4 -0
  70. evalscope/perf/arguments.py +24 -4
  71. evalscope/perf/benchmark.py +74 -89
  72. evalscope/perf/http_client.py +31 -16
  73. evalscope/perf/main.py +15 -2
  74. evalscope/perf/plugin/api/base.py +9 -7
  75. evalscope/perf/plugin/api/custom_api.py +13 -58
  76. evalscope/perf/plugin/api/default_api.py +179 -79
  77. evalscope/perf/plugin/api/openai_api.py +4 -3
  78. evalscope/perf/plugin/datasets/base.py +21 -0
  79. evalscope/perf/plugin/datasets/custom.py +2 -3
  80. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  81. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  82. evalscope/perf/plugin/datasets/openqa.py +2 -4
  83. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  84. evalscope/perf/utils/benchmark_util.py +36 -22
  85. evalscope/perf/utils/db_util.py +14 -19
  86. evalscope/perf/utils/local_server.py +0 -44
  87. evalscope/perf/utils/log_utils.py +21 -6
  88. evalscope/report/__init__.py +2 -1
  89. evalscope/run.py +4 -0
  90. evalscope/utils/function_utils.py +195 -12
  91. evalscope/utils/io_utils.py +74 -0
  92. evalscope/utils/logger.py +49 -17
  93. evalscope/utils/ner.py +377 -0
  94. evalscope/version.py +2 -2
  95. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/METADATA +235 -363
  96. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/RECORD +100 -55
  97. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  98. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  99. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
  100. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -0
evalscope/perf/benchmark.py CHANGED
@@ -3,8 +3,6 @@ import json
  import numpy as np
  import platform
  import sqlite3
- import time
- from http import HTTPStatus
  from tqdm import tqdm
  from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
@@ -82,86 +80,58 @@ async def send_request(
  request: dict,
  benchmark_data_queue: asyncio.Queue,
  args: Arguments,
- api_plugin: 'ApiPluginBase',
+ client: AioHttpClient, # reuse shared client
  ):
  async with semaphore:
- client = AioHttpClient(args, api_plugin)
- async with client:
- benchmark_data = BenchmarkData(request=request)
- benchmark_data.start_time = time.perf_counter()
- collected_messages = []
- try:
- async for is_error, state_code, response_data in client.post(request):
- if is_error or state_code != HTTPStatus.OK:
- error_msg = str(response_data) if response_data else 'Unknown error'
- logger.error(f'Request: {request} failed, state_code: {state_code}, data: {error_msg}')
- benchmark_data.success = False
- break
- if response_data:
- collected_messages.append(response_data)
- benchmark_data.chunk_times.append(time.perf_counter())
- benchmark_data.success = True
- benchmark_data.update_gpu_usage()
- except Exception as e:
- if response_data:
- collected_messages.append(response_data)
- benchmark_data.success = False
- logger.exception(e)
- logger.error(f'Request query: {request} exception')
- finally:
- benchmark_data.completed_time = time.perf_counter()
- benchmark_data.response_messages = collected_messages
- await benchmark_data_queue.put(benchmark_data)
+ benchmark_data = await client.post(request)
+ benchmark_data.update_gpu_usage()
+ await benchmark_data_queue.put(benchmark_data)
 
 
  @exception_handler
  async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
  metrics = BenchmarkMetrics(concurrency=args.parallel)
-
  result_db_path = get_result_db_path(args)
 
- collected_benchmark_data = []
-
- with tqdm(desc='Processing', total=args.number) as pbar:
- while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
- try:
- # Attempt to get benchmark data from the queue with a timeout
- benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
- benchmark_data_queue.task_done()
- except asyncio.TimeoutError:
- # If timeout, continue to the next iteration
- continue
-
- # Update metrics based on the benchmark data
- metrics.update_metrics(benchmark_data, api_plugin)
-
- # Collect benchmark data for later database insertion
- collected_benchmark_data.append(benchmark_data)
-
- # Create a message with the updated metrics
- message = metrics.create_message()
+ # Stream inserts to DB to avoid accumulating all results in memory
+ commit_every = args.db_commit_interval
+ processed_since_commit = 0
 
- # Log the message to wandb\swanlab if the api key is provided
- if args.wandb_api_key:
- import wandb
- wandb.log(message)
- if args.swanlab_api_key:
- import swanlab
- swanlab.log(message)
-
- # Log the message to the logger every n queries
- if int(metrics.n_total_queries) % args.log_every_n_query == 0:
- msg = json.dumps(message, ensure_ascii=False, indent=2)
- logger.info(msg)
-
- pbar.update(1) # Update the progress bar
-
- # Now perform database operations after all benchmark data has been processed
  with sqlite3.connect(result_db_path) as con:
  cursor = con.cursor()
  create_result_table(cursor)
- for benchmark_data in collected_benchmark_data:
- insert_benchmark_data(cursor, benchmark_data)
+
+ with tqdm(desc='Processing', total=args.number) as pbar:
+ while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+ try:
+ benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.1)
+ except asyncio.TimeoutError:
+ continue
+
+ # Update metrics and write to DB immediately
+ metrics.update_metrics(benchmark_data, api_plugin)
+ insert_benchmark_data(cursor, benchmark_data)
+ processed_since_commit += 1
+ if processed_since_commit >= commit_every:
+ con.commit()
+ processed_since_commit = 0
+
+ message = metrics.create_message()
+
+ if args.wandb_api_key:
+ import wandb
+ wandb.log(message)
+ if args.swanlab_api_key:
+ import swanlab
+ swanlab.log(message)
+
+ if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+ msg = json.dumps(message, ensure_ascii=False, indent=2)
+ logger.info(msg)
+
+ benchmark_data_queue.task_done()
+ pbar.update(1)
+
  con.commit()
 
  return metrics, result_db_path
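The rewritten consumer above streams each result into SQLite as it arrives and commits in batches, rather than buffering every BenchmarkData in a Python list before writing. A minimal, self-contained sketch of that pattern, using an illustrative results table and dict-shaped queue items (the real code uses evalscope's create_result_table/insert_benchmark_data helpers and the args.db_commit_interval batch size shown above):

import asyncio
import sqlite3

async def consume_results(queue: asyncio.Queue, finished: asyncio.Event,
                          db_path: str, commit_every: int = 100) -> None:
    """Drain `queue` into SQLite row by row, committing every `commit_every` inserts
    instead of buffering all results in memory (illustrative schema, not evalscope's)."""
    with sqlite3.connect(db_path) as con:
        cur = con.cursor()
        cur.execute('CREATE TABLE IF NOT EXISTS results (latency REAL, success INTEGER)')
        pending = 0
        # Stop only once the producer has signalled completion AND the queue is empty.
        while not (finished.is_set() and queue.empty()):
            try:
                item = await asyncio.wait_for(queue.get(), timeout=0.1)
            except asyncio.TimeoutError:
                continue
            cur.execute('INSERT INTO results VALUES (?, ?)', (item['latency'], item['success']))
            pending += 1
            if pending >= commit_every:
                con.commit()
                pending = 0
            queue.task_done()
        con.commit()  # flush any uncommitted tail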
@@ -179,31 +149,46 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
  loop = asyncio.get_running_loop()
  add_signal_handlers(loop)
 
- # Create API plugin instance for request/response processing
  api_plugin_class = ApiRegistry.get_class(args.api)
  api_plugin = api_plugin_class(args)
 
- # init queue
- benchmark_data_queue = asyncio.Queue()
- # reset event
+ benchmark_data_queue: asyncio.Queue = asyncio.Queue(maxsize=max(1, args.parallel * args.queue_size_multiplier))
  data_process_completed_event.clear()
+
  # test connection
  await connect_test(args, api_plugin)
- # start statistic benchmark metric
- statistic_benchmark_metric_task = asyncio.create_task(
- statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
- )
- # start send request
- semaphore = asyncio.Semaphore(args.parallel)
- send_request_tasks: List[asyncio.Task] = []
- async for request in get_requests(args, api_plugin):
- task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, api_plugin))
- send_request_tasks.append(task)
-
- await asyncio.gather(*send_request_tasks, return_exceptions=True)
- await benchmark_data_queue.join()
- data_process_completed_event.set()
-
- metrics, result_db_path = await statistic_benchmark_metric_task
+
+ # Create a single shared client session for all requests
+ client = AioHttpClient(args, api_plugin)
+ async with client:
+ # start statistic benchmark metric (consumer)
+ statistic_benchmark_metric_task = asyncio.create_task(
+ statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+ )
+
+ # start sending requests with bounded in-flight tasks
+ semaphore = asyncio.Semaphore(args.parallel)
+ in_flight: set[asyncio.Task] = set()
+ max_in_flight = args.parallel * args.in_flight_task_multiplier
+
+ async for request in get_requests(args, api_plugin):
+ # Keep the number of scheduled tasks bounded to avoid OOM
+ if len(in_flight) >= max_in_flight:
+ done, pending = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
+ in_flight = pending
+
+ task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, client))
+ in_flight.add(task)
+
+ # Wait for remaining in-flight tasks
+ if in_flight:
+ await asyncio.gather(*in_flight, return_exceptions=True)
+
+ # Drain queue and finish
+ await benchmark_data_queue.join()
+ data_process_completed_event.set()
+
+ metrics, result_db_path = await statistic_benchmark_metric_task
+
  metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
  return metrics_result, percentile_result
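The producer side above bounds how many asyncio tasks exist at once instead of creating one task per request up front, in addition to the semaphore that limits concurrent requests. A standalone sketch of that bounding pattern, with a hypothetical handle_one coroutine standing in for send_request and an assumed multiplier of 4 (the real code reads args.in_flight_task_multiplier):

import asyncio

async def handle_one(i: int) -> None:
    await asyncio.sleep(0.01)  # stand-in for a single HTTP request

async def run_bounded(n_requests: int, parallel: int, in_flight_multiplier: int = 4) -> None:
    semaphore = asyncio.Semaphore(parallel)          # caps requests actually running
    max_in_flight = parallel * in_flight_multiplier  # caps tasks merely scheduled
    in_flight: set = set()

    async def guarded(i: int) -> None:
        async with semaphore:
            await handle_one(i)

    for i in range(n_requests):
        if len(in_flight) >= max_in_flight:
            # Wait for at least one task to finish before creating more.
            _done, in_flight = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
        in_flight.add(asyncio.create_task(guarded(i)))

    if in_flight:
        await asyncio.gather(*in_flight, return_exceptions=True)

asyncio.run(run_bounded(n_requests=100, parallel=8))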
evalscope/perf/http_client.py CHANGED
@@ -3,6 +3,7 @@ import asyncio
  import time
  from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
+ from evalscope.perf.utils.benchmark_util import BenchmarkData
  from evalscope.utils.logger import get_logger
  from .arguments import Arguments
 
@@ -24,7 +25,22 @@ class AioHttpClient:
  self.read_timeout = args.read_timeout
  self.connect_timeout = args.connect_timeout
  self.api_plugin = api_plugin
+
+ # Configure connector similar to vLLM bench for better TTFT under load.
+ connector = aiohttp.TCPConnector(
+ limit=args.parallel or 0, # 0 means no limit in aiohttp; use parallel as limit if set
+ limit_per_host=args.parallel or 0,
+ ttl_dns_cache=300,
+ use_dns_cache=True,
+ keepalive_timeout=60,
+ enable_cleanup_closed=True,
+ force_close=False,
+ ssl=('https://' in self.url),
+ )
+
  self.client = aiohttp.ClientSession(
+ connector=connector,
+ trust_env=True,
  timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
  trace_configs=[self._create_trace_config()] if args.debug else []
  )
@@ -43,23 +59,25 @@ class AioHttpClient:
  trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
  return trace_config
 
- async def post(self, body):
- """Send POST request and delegate response handling to API plugin.
- Yields:
- Tuple[bool, int, str]: (is_error, status_code, response_data)
+ async def post(self, body) -> BenchmarkData:
+ """
+ Send POST request and delegate response handling to API plugin.
+
+ Returns:
+ BenchmarkData: The benchmark data object containing request and response information.
  """
  try:
  # Delegate the request processing to the API plugin
- async for result in self.api_plugin.process_request(self.client, self.url, self.headers, body):
- yield result
+ output = await self.api_plugin.process_request(self.client, self.url, self.headers, body)
+ return output
  except asyncio.TimeoutError as e:
  logger.error(
  f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longer timeout.' # noqa: E501
  )
- yield (True, None, str(e))
+ return BenchmarkData(success=False, error=str(e))
  except (aiohttp.ClientConnectorError, Exception) as e:
  logger.error(e)
- yield (True, None, str(e))
+ return BenchmarkData(success=False, error=str(e))
 
  @staticmethod
  async def on_request_start(session, context, params: aiohttp.TraceRequestStartParams):
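post() is now a plain coroutine that resolves to a single BenchmarkData rather than an async generator of (is_error, status_code, response_data) tuples. A hedged sketch of a caller, assuming only the success and error fields that appear elsewhere in this diff:

from evalscope.perf.http_client import AioHttpClient
from evalscope.perf.utils.benchmark_util import BenchmarkData

# Hypothetical caller, mirroring how send_request and test_connection now consume the client.
async def run_one(client: AioHttpClient, request: dict) -> BenchmarkData:
    benchmark_data = await client.post(request)  # one await, no `async for` over chunks
    if not benchmark_data.success:
        # The error text now travels on the BenchmarkData object itself.
        print(f'request failed: {benchmark_data.error}')
    return benchmark_data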
@@ -91,7 +109,6 @@ class AioHttpClient:
 
 
  async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
- is_error = True
  start_time = time.perf_counter()
 
  async def attempt_connection():
@@ -100,18 +117,16 @@ async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
  messages = [{'role': 'user', 'content': 'hello'}] if args.apply_chat_template else 'hello'
  request = api_plugin.build_request(messages)
 
- async for is_error, state_code, response_data in client.post(request):
- return is_error, state_code, response_data
+ output = await client.post(request)
+ return output
 
  while True:
  try:
- is_error, state_code, response_data = await asyncio.wait_for(
- attempt_connection(), timeout=args.connect_timeout
- )
- if not is_error:
+ output = await asyncio.wait_for(attempt_connection(), timeout=args.connect_timeout)
+ if output.success:
  logger.info('Test connection successful.')
  return True
- logger.warning(f'Retrying... <{state_code}> {response_data}')
+ logger.warning(f'Retrying... <{output.error}>')
  except Exception as e:
  logger.warning(f'Retrying... <{e}>')
 
evalscope/perf/main.py CHANGED
@@ -4,7 +4,9 @@ import os
  import platform
  import threading
  import time
+ import warnings
  from argparse import Namespace
+ from logging import warn
 
  from evalscope.perf.utils.local_server import start_app
  from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
@@ -79,9 +81,20 @@ def run_perf_benchmark(args):
  configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
 
  # Initialize wandb and swanlab
- if args.wandb_api_key:
+ visualizer = args.visualizer
+ if visualizer is None:
+ if args.wandb_api_key is not None:
+ visualizer = 'wandb'
+ warnings.warn('--wandb-api-key is deprecated. Please use `--visualizer wandb` instead.', DeprecationWarning)
+ elif args.swanlab_api_key is not None:
+ visualizer = 'swanlab'
+ warnings.warn(
+ '--swanlab-api-key is deprecated. Please use `--visualizer swanlab` instead.', DeprecationWarning
+ )
+ args.visualizer = visualizer
+ if visualizer == 'wandb':
  init_wandb(args)
- if args.swanlab_api_key:
+ elif visualizer == 'swanlab':
  init_swanlab(args)
 
  # Initialize local server if needed
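The per-backend API-key flags are folded into a single visualizer choice, with the old flags kept as deprecated aliases. A small sketch of that resolution logic in isolation (illustrative helper name, not evalscope's actual function):

import warnings
from typing import Optional

def resolve_visualizer(visualizer: Optional[str],
                       wandb_api_key: Optional[str] = None,
                       swanlab_api_key: Optional[str] = None) -> Optional[str]:
    # An explicit --visualizer value wins; the legacy per-backend keys only fill the gap.
    if visualizer is not None:
        return visualizer
    if wandb_api_key is not None:
        warnings.warn('--wandb-api-key is deprecated. Please use `--visualizer wandb` instead.', DeprecationWarning)
        return 'wandb'
    if swanlab_api_key is not None:
        warnings.warn('--swanlab-api-key is deprecated. Please use `--visualizer swanlab` instead.', DeprecationWarning)
        return 'swanlab'
    return None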
evalscope/perf/plugin/api/base.py CHANGED
@@ -3,6 +3,7 @@ from abc import abstractmethod
  from typing import Any, AsyncGenerator, Dict, List, Tuple
 
  from evalscope.perf.arguments import Arguments
+ from evalscope.perf.utils.benchmark_util import BenchmarkData
 
 
  class ApiPluginBase:
@@ -28,13 +29,13 @@ class ApiPluginBase:
  raise NotImplementedError
 
  @abstractmethod
- def parse_responses(self, responses: List, request: Any = None, **kwargs: Any) -> Tuple[int, int]:
+ def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
  """Parser responses and return number of request and response tokens.
 
  Args:
- responses (List[bytes]): List of http response body, for stream output,
+ responses (List[Dict]): List of http response body, for stream output,
  there are multiple responses, each is bytes, for general only one.
- request (Any): The request body.
+ request (str): The json string of request.
 
  Returns:
  Tuple: (Number of prompt_tokens and number of completion_tokens).
@@ -42,8 +43,9 @@ class ApiPluginBase:
  raise NotImplementedError
 
  @abstractmethod
- async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
- body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
+ async def process_request(
+ self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+ ) -> BenchmarkData:
  """Process the HTTP request and handle the response.
 
  Args:
@@ -52,8 +54,8 @@
  headers: The request headers
  body: The request body
 
- Yields:
- Tuple[bool, int, Any]: (is_error, status_code, response_data)
+ Returns:
+ BenchmarkData: The benchmark data including response and timing info.
  """
  raise NotImplementedError
 
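Under the new contract a plugin builds and returns the BenchmarkData itself. A hedged sketch of a subclass implementing it for a plain non-streaming JSON endpoint, using only BenchmarkData fields that appear elsewhere in this diff (request, start_time, chunk_times, response_messages, success, error, completed_time); build_request and parse_responses are omitted for brevity:

import json
import time
from typing import Dict

import aiohttp

from evalscope.perf.plugin.api.base import ApiPluginBase
from evalscope.perf.utils.benchmark_util import BenchmarkData


class MinimalJsonPlugin(ApiPluginBase):
    """Illustrative subclass: one non-streaming POST, timings recorded on BenchmarkData."""

    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict) -> BenchmarkData:
        benchmark_data = BenchmarkData(request=body)
        benchmark_data.start_time = time.perf_counter()
        try:
            payload = json.dumps(body, ensure_ascii=False)
            request_headers = {'Content-Type': 'application/json', **headers}
            async with client_session.post(url, data=payload, headers=request_headers) as resp:
                content = await resp.json()
                benchmark_data.chunk_times.append(time.perf_counter())
                benchmark_data.response_messages = [content]
                benchmark_data.success = resp.status == 200
                if not benchmark_data.success:
                    benchmark_data.error = str(content)
        except Exception as e:
            benchmark_data.success = False
            benchmark_data.error = str(e)
        finally:
            benchmark_data.completed_time = time.perf_counter()
        return benchmark_data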
evalscope/perf/plugin/api/custom_api.py CHANGED
@@ -5,6 +5,7 @@ from typing import Any, AsyncGenerator, Dict, List, Tuple, Union
  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.api.base import ApiPluginBase
  from evalscope.perf.plugin.registry import register_api
+ from evalscope.perf.utils.benchmark_util import BenchmarkData
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
@@ -98,7 +99,7 @@ class CustomPlugin(ApiPluginBase):
 
  return payload
 
- def parse_responses(self, responses: List[str], request: Any = None, **kwargs) -> Tuple[int, int]:
+ def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> Tuple[int, int]:
  """Parse API responses and return token counts.
 
  This method extracts the number of input and output tokens from the API responses.
@@ -106,8 +107,8 @@ class CustomPlugin(ApiPluginBase):
  to calculate it using a tokenizer.
 
  Args:
- responses (List[str]): List of API response strings.
- request (Any, optional): The original request, which might be needed for token calculation.
+ responses (List[Dict]): List of API response strings.
+ request (str, optional): The original request, which might be needed for token calculation.
  **kwargs: Additional arguments.
 
  Returns:
@@ -160,8 +161,9 @@ class CustomPlugin(ApiPluginBase):
  logger.error(f'Error parsing responses: {e}')
  return 0, 0
 
- async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
- body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+ async def process_request(
+ self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+ ) -> BenchmarkData:
  """Process the HTTP request and handle the response.
 
  This method handles sending the request to your API and processing the response,
@@ -173,60 +175,13 @@ class CustomPlugin(ApiPluginBase):
  headers (Dict): The request headers.
  body (Dict): The request body.
 
- Yields:
- Tuple[bool, int, str]: (is_error, status_code, response_data)
- - is_error: Whether the response indicates an error
- - status_code: HTTP status code
- - response_data: Response content
+ Returns:
+ BenchmarkData: The benchmark data including response and timing info.
  """
- try:
- # Set content type header
- headers = {'Content-Type': 'application/json', **headers}
-
- # Convert body to JSON
- data = json.dumps(body, ensure_ascii=False)
-
- # Send the request
- async with client_session.request('POST', url=url, data=data, headers=headers) as response: # noqa: E125
- # Get the status code
- status_code = response.status
-
- # Check if it's a streaming response
- if 'text/event-stream' in response.content_type:
- # Handle streaming response
- async for line in response.content:
- line_str = line.decode('utf-8').strip()
- if not line_str:
- continue
-
- # Check for data prefix in server-sent events
- if line_str.startswith('data: '):
- data = line_str[6:] # Remove 'data: ' prefix
-
- # Check if it's the end of the stream
- if data == '[DONE]':
- break
-
- try:
- # Parse the JSON data
- parsed_data = json.loads(data)
- yield (False, status_code, json.dumps(parsed_data))
- except json.JSONDecodeError:
- yield (True, status_code, f'Failed to parse JSON: {data}')
- else:
- # Handle regular response
- if 'application/json' in response.content_type:
- # JSON response
- content = await response.json()
- yield (status_code >= 400, status_code, json.dumps(content))
- else:
- # Text response
- content = await response.text()
- yield (status_code >= 400, status_code, content)
-
- except Exception as e:
- logger.error(f'Error in process_request: {e}')
- yield (True, 500, str(e))
+ raise NotImplementedError(
+ 'The `process_request` method must be implemented in a subclass. '
+ 'For OpenAI-compatible APIs, consider inheriting from `DefaultApiPlugin` to reuse the default implementation.' # noqa: E501
+ )
 
 
  if __name__ == '__main__':