evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (100)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  4. evalscope/api/benchmark/benchmark.py +14 -0
  5. evalscope/api/dataset/dataset.py +21 -0
  6. evalscope/api/dataset/loader.py +6 -2
  7. evalscope/api/mixin/sandbox_mixin.py +32 -54
  8. evalscope/api/model/generate_config.py +6 -0
  9. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  10. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  11. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  13. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  16. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  17. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  18. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  20. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  21. evalscope/benchmarks/math_verse/__init__.py +0 -0
  22. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  23. evalscope/benchmarks/math_vision/__init__.py +0 -0
  24. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  25. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  27. evalscope/benchmarks/ner/__init__.py +0 -0
  28. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  29. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  30. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  31. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  32. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  33. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  34. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  35. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  36. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  37. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  38. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  39. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  40. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  41. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  42. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  43. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  44. evalscope/benchmarks/ocr_bench_v2/utils.py +1 -0
  45. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  46. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  47. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  48. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  49. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  50. evalscope/benchmarks/poly_math/__init__.py +0 -0
  51. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  52. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  53. evalscope/benchmarks/pope/__init__.py +0 -0
  54. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  55. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  56. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  57. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  58. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  59. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  60. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  61. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  62. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  63. evalscope/benchmarks/zerobench/__init__.py +0 -0
  64. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  65. evalscope/constants.py +4 -0
  66. evalscope/evaluator/evaluator.py +72 -79
  67. evalscope/metrics/math_parser.py +14 -0
  68. evalscope/metrics/metric.py +1 -1
  69. evalscope/models/utils/openai.py +4 -0
  70. evalscope/perf/arguments.py +24 -4
  71. evalscope/perf/benchmark.py +74 -89
  72. evalscope/perf/http_client.py +31 -16
  73. evalscope/perf/main.py +15 -2
  74. evalscope/perf/plugin/api/base.py +9 -7
  75. evalscope/perf/plugin/api/custom_api.py +13 -58
  76. evalscope/perf/plugin/api/default_api.py +179 -79
  77. evalscope/perf/plugin/api/openai_api.py +4 -3
  78. evalscope/perf/plugin/datasets/base.py +21 -0
  79. evalscope/perf/plugin/datasets/custom.py +2 -3
  80. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  81. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  82. evalscope/perf/plugin/datasets/openqa.py +2 -4
  83. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  84. evalscope/perf/utils/benchmark_util.py +36 -22
  85. evalscope/perf/utils/db_util.py +14 -19
  86. evalscope/perf/utils/local_server.py +0 -44
  87. evalscope/perf/utils/log_utils.py +21 -6
  88. evalscope/report/__init__.py +2 -1
  89. evalscope/run.py +4 -0
  90. evalscope/utils/function_utils.py +195 -12
  91. evalscope/utils/io_utils.py +74 -0
  92. evalscope/utils/logger.py +49 -17
  93. evalscope/utils/ner.py +377 -0
  94. evalscope/version.py +2 -2
  95. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/METADATA +235 -363
  96. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/RECORD +100 -55
  97. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  98. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  99. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
  100. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -0

evalscope/perf/plugin/api/default_api.py
@@ -1,24 +1,68 @@
 import aiohttp
 import json
-from http import HTTPStatus
-from typing import Any, AsyncGenerator, Dict, List, Tuple
+import sys
+import time
+import traceback
+from typing import Any, Dict
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
-from evalscope.perf.utils.local_server import ServerSentEvent
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
+class StreamedResponseHandler:
+    """Handles streaming HTTP responses by accumulating chunks until complete
+    messages are available."""
+
+    def __init__(self):
+        self.buffer = ''
+
+    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
+        """Add a chunk of bytes to the buffer and return any complete
+        messages."""
+        chunk_str = chunk_bytes.decode('utf-8')
+        self.buffer += chunk_str
+
+        messages = []
+
+        # Split by double newlines (SSE message separator)
+        while '\n\n' in self.buffer:
+            message, self.buffer = self.buffer.split('\n\n', 1)
+            message = message.strip()
+            if message:
+                messages.append(message)
+
+        # if self.buffer is not empty, check if it is a complete message
+        # by removing data: prefix and check if it is a valid JSON
+        if self.buffer.startswith('data: '):
+            message_content = self.buffer.removeprefix('data: ').strip()
+            if message_content == '[DONE]':
+                messages.append(self.buffer.strip())
+                self.buffer = ''
+            elif message_content:
+                try:
+                    json.loads(message_content)
+                    messages.append(self.buffer.strip())
+                    self.buffer = ''
+                except json.JSONDecodeError:
+                    # Incomplete JSON, wait for more chunks.
+                    pass
+
+        return messages
+
+
 class DefaultApiPlugin(ApiPluginBase):
     """Default implementation of API plugin with common HTTP handling methods."""
 
     def __init__(self, param: Arguments):
         super().__init__(param)
 
-    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -27,79 +71,135 @@ class DefaultApiPlugin(ApiPluginBase):
             headers: The request headers
             body: The request body
 
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, response_data)
-        """
-        try:
-            headers = {'Content-Type': 'application/json', **headers}
-            data = json.dumps(body, ensure_ascii=False) # serialize to JSON
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:
-                async for result in self._handle_response(response):
-                    yield result
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, None, str(e))
-
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
-        """Handle streaming response from server-sent events.
-
-        Args:
-            response: The aiohttp response object containing a stream
-
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, data)
+        Returns:
+            BenchmarkData: Aggregated benchmarking data for the request/response.
         """
+        headers = {'Content-Type': 'application/json', **headers}
+        data = json.dumps(body, ensure_ascii=False) # serialize to JSON
+
+        output = BenchmarkData()
+        ttft = 0.0
+        generated_text = ''
+        st = time.perf_counter()
+        output.start_time = st
+        output.request = data
+        most_recent_timestamp = st
         try:
-            async for chunk_bytes in response.content:
-                chunk_bytes = chunk_bytes.strip()
-                if not chunk_bytes:
-                    continue
-                chunk_bytes = chunk_bytes.decode('utf-8')
-                # NOTE: SSE comments (often used as pings) start with a colon.
-                # These are not JSON data payload and should be skipped.
-                if chunk_bytes.startswith(':'):
-                    continue
-
-                chunk = chunk_bytes.removeprefix('data: ')
-
-                if chunk != '[DONE]':
-                    data = json.loads(chunk)
-
-                    yield False, response.status, data
-
-        except Exception as e:
-            logger.error(f'Error in _handle_stream: {e}')
-            yield True, response.status, str(e)
-
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
-        """Handle the HTTP response based on content type and status.
-
-        Args:
-            response: The aiohttp response object
-
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, response_data)
-        """
-        response_status = response.status
-        response_content_type = response.content_type
-        content_type_json = 'application/json'
-        content_type_stream = 'text/event-stream'
-        is_success = (response_status == HTTPStatus.OK)
-
-        if is_success:
-            # Handle successful response with 'text/event-stream' content type
-            if content_type_stream in response_content_type:
-                async for is_error, response_status, content in self._handle_stream(response):
-                    yield (is_error, response_status, content)
-            # Handle successful response with 'application/json' content type
-            elif content_type_json in response_content_type:
-                content = await response.json()
-                yield (False, response_status, content)
-            # Handle other successful responses
-            else:
-                content = await response.read()
-                yield (False, response_status, content.decode('utf-8'))
-        else:
-            # error is always in JSON format
-            error = await response.json()
-            yield (True, response_status, error)
+            async with client_session.post(url=url, data=data, headers=headers) as response:
+                content_type = response.headers.get('Content-Type', '')
+                if response.status == 200:
+                    # Handle streaming responses (SSE)
+                    if 'text/event-stream' in content_type:
+                        handler = StreamedResponseHandler()
+                        async for chunk_bytes in response.content.iter_any():
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            messages = handler.add_chunk(chunk_bytes)
+                            for message in messages:
+                                # NOTE: SSE comments (often used as pings) start with
+                                # a colon. These are not JSON data payload and should
+                                # be skipped.
+                                if message.startswith(':'):
+                                    continue
+
+                                chunk = message.removeprefix('data: ')
+
+                                if chunk != '[DONE]':
+                                    timestamp = time.perf_counter()
+                                    data = json.loads(chunk)
+
+                                    if choices := data.get('choices'):
+                                        content = choices[0]['delta'].get('content')
+                                        # First token
+                                        if ttft == 0.0:
+                                            ttft = timestamp - st
+                                            output.first_chunk_latency = ttft
+
+                                        # Decoding phase
+                                        else:
+                                            output.inter_chunk_latency.append(timestamp - most_recent_timestamp)
+
+                                        generated_text += content or ''
+                                        output.response_messages.append(data)
+                                    elif usage := data.get('usage'):
+                                        output.prompt_tokens = usage.get('prompt_tokens')
+                                        output.completion_tokens = usage.get('completion_tokens')
+
+                                    most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.completed_time = most_recent_timestamp
+                        output.query_latency = most_recent_timestamp - st
+
+                    # Handle non-stream JSON responses
+                    elif 'application/json' in content_type or 'application/' in content_type:
+                        payload: Any
+                        try:
+                            payload = await response.json()
+                        except Exception:
+                            # Fallback to text if JSON parsing fails
+                            payload = await response.text()
+
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        # For non-stream, first chunk equals full latency
+                        output.first_chunk_latency = output.query_latency
+
+                        if isinstance(payload, dict):
+                            # Extract generated text from choices
+                            text = ''
+                            if choices := payload.get('choices'):
+                                first = choices[0] if choices else {}
+                                # Chat Completions format
+                                msg = first.get('message') or {}
+                                if isinstance(msg, dict) and msg.get('content') is not None:
+                                    text = msg.get('content') or ''
+                                else:
+                                    # Legacy Completions format
+                                    text = first.get('text') or ''
+                                generated_text = text
+
+                            # Extract usage if provided
+                            if usage := payload.get('usage'):
+                                output.prompt_tokens = usage.get('prompt_tokens')
+                                output.completion_tokens = usage.get('completion_tokens')
+
+                            output.response_messages.append(payload)
+                        else:
+                            generated_text = str(payload)
+
+                        output.generated_text = generated_text
+                        output.success = True
+
+                    else:
+                        # Unknown successful content-type: read as text
+                        raw = await response.text()
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        output.first_chunk_latency = output.query_latency
+                        output.generated_text = raw
+                        output.response_messages.append(raw)
+                        output.success = True
+                else:
+                    # Try to parse structured error, fallback to reason/text
+                    try:
+                        err_payload = await response.json()
+                        output.error = json.dumps(err_payload, ensure_ascii=False)
+                    except Exception:
+                        try:
+                            output.error = await response.text()
+                        except Exception:
+                            output.error = response.reason or ''
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = ''.join(traceback.format_exception(*exc_info))
+            logger.error(output.error)
+
+        return output
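
The new `StreamedResponseHandler` releases an SSE message only when it is complete: either a `'\n\n'` separator has arrived, the buffered `data:` payload parses as JSON, or it is the `[DONE]` sentinel. A minimal sketch of that buffering behaviour (illustrative only; the import path follows the file list above):

```python
# Illustrative only: exercises the SSE buffering introduced in this release.
# Import path assumed from evalscope/perf/plugin/api/default_api.py above.
from evalscope.perf.plugin.api.default_api import StreamedResponseHandler

handler = StreamedResponseHandler()

# A payload split across two network reads: nothing is emitted until the
# buffered text parses as complete JSON (or a '\n\n' separator arrives).
assert handler.add_chunk(b'data: {"choices": [{"delta": {"con') == []
msgs = handler.add_chunk(b'tent": "Hi"}}]}\n\n')
assert msgs == ['data: {"choices": [{"delta": {"content": "Hi"}}]}']

# The terminal sentinel is emitted even without a trailing separator.
assert handler.add_chunk(b'data: [DONE]') == ['data: [DONE]']
```

In `process_request` the handler is fed `response.content.iter_any()` chunks, so a payload split across reads is decoded only once it is complete.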

evalscope/perf/plugin/api/openai_api.py
@@ -102,7 +102,7 @@ class OpenaiPlugin(DefaultApiPlugin):
             payload.update(param.extra_args)
         return payload
 
-    def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> tuple[int, int]:
         """Parser responses and return number of request and response tokens.
         Only one response for non-stream, multiple responses for stream.
         """
@@ -180,7 +180,7 @@ class OpenaiPlugin(DefaultApiPlugin):
         )
         return input_tokens, output_tokens
 
-    def _count_input_tokens(self, request: Dict) -> int:
+    def _count_input_tokens(self, request_str: str) -> int:
         """Count the number of input tokens in the request.
 
         This method handles different types of requests and calculates tokens for:
@@ -188,13 +188,14 @@
         - Images in multimodal messages (converted to patch tokens)
 
         Args:
-            request (Dict): The request dictionary containing either 'messages' for chat
+            request_str (str): The request json str containing either 'messages' for chat
                 completion or 'prompt' for text completion.
 
         Returns:
            int: The total number of input tokens including text and image tokens.
        """
        input_tokens = 0
+        request = json.loads(request_str)
        if 'messages' in request:
            input_content = self.tokenizer.apply_chat_template(
                request['messages'], tokenize=True, add_generation_prompt=True

evalscope/perf/plugin/datasets/base.py
@@ -15,6 +15,11 @@ class DatasetPluginBase:
             dataset_path (str, optional): The input dataset path. Defaults to None.
         """
         self.query_parameters = query_parameters
+        if query_parameters.tokenizer_path:
+            from modelscope import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(query_parameters.tokenizer_path, trust_remote_code=True)
+        else:
+            self.tokenizer = None
 
     def __next__(self):
         for item in self.build_messages():
@@ -85,3 +90,19 @@ class DatasetPluginBase:
         for url in image_urls:
             message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
         return message
+
+    def check_prompt_length(self, prompt: str) -> Tuple[bool, int]:
+        """Check if the prompt length is within the specified range.
+
+        Args:
+            prompt (str): The input prompt string.
+
+        Returns:
+            Tuple[bool, int]: A tuple containing a boolean indicating whether the prompt is valid and its length.
+        """
+        if self.tokenizer is None:
+            prompt_length = len(prompt)
+        else:
+            prompt_length = len(self.tokenizer.encode(prompt))
+        is_valid = self.query_parameters.min_prompt_length <= prompt_length <= self.query_parameters.max_prompt_length
+        return is_valid, prompt_length
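
The new `check_prompt_length` means the dataset plugins below filter prompts by token count when `--tokenizer-path` is set (falling back to character count otherwise), and the bounds are now inclusive. A standalone sketch of the same check (hypothetical helper for illustration, not the evalscope API):

```python
from typing import Tuple


def check_prompt_length(prompt: str, min_len: int, max_len: int, tokenizer=None) -> Tuple[bool, int]:
    """Mirrors DatasetPluginBase.check_prompt_length: token count when a
    tokenizer is available, character count otherwise (inclusive bounds)."""
    length = len(tokenizer.encode(prompt)) if tokenizer is not None else len(prompt)
    return min_len <= length <= max_len, length


# Without a tokenizer the bounds apply to characters.
ok, n = check_prompt_length('How tall is Mount Everest?', min_len=10, max_len=1024)
print(ok, n)  # True 26
```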

evalscope/perf/plugin/datasets/custom.py
@@ -16,9 +16,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                prompt
-            ) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/line_by_line.py
@@ -17,9 +17,8 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                prompt
-            ) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/longalpaca.py
@@ -22,9 +22,8 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
         ds = self.dataset_json_list(self.query_parameters.dataset_path)
         for item in ds:
             prompt = item['instruction'].strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                prompt
-            ) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/openqa.py
@@ -27,10 +27,8 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
             prompt = item['question'].strip()
-            if (
-                len(prompt) > self.query_parameters.min_prompt_length
-                and len(prompt) < self.query_parameters.max_prompt_length
-            ):
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/random_dataset.py
@@ -12,11 +12,9 @@ class RandomDatasetPlugin(DatasetPluginBase):
     """
 
     def __init__(self, query_parameters: Arguments):
+        assert query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer-path`.' # noqa: E501
         super().__init__(query_parameters)
-        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.' # noqa: E501
 
-        from modelscope import AutoTokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
         self.prefix_length = self.query_parameters.prefix_length
         self.prefix_ids = self.get_random_inputs(self.prefix_length)
         self.template_len = self.get_template_len()

evalscope/perf/utils/benchmark_util.py
@@ -1,4 +1,3 @@
-import time
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple
 
@@ -10,7 +9,7 @@ logger = get_logger()
 
 @dataclass
 class BenchmarkData:
-    request: Any = None
+    request: str = None # json serialized request body
     start_time: float = 0.0
     completed_time: float = 0.0
     chunk_times: List[float] = field(default_factory=list)
@@ -24,24 +23,26 @@ class BenchmarkData:
     time_per_output_token: float = 0.0
     inter_chunk_latency: List[float] = field(default_factory=list)
 
-    prompt_tokens = None
-    completion_tokens = None
-
-    def _calculate_query_stream_metric(self) -> None:
-        self.query_latency = self.completed_time - self.start_time
-        # only for stream responses
-        if len(self.chunk_times) > 1:
-            self.first_chunk_latency = self.chunk_times[0] - self.start_time
-            # remove the first chunk time from the total latency
-            self.time_per_output_token = (self.query_latency - self.first_chunk_latency
-                                          ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
-            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
-        else:
-            self.first_chunk_latency = self.query_latency
+    # response content
+    generated_text: str = ''
+    error: Optional[str] = None
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
 
     def _calculate_tokens(self, api_plugin):
-        self.prompt_tokens, self.completion_tokens = \
-            api_plugin.parse_responses(self.response_messages, request=self.request)
+        if self.prompt_tokens is None or self.completion_tokens is None:
+            self.prompt_tokens, self.completion_tokens = api_plugin.parse_responses(
+                self.response_messages, request=self.request
+            )
+
+        # Calculate time per output token
+        if self.completion_tokens and self.completion_tokens > 1:
+            # tpot = (latency - ttft) / (output_len - 1)
+            self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (self.completion_tokens - 1)
+
+        # Ensure inter-chunk latency is available (compute from chunk_times if needed)
+        if not self.inter_chunk_latency and self.chunk_times:
+            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
 
     def update_gpu_usage(self):
         if check_import('torch', raise_warning=False):
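
`BenchmarkData` now keeps the server-reported `usage` counts when they are present and only falls back to `parse_responses`; time per output token is then derived from the request timeline. With illustrative numbers:

```python
# Illustrative numbers only: tpot = (latency - ttft) / (output_len - 1)
query_latency = 2.40        # seconds from request start to last chunk
first_chunk_latency = 0.35  # time to first token (TTFT)
completion_tokens = 42

time_per_output_token = (query_latency - first_chunk_latency) / (completion_tokens - 1)
print(f'{time_per_output_token * 1000:.1f} ms/token')  # 50.0 ms/token
```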
@@ -79,6 +80,7 @@ class BenchmarkMetrics:
     n_total_prompt_tokens: int = 0
     n_total_completion_tokens: int = 0
     start_time: Optional[float] = None
+    last_completed_time: Optional[float] = None
     total_time: float = 1.0
     n_total_queries: int = 0
     n_time_per_output_token: float = 0.0
@@ -97,9 +99,6 @@ class BenchmarkMetrics:
 
     def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
         self.n_total_queries += 1
-        if self.start_time is None:
-            self.start_time = benchmark_data.start_time
-        self.total_time = time.perf_counter() - self.start_time
 
         if benchmark_data.success:
             self.n_succeed_queries += 1
@@ -108,7 +107,6 @@ class BenchmarkMetrics:
             self.n_total_prompt_tokens += benchmark_data.prompt_tokens
             self.n_total_completion_tokens += benchmark_data.completion_tokens
 
-            benchmark_data._calculate_query_stream_metric()
             self.total_latency += benchmark_data.query_latency
             self.total_first_chunk_latency += benchmark_data.first_chunk_latency
             self.n_time_per_output_token += benchmark_data.time_per_output_token
@@ -117,6 +115,22 @@
             self.n_failed_queries += 1
 
         self.calculate_averages()
+        self.update_total_time(benchmark_data)
+
+    def update_total_time(self, benchmark_data: BenchmarkData):
+        # Use the earliest start_time seen so far
+        if self.start_time is None:
+            self.start_time = benchmark_data.start_time
+        else:
+            self.start_time = min(self.start_time, benchmark_data.start_time)
+        # Track the latest completion time
+        if self.last_completed_time is None:
+            self.last_completed_time = benchmark_data.completed_time
+        else:
+            self.last_completed_time = max(self.last_completed_time, benchmark_data.completed_time)
+        # Compute total_time from request lifecycle timestamps to avoid consumer overhead
+        if self.start_time is not None and self.last_completed_time is not None:
+            self.total_time = max(self.last_completed_time - self.start_time, 0.0)
 
     def calculate_averages(self):
         if self.n_succeed_queries == 0:
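
`total_time` is now the span between the earliest `start_time` and the latest `completed_time` recorded on the requests themselves, instead of `time.perf_counter()` sampled when the metrics consumer runs. A small illustration with made-up timestamps:

```python
# Made-up (start_time, completed_time) pairs as they would appear in BenchmarkData.
requests = [
    (100.0, 102.5),
    (100.2, 101.9),
    (100.4, 103.1),
]

start = min(s for s, _ in requests)  # earliest request start
end = max(c for _, c in requests)    # latest completion
total_time = max(end - start, 0.0)
print(round(total_time, 1))  # 3.1 -> throughput = n_total_queries / total_time
```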

evalscope/perf/utils/db_util.py
@@ -19,7 +19,7 @@ logger = get_logger()
 class DatabaseColumns:
     REQUEST = 'request'
     START_TIME = 'start_time'
-    CHUNK_TIMES = 'chunk_times'
+    INTER_TOKEN_LATENCIES = 'inter_token_latencies'
     SUCCESS = 'success'
     RESPONSE_MESSAGES = 'response_messages'
     COMPLETED_TIME = 'completed_time'
@@ -60,7 +60,7 @@ def create_result_table(cursor):
         f'''CREATE TABLE IF NOT EXISTS result(
             {DatabaseColumns.REQUEST} TEXT,
             {DatabaseColumns.START_TIME} REAL,
-            {DatabaseColumns.CHUNK_TIMES} TEXT,
+            {DatabaseColumns.INTER_TOKEN_LATENCIES} TEXT,
             {DatabaseColumns.SUCCESS} INTEGER,
             {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
             {DatabaseColumns.COMPLETED_TIME} REAL,
@@ -75,15 +75,15 @@
 
 
 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
-    request = encode_data(benchmark_data.request)
-    chunk_times = json.dumps(benchmark_data.chunk_times)
+    request = benchmark_data.request
+    inter_token_latencies = json.dumps(benchmark_data.inter_chunk_latency)
     response_messages = encode_data(benchmark_data.response_messages)
 
     # Columns common to both success and failure cases
     common_columns = (
         request,
         benchmark_data.start_time,
-        chunk_times,
+        inter_token_latencies,
         benchmark_data.success,
         response_messages,
         benchmark_data.completed_time,
@@ -96,7 +96,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
             benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
         )
         query = f"""INSERT INTO result(
-            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
             {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
             {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
             {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
@@ -105,7 +105,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
         cursor.execute(query, common_columns + additional_columns)
     else:
         query = f"""INSERT INTO result(
-            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
             {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
         ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)
@@ -173,20 +173,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     :param result_db_path: Path to the SQLite database file.
     :return: Dictionary of percentiles for various metrics.
     """
-
-    def inter_token_latencies(chunk_times_json: str) -> List[float]:
-        try:
-            chunk_times = json.loads(chunk_times_json)
-            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
-        except (json.JSONDecodeError, TypeError) as e:
-            logger.error(f'Error parsing chunk times: {e}')
-            return []
-
-    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES}, {DatabaseColumns.SUCCESS},
                     {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
                     {DatabaseColumns.PROMPT_TOKENS},
                     {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
-                    FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
+                    FROM result WHERE {DatabaseColumns.SUCCESS}=1''' # noqa: E501
 
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]
 
@@ -202,7 +193,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     # Prepare data for each metric
     inter_token_latencies_all = []
    for row in rows:
-        inter_token_latencies_all.extend(inter_token_latencies(row[col_indices[DatabaseColumns.CHUNK_TIMES]]))
+        try:
+            itl = json.loads(row[col_indices[DatabaseColumns.INTER_TOKEN_LATENCIES]]) or []
+            inter_token_latencies_all.extend(itl)
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing inter token latencies: {e}')
 
     metrics = {
         PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
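
Because the result table now stores each request's inter-token latencies as a JSON array (the `inter_token_latencies` column) rather than raw chunk timestamps, downstream analysis only needs to parse and flatten that column. A sketch of reading it back (the database filename is an assumption; the table and column names come from the diff above):

```python
# Illustrative only: aggregate inter-token latencies from the result database.
import json
import sqlite3

con = sqlite3.connect('benchmark_data.db')  # filename is an assumption
rows = con.execute('SELECT inter_token_latencies FROM result WHERE success=1').fetchall()

itl_all = []
for (itl_json,) in rows:
    try:
        itl_all.extend(json.loads(itl_json) or [])
    except (json.JSONDecodeError, TypeError):
        continue

if itl_all:
    itl_all.sort()
    p99 = itl_all[int(0.99 * (len(itl_all) - 1))]  # rough nearest-rank percentile
    print(f'P99 inter-token latency: {p99 * 1000:.1f} ms')
```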