evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/data_adapter.py +9 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/utils.py +1 -0
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +5 -3
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/completion_parsers.py +7 -0
- evalscope/metrics/llm_judge.py +6 -5
- evalscope/metrics/metrics.py +19 -7
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +2 -0
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +13 -0
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +2 -2
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/utils/io_utils.py +10 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
- tests/cli/test_all.py +18 -2
- tests/cli/test_run.py +25 -37
- tests/perf/test_perf.py +29 -2
- evalscope/models/model.py +0 -189
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/api/default_api.py
@@ -0,0 +1,105 @@
+import aiohttp
+import json
+from http import HTTPStatus
+from typing import Any, AsyncGenerator, Dict, List, Tuple
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.api.base import ApiPluginBase
+from evalscope.perf.utils.local_server import ServerSentEvent
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class DefaultApiPlugin(ApiPluginBase):
+    """Default implementation of API plugin with common HTTP handling methods."""
+
+    def __init__(self, param: Arguments):
+        super().__init__(param)
+
+    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Process the HTTP request and handle the response.
+
+        Args:
+            client_session: The aiohttp client session
+            url: The request URL
+            headers: The request headers
+            body: The request body
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
+        try:
+            headers = {'Content-Type': 'application/json', **headers}
+            data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
+            async with client_session.request('POST', url=url, data=data, headers=headers) as response:
+                async for result in self._handle_response(response):
+                    yield result
+        except Exception as e:
+            logger.error(f'Error in process_request: {e}')
+            yield (True, None, str(e))
+
+    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Handle streaming response from server-sent events.
+
+        Args:
+            response: The aiohttp response object containing a stream
+
+        Yields:
+            Tuple[bool, int, Any]: (is_error, status_code, data)
+        """
+        try:
+            async for chunk_bytes in response.content:
+                chunk_bytes = chunk_bytes.strip()
+                if not chunk_bytes:
+                    continue
+                chunk_bytes = chunk_bytes.decode('utf-8')
+                # NOTE: SSE comments (often used as pings) start with a colon.
+                # These are not JSON data payload and should be skipped.
+                if chunk_bytes.startswith(':'):
+                    continue
+
+                chunk = chunk_bytes.removeprefix('data: ')
+
+                if chunk != '[DONE]':
+                    data = json.loads(chunk)
+
+                    yield False, response.status, data
+
+        except Exception as e:
+            logger.error(f'Error in _handle_stream: {e}')
+            yield True, response.status, str(e)
+
+    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Handle the HTTP response based on content type and status.
+
+        Args:
+            response: The aiohttp response object
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
+        response_status = response.status
+        response_content_type = response.content_type
+        content_type_json = 'application/json'
+        content_type_stream = 'text/event-stream'
+        is_success = (response_status == HTTPStatus.OK)
+
+        if is_success:
+            # Handle successful response with 'text/event-stream' content type
+            if content_type_stream in response_content_type:
+                async for is_error, response_status, content in self._handle_stream(response):
+                    yield (is_error, response_status, content)
+            # Handle successful response with 'application/json' content type
+            elif content_type_json in response_content_type:
+                content = await response.json()
+                yield (False, response_status, json.dumps(content, ensure_ascii=False))
+            # Handle other successful responses
+            else:
+                content = await response.read()
+                yield (False, response_status, content.decode('utf-8'))
+        else:
+            # error is always in JSON format
+            error = await response.json()
+            yield (True, response_status, json.dumps(error, ensure_ascii=False))
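For readers unfamiliar with the server-sent-events handling that _handle_stream implements, the following standalone aiohttp sketch follows the same pattern; the endpoint URL and request payload are hypothetical, and this is not the evalscope class itself.

import asyncio
import json

import aiohttp


async def read_sse(url: str, body: dict):
    # Mirror of the stream handling above: skip blank lines and ':' comments,
    # strip the 'data: ' prefix, and stop decoding JSON at '[DONE]'.
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=body) as resp:
            async for raw in resp.content:
                line = raw.strip().decode('utf-8')
                if not line or line.startswith(':'):
                    continue
                chunk = line.removeprefix('data: ')
                if chunk != '[DONE]':
                    yield json.loads(chunk)


async def main():
    # Hypothetical OpenAI-compatible endpoint; adjust to your own server.
    body = {'model': 'my-model', 'messages': [{'role': 'user', 'content': 'hi'}], 'stream': True}
    async for event in read_sse('http://127.0.0.1:8000/v1/chat/completions', body):
        delta = event.get('choices', [{}])[0].get('delta', {})
        print(delta.get('content', ''), end='')


if __name__ == '__main__':
    asyncio.run(main())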
evalscope/perf/plugin/api/openai_api.py
@@ -1,9 +1,9 @@
 import json
 import os
-from typing import Any, Dict,
+from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
-from evalscope.perf.plugin.api.
+from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
 from evalscope.utils.logger import get_logger
 
@@ -11,25 +11,25 @@ logger = get_logger()
 
 
 @register_api(['openai', 'local_vllm', 'local'])
-class OpenaiPlugin(
+class OpenaiPlugin(DefaultApiPlugin):
     """Base of openai interface."""
 
-    def __init__(self,
-        """
+    def __init__(self, param: Arguments):
+        """Initialize the OpenaiPlugin.
 
         Args:
-
-
-
+            param (Arguments): Configuration object containing parameters
+                such as the tokenizer path and model details. If a tokenizer
+                path is provided, it is used to initialize the tokenizer.
         """
-        super().__init__(
-        if
+        super().__init__(param=param)
+        if param.tokenizer_path is not None:
             from modelscope import AutoTokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(
+            self.tokenizer = AutoTokenizer.from_pretrained(param.tokenizer_path)
         else:
            self.tokenizer = None
 
-    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments = None) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
@@ -42,6 +42,7 @@ class OpenaiPlugin(ApiPluginBase):
         Returns:
             Dict: The request body. None if prompt format is error.
         """
+        param = param or self.param
        try:
            if param.query_template is not None:
                if param.query_template.startswith('@'):
@@ -54,8 +55,6 @@ class OpenaiPlugin(ApiPluginBase):
                else:
                    query = json.loads(param.query_template)
 
-                    if 'stream' in query.keys():
-                        param.stream = query['stream']
                    # replace template messages with input messages.
                    query['messages'] = messages
            elif isinstance(messages, str):
@@ -107,7 +106,7 @@ class OpenaiPlugin(ApiPluginBase):
 
        # when stream, the last response is the full usage
        # when non-stream, the last response is the first response
-        last_response_js =
+        last_response_js = responses[-1]
        if 'usage' in last_response_js and last_response_js['usage']:
            input_tokens = last_response_js['usage']['prompt_tokens']
            output_tokens = last_response_js['usage']['completion_tokens']
@@ -116,11 +115,10 @@ class OpenaiPlugin(ApiPluginBase):
        # no usage information in the response, parse the response to get the tokens
        delta_contents = {}
        for response in responses:
-
-
-                self.__process_response_object(js, delta_contents)
+            if 'object' in response:
+                self.__process_response_object(response, delta_contents)
            else:
-                self.__process_no_object(
+                self.__process_no_object(response, delta_contents)
 
        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
        return input_tokens, output_tokens
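As a small illustration of the query_template path kept in these hunks, the template is parsed as JSON and its messages field is then overwritten with the messages produced by the dataset plugin; the template contents below are made up.

import json

# Hypothetical template; with the change above, a 'stream' key in the template
# no longer overrides param.stream.
query_template = '{"model": "my-model", "stream": true, "messages": []}'
messages = [{'role': 'user', 'content': 'hello'}]

query = json.loads(query_template)
query['messages'] = messages  # replace template messages with input messages
print(json.dumps(query, ensure_ascii=False))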
evalscope/perf/plugin/datasets/__init__.py
@@ -1,7 +1,10 @@
-from
-from
-from
-from
-from
-from
-from
+from .base import DatasetPluginBase
+from .custom import CustomDatasetPlugin
+from .flickr8k import FlickrDatasetPlugin
+from .kontext_bench import KontextDatasetPlugin
+from .line_by_line import LineByLineDatasetPlugin
+from .longalpaca import LongAlpacaDatasetPlugin
+from .openqa import OpenqaDatasetPlugin
+from .random_dataset import RandomDatasetPlugin
+from .random_vl_dataset import RandomVLDatasetPlugin
+from .speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/base.py
@@ -1,7 +1,7 @@
 import json
 import sys
 from abc import abstractmethod
-from typing import Any, Dict, Iterator, List, Tuple
+from typing import Any, Dict, Iterator, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 
@@ -64,3 +64,24 @@ class DatasetPluginBase:
            data = json.loads(content)
            for item in data:
                yield item
+
+    def create_message(self, text: str, image_urls: Union[List[str], str] = None, role: str = 'user') -> Dict:
+        """Create a message with text and optional image URLs.
+
+        Args:
+            text (str): The text content of the message.
+            image_urls (List[str], optional): List of image URLs. Defaults to None.
+            role (str, optional): The role of the message sender. Defaults to "user".
+
+        Returns:
+            Dict: A dictionary representing the message.
+        """
+        if image_urls is None:
+            message = {'role': role, 'content': text}
+        else:
+            message = {'role': role, 'content': [{'type': 'text', 'text': text}]}
+            if isinstance(image_urls, str):
+                image_urls = [image_urls]
+            for url in image_urls:
+                message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
+        return message
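To make the new helper concrete, the messages it builds follow the OpenAI chat format; the standalone sketch below reproduces the same shapes (it mirrors the method added above rather than importing the evalscope class).

from typing import Dict, List, Union


def create_message(text: str, image_urls: Union[List[str], str] = None, role: str = 'user') -> Dict:
    # Same logic as DatasetPluginBase.create_message in the hunk above.
    if image_urls is None:
        return {'role': role, 'content': text}
    if isinstance(image_urls, str):
        image_urls = [image_urls]
    content = [{'type': 'text', 'text': text}]
    for url in image_urls:
        content.append({'type': 'image_url', 'image_url': {'url': url}})
    return {'role': role, 'content': content}


print(create_message('Hello'))
# {'role': 'user', 'content': 'Hello'}
print(create_message('Describe this image.', 'data:image/jpeg;base64,...'))
# {'role': 'user', 'content': [{'type': 'text', 'text': 'Describe this image.'},
#                              {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,...'}}]}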
evalscope/perf/plugin/datasets/custom.py
@@ -19,7 +19,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
            if len(prompt) > self.query_parameters.min_prompt_length and len(
                    prompt) < self.query_parameters.max_prompt_length:
                if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                else:
                    yield prompt
 
evalscope/perf/plugin/datasets/flickr8k.py
@@ -1,18 +1,9 @@
-import base64
-from io import BytesIO
-from PIL import Image
 from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
-
-
-def PIL_to_base64(image: Image.Image) -> str:
-    buffered = BytesIO()
-    image.save(buffered, format='JPEG')
-    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
-    return img_str
+from evalscope.utils.io_utils import PIL_to_base64
 
 
 @register_dataset('flickr8k')
@@ -31,21 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
        for item in dataset:
            pil_image = item['jpg']
            text = item['txt']
-
+            base64_image = PIL_to_base64(pil_image)
 
-
-
-                'user',
-                'content': [
-                    {
-                        'type': 'text',
-                        'text': text,
-                    },
-                    {
-                        'type': 'image_url',
-                        'image_url': {
-                            'url': f'data:image/jpeg;base64,{base64_iamge}',
-                        }
-                    },
-                ],
-            }]
+            message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+            yield [message]
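The inline JPEG helper removed here now lives in evalscope/utils/io_utils.py (the +10 lines in the file list). Its exact implementation is not shown in this diff; a plausible sketch, combining the removed code with the format argument used by the new random_vl plugin, would be:

import base64
from io import BytesIO

from PIL import Image


def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
    # Assumed shape of the relocated helper: encode a PIL image to base64 (no data-URI prefix).
    buffered = BytesIO()
    image.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


img = Image.new('RGB', (64, 64), (200, 30, 30))
print(PIL_to_base64(img, format='PNG')[:32] + '...')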
evalscope/perf/plugin/datasets/kontext_bench.py
@@ -0,0 +1,28 @@
+from typing import Any, Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('kontext_bench')
+class KontextDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    Datasets: https://modelscope.cn/datasets/black-forest-labs/kontext-bench/dataPeview
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        from modelscope.msdatasets import MsDataset
+        dataset = MsDataset.load('black-forest-labs/kontext-bench', subset_name='default', split='test')
+
+        for item in dataset:
+            pil_image = item['image']
+            text = item['instruction']
+            base64_image = PIL_to_base64(pil_image)
+
+            message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+            yield [message]
evalscope/perf/plugin/datasets/line_by_line.py
@@ -20,6 +20,7 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
            if len(prompt) > self.query_parameters.min_prompt_length and len(
                    prompt) < self.query_parameters.max_prompt_length:
                if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                else:
                    yield prompt
evalscope/perf/plugin/datasets/longalpaca.py
@@ -25,6 +25,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
            if len(prompt) > self.query_parameters.min_prompt_length and len(
                    prompt) < self.query_parameters.max_prompt_length:
                if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                else:
                    yield prompt
evalscope/perf/plugin/datasets/openqa.py
@@ -30,6 +30,7 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
            if (len(prompt) > self.query_parameters.min_prompt_length
                    and len(prompt) < self.query_parameters.max_prompt_length):
                if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                else:
                    yield prompt
evalscope/perf/plugin/datasets/random_dataset.py
@@ -37,12 +37,23 @@ class RandomDatasetPlugin(DatasetPluginBase):
        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
 
+        vocab_size = self.tokenizer.vocab_size
+
        for i in range(self.number):
-
-
+            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % vocab_size).tolist()
+            token_sequence = self.prefix_ids + inner_seq
+            prompt = self.tokenizer.decode(token_sequence)
+
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            total_input_len = self.prefix_length + int(input_lens[i])
+            re_encoded_sequence = self.tokenizer.encode(prompt, add_special_tokens=False)[:total_input_len]
+            prompt = self.tokenizer.decode(re_encoded_sequence)
 
            if self.query_parameters.apply_chat_template:
-
+                message = self.create_message(prompt)
+                yield [message]
            else:
                yield prompt
 
@@ -53,6 +64,6 @@ class RandomDatasetPlugin(DatasetPluginBase):
        return input_ids
 
    def get_template_len(self):
-        empty_message = [
+        empty_message = [self.create_message(text='')]
        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
        return len(template)
evalscope/perf/plugin/datasets/random_vl_dataset.py
@@ -0,0 +1,80 @@
+import random
+from PIL import Image, ImageDraw
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('random_vl')
+class RandomVLDatasetPlugin(RandomDatasetPlugin):
+    """Random Vision-Language Dataset Plugin for multimodal model stress testing."""
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+        # Vision-language specific parameters
+        self.image_width = query_parameters.image_width
+        self.image_height = query_parameters.image_height
+        self.image_format = query_parameters.image_format
+        self.image_num = query_parameters.image_num
+
+        assert self.image_num > 0, 'image_num must be greater than 0.'
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        # Reuse parent's message generation logic
+        for messages in super().build_messages():
+            prompt = messages[0]['content'] if isinstance(messages[0], dict) else messages[0]
+
+            # Generate random images based on image_num
+            images_b64 = []
+            for _ in range(self.image_num):
+                images_b64.append(f'data:image/png;base64,{self._generate_random_image_b64()}')
+
+            message = self.create_message(text=prompt, image_urls=images_b64)
+            yield [message]
+
+    def _generate_random_image_b64(self) -> str:
+        """Generate a random image and return as base64 string."""
+        # Create a random colored image
+        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+        image = Image.new(self.image_format, (self.image_width, self.image_height), color)
+
+        # Add some random shapes for variety
+        draw = ImageDraw.Draw(image)
+        for _ in range(random.randint(1, 5)):
+            shape_type = random.choice(['rectangle', 'ellipse', 'line'])
+
+            # Generate two random points
+            x1 = random.randint(0, self.image_width - 1)
+            y1 = random.randint(0, self.image_height - 1)
+            x2 = random.randint(0, self.image_width - 1)
+            y2 = random.randint(0, self.image_height - 1)
+
+            # Ensure proper coordinate ordering (x1 <= x2, y1 <= y2)
+            if x1 > x2:
+                x1, x2 = x2, x1
+            if y1 > y2:
+                y1, y2 = y2, y1
+
+            # Ensure we have at least a 1-pixel difference
+            if x1 == x2:
+                x2 = min(x1 + 1, self.image_width - 1)
+            if y1 == y2:
+                y2 = min(y1 + 1, self.image_height - 1)
+
+            coords = [x1, y1, x2, y2]
+
+            shape_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+
+            if shape_type == 'rectangle':
+                draw.rectangle(coords, fill=shape_color)
+            elif shape_type == 'ellipse':
+                draw.ellipse(coords, fill=shape_color)
+            else:
+                draw.line(coords, fill=shape_color, width=random.randint(1, 5))
+
+        # Convert to base64
+        return PIL_to_base64(image, format='PNG')
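Since every request embeds its images as base64 data URIs, payload size grows quickly with the image dimensions and image_num. A rough standalone estimate (illustrative sizes only, not the plugin's defaults):

import base64
import os
from io import BytesIO

from PIL import Image

# Worst-case-ish payload: random pixels barely compress, so the PNG stays large.
noise = Image.frombytes('RGB', (512, 512), os.urandom(512 * 512 * 3))
buf = BytesIO()
noise.save(buf, format='PNG')
print(f'~{len(base64.b64encode(buf.getvalue())) / 1024:.0f} KiB of base64 for one 512x512 noise image')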
evalscope/perf/plugin/registry.py
@@ -1,23 +1,25 @@
-from typing import Any, List, Type, Union
+from typing import TYPE_CHECKING, Any, List, Type, Union
 
+if TYPE_CHECKING:
+    from .api import ApiPluginBase
+    from .datasets import DatasetPluginBase
 
-class PluginRegistry:
-
-    def __init__(self):
-        self._registry = {}
 
-
-
-        return cls
+class PluginRegistry:
+    _registry = {}
 
-
-
+    @classmethod
+    def register(cls, name, plugin_cls):
+        cls._registry[name] = plugin_cls
+        return plugin_cls
 
-
-
+    @classmethod
+    def get_class(cls, name):
+        return cls._registry[name]
 
-
-
+    @classmethod
+    def all_classes(cls):
+        return list(cls._registry.keys())
 
 
 def register_dataset(name: Union[str, List[str]]):
@@ -50,5 +52,23 @@ def register_api(name: Union[str, List[str]]):
    return class_decorator
 
 
-DatasetRegistry
-
+class DatasetRegistry(PluginRegistry):
+    """Registry for dataset plugins."""
+    _registry = {}
+
+    @classmethod
+    def get_class(cls, name: str) -> Type['DatasetPluginBase']:
+        if name not in cls._registry:
+            raise ValueError(f"Dataset plugin '{name}' is not registered.")
+        return cls._registry[name]
+
+
+class ApiRegistry(PluginRegistry):
+    """Registry for API plugins."""
+    _registry = {}
+
+    @classmethod
+    def get_class(cls, name: str) -> Type['ApiPluginBase']:
+        if name not in cls._registry:
+            raise ValueError(f"API plugin '{name}' is not registered.")
+        return cls._registry[name]
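A quick illustration of why each subclass redefines _registry = {}: without it, item assignment through a classmethod mutates the dict inherited from PluginRegistry, so all registries would share one namespace. The subclasses below are hypothetical, not the evalscope ones.

class PluginRegistry:
    _registry = {}

    @classmethod
    def register(cls, name, plugin_cls):
        cls._registry[name] = plugin_cls
        return plugin_cls


class SharedRegistry(PluginRegistry):
    pass  # no own _registry: writes land in PluginRegistry._registry


class OwnRegistry(PluginRegistry):
    _registry = {}  # isolated, like DatasetRegistry and ApiRegistry above


SharedRegistry.register('a', object)
OwnRegistry.register('b', object)
print(PluginRegistry._registry)  # {'a': <class 'object'>}
print(OwnRegistry._registry)     # {'b': <class 'object'>}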
evalscope/perf/utils/benchmark_util.py
@@ -20,25 +20,24 @@ class BenchmarkData:
    # late init
    query_latency: float = 0.0
    first_chunk_latency: float = 0.0
-    n_chunks: int = 0
-    n_chunks_time: float = 0.0
    max_gpu_memory_cost = 0
    time_per_output_token: float = 0.0
+    inter_chunk_latency: List[float] = field(default_factory=list)
 
    prompt_tokens = None
    completion_tokens = None
 
-    def _calculate_query_stream_metric(self) ->
+    def _calculate_query_stream_metric(self) -> None:
        self.query_latency = self.completed_time - self.start_time
+        # only for stream responses
        if len(self.chunk_times) > 1:
            self.first_chunk_latency = self.chunk_times[0] - self.start_time
-
-            self.
+            # remove the first chunk time from the total latency
+            self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (
+                self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
+            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
        else:
            self.first_chunk_latency = self.query_latency
-            self.n_chunks = 1
-            self.n_chunks_time = self.query_latency
-        self.time_per_output_token = self.n_chunks_time / self.n_chunks if self.n_chunks != 0 else 0.0
 
    def _calculate_tokens(self, api_plugin):
        self.prompt_tokens, self.completion_tokens = \
@@ -63,10 +62,9 @@ class Metrics:
    AVERAGE_LATENCY = 'Average latency (s)'
    AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
    AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+    AVERAGE_INTER_TOKEN_LATENCY = 'Average inter-token latency (s)'
    AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
    AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
-    AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
-    AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
 
 
 @dataclass
@@ -76,25 +74,23 @@ class BenchmarkMetrics:
    n_failed_queries: int = 0
    total_first_chunk_latency: float = 0.0
    total_latency: float = 0.0
-    n_total_chunks: int = 0
    n_total_prompt_tokens: int = 0
    n_total_completion_tokens: int = 0
-    total_chunks_time: float = 0.0
    start_time: Optional[float] = None
    total_time: float = 1.0
    n_total_queries: int = 0
    n_time_per_output_token: float = 0.0
+    n_total_inter_token_latency: List[float] = field(default_factory=list)
 
    avg_first_chunk_latency: float = -1
    avg_latency: float = -1
-    n_avg_chunks: float = -1
-    avg_chunk_time: float = -1
    avg_prompt_tokens: float = -1
    avg_completion_tokens: float = -1
    avg_input_token_per_seconds: float = -1
    avg_output_token_per_seconds: float = -1
    avg_total_token_per_seconds: float = -1
    avg_time_per_token: float = -1
+    avg_inter_token_latency: float = -1
    qps: float = -1
 
    def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
@@ -113,9 +109,8 @@ class BenchmarkMetrics:
            benchmark_data._calculate_query_stream_metric()
            self.total_latency += benchmark_data.query_latency
            self.total_first_chunk_latency += benchmark_data.first_chunk_latency
-            self.n_total_chunks += benchmark_data.n_chunks
-            self.total_chunks_time += benchmark_data.n_chunks_time
            self.n_time_per_output_token += benchmark_data.time_per_output_token
+            self.n_total_inter_token_latency += benchmark_data.inter_chunk_latency
        else:
            self.n_failed_queries += 1
 
@@ -127,8 +122,6 @@ class BenchmarkMetrics:
        try:
            self.avg_first_chunk_latency = self.total_first_chunk_latency / self.n_succeed_queries
            self.avg_latency = self.total_latency / self.n_succeed_queries
-            self.n_avg_chunks = self.n_total_chunks / self.n_succeed_queries
-            self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
            self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
            self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
            self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
@@ -136,6 +129,8 @@ class BenchmarkMetrics:
            self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
                                                + self.n_total_completion_tokens) / self.total_time
            self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
+            self.avg_inter_token_latency = sum(self.n_total_inter_token_latency) / len(
+                self.n_total_inter_token_latency) if self.n_total_inter_token_latency else 0.0
            self.qps = self.n_succeed_queries / self.total_time
        except ZeroDivisionError as e:
            logger.exception(e)
@@ -154,9 +149,8 @@ class BenchmarkMetrics:
            Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
            Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
            Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+            Metrics.AVERAGE_INTER_TOKEN_LATENCY: round(self.avg_inter_token_latency, default_ndigits),
            Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
            Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
-            Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
-            Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
        }
        return message