evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (66)
  1. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  2. evalscope/benchmarks/data_adapter.py +9 -4
  3. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
  5. evalscope/benchmarks/hle/__init__.py +0 -0
  6. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  8. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  9. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  10. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  11. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  12. evalscope/benchmarks/utils.py +1 -0
  13. evalscope/constants.py +5 -21
  14. evalscope/evaluator/__init__.py +1 -1
  15. evalscope/evaluator/evaluator.py +5 -3
  16. evalscope/metrics/__init__.py +3 -1
  17. evalscope/metrics/completion_parsers.py +7 -0
  18. evalscope/metrics/llm_judge.py +6 -5
  19. evalscope/metrics/metrics.py +19 -7
  20. evalscope/models/__init__.py +4 -8
  21. evalscope/models/adapters/__init__.py +4 -9
  22. evalscope/models/adapters/base_adapter.py +4 -0
  23. evalscope/models/adapters/bfcl_adapter.py +2 -0
  24. evalscope/models/adapters/chat_adapter.py +3 -0
  25. evalscope/models/adapters/choice_adapter.py +4 -0
  26. evalscope/models/adapters/custom_adapter.py +7 -3
  27. evalscope/models/adapters/server_adapter.py +2 -0
  28. evalscope/models/adapters/t2i_adapter.py +3 -0
  29. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  30. evalscope/models/register.py +0 -14
  31. evalscope/perf/arguments.py +13 -0
  32. evalscope/perf/benchmark.py +38 -39
  33. evalscope/perf/http_client.py +30 -86
  34. evalscope/perf/main.py +2 -2
  35. evalscope/perf/plugin/__init__.py +3 -2
  36. evalscope/perf/plugin/api/__init__.py +4 -3
  37. evalscope/perf/plugin/api/base.py +22 -4
  38. evalscope/perf/plugin/api/custom_api.py +212 -55
  39. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  40. evalscope/perf/plugin/api/default_api.py +105 -0
  41. evalscope/perf/plugin/api/openai_api.py +17 -19
  42. evalscope/perf/plugin/datasets/__init__.py +10 -7
  43. evalscope/perf/plugin/datasets/base.py +22 -1
  44. evalscope/perf/plugin/datasets/custom.py +2 -1
  45. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  46. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  47. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  48. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  49. evalscope/perf/plugin/datasets/openqa.py +2 -1
  50. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  51. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  52. evalscope/perf/plugin/registry.py +36 -16
  53. evalscope/perf/utils/benchmark_util.py +14 -20
  54. evalscope/perf/utils/db_util.py +79 -61
  55. evalscope/utils/io_utils.py +10 -0
  56. evalscope/version.py +2 -2
  57. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
  58. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
  59. tests/cli/test_all.py +18 -2
  60. tests/cli/test_run.py +25 -37
  61. tests/perf/test_perf.py +29 -2
  62. evalscope/models/model.py +0 -189
  63. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  64. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  65. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  66. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0

evalscope/perf/plugin/api/default_api.py
@@ -0,0 +1,105 @@
+ import aiohttp
+ import json
+ from http import HTTPStatus
+ from typing import Any, AsyncGenerator, Dict, List, Tuple
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.api.base import ApiPluginBase
+ from evalscope.perf.utils.local_server import ServerSentEvent
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class DefaultApiPlugin(ApiPluginBase):
+     """Default implementation of API plugin with common HTTP handling methods."""
+
+     def __init__(self, param: Arguments):
+         super().__init__(param)
+
+     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
+                               body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+         """Process the HTTP request and handle the response.
+
+         Args:
+             client_session: The aiohttp client session
+             url: The request URL
+             headers: The request headers
+             body: The request body
+
+         Yields:
+             Tuple[bool, int, str]: (is_error, status_code, response_data)
+         """
+         try:
+             headers = {'Content-Type': 'application/json', **headers}
+             data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
+             async with client_session.request('POST', url=url, data=data, headers=headers) as response:
+                 async for result in self._handle_response(response):
+                     yield result
+         except Exception as e:
+             logger.error(f'Error in process_request: {e}')
+             yield (True, None, str(e))
+
+     async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+         """Handle streaming response from server-sent events.
+
+         Args:
+             response: The aiohttp response object containing a stream
+
+         Yields:
+             Tuple[bool, int, Any]: (is_error, status_code, data)
+         """
+         try:
+             async for chunk_bytes in response.content:
+                 chunk_bytes = chunk_bytes.strip()
+                 if not chunk_bytes:
+                     continue
+                 chunk_bytes = chunk_bytes.decode('utf-8')
+                 # NOTE: SSE comments (often used as pings) start with a colon.
+                 # These are not JSON data payload and should be skipped.
+                 if chunk_bytes.startswith(':'):
+                     continue
+
+                 chunk = chunk_bytes.removeprefix('data: ')
+
+                 if chunk != '[DONE]':
+                     data = json.loads(chunk)
+
+                     yield False, response.status, data
+
+         except Exception as e:
+             logger.error(f'Error in _handle_stream: {e}')
+             yield True, response.status, str(e)
+
+     async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+         """Handle the HTTP response based on content type and status.
+
+         Args:
+             response: The aiohttp response object
+
+         Yields:
+             Tuple[bool, int, str]: (is_error, status_code, response_data)
+         """
+         response_status = response.status
+         response_content_type = response.content_type
+         content_type_json = 'application/json'
+         content_type_stream = 'text/event-stream'
+         is_success = (response_status == HTTPStatus.OK)
+
+         if is_success:
+             # Handle successful response with 'text/event-stream' content type
+             if content_type_stream in response_content_type:
+                 async for is_error, response_status, content in self._handle_stream(response):
+                     yield (is_error, response_status, content)
+             # Handle successful response with 'application/json' content type
+             elif content_type_json in response_content_type:
+                 content = await response.json()
+                 yield (False, response_status, json.dumps(content, ensure_ascii=False))
+             # Handle other successful responses
+             else:
+                 content = await response.read()
+                 yield (False, response_status, content.decode('utf-8'))
+         else:
+             # error is always in JSON format
+             error = await response.json()
+             yield (True, response_status, json.dumps(error, ensure_ascii=False))
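
The new DefaultApiPlugin centralizes the HTTP plumbing that individual perf API plugins previously re-implemented: process_request POSTs a JSON body and yields (is_error, status_code, data) tuples, delegating to _handle_stream for text/event-stream responses and to _handle_response for everything else. A minimal consumption sketch, assuming a plugin instance has already been built from evalscope's Arguments (the URL and payload below are placeholders):

import asyncio
import aiohttp

async def run_one_request(plugin, url: str, body: dict) -> None:
    """Drive a single request through DefaultApiPlugin.process_request.

    `plugin` is assumed to be an already constructed DefaultApiPlugin
    (or a subclass such as OpenaiPlugin) built from evalscope's Arguments.
    """
    async with aiohttp.ClientSession() as session:
        # Each yielded tuple is (is_error, status_code, data): streamed SSE
        # chunks arrive as parsed dicts, plain JSON bodies as serialized strings.
        async for is_error, status, data in plugin.process_request(
                session, url=url, headers={}, body=body):
            print(is_error, status, data)

# Example call (placeholder URL and payload):
# asyncio.run(run_one_request(plugin, 'http://localhost:8000/v1/chat/completions',
#                             {'model': 'my-model', 'messages': [{'role': 'user', 'content': 'hi'}]}))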

evalscope/perf/plugin/api/openai_api.py
@@ -1,9 +1,9 @@
  import json
  import os
- from typing import Any, Dict, Iterator, List, Union
+ from typing import Any, Dict, List, Tuple, Union
 
  from evalscope.perf.arguments import Arguments
- from evalscope.perf.plugin.api.base import ApiPluginBase
+ from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
  from evalscope.perf.plugin.registry import register_api
  from evalscope.utils.logger import get_logger
 
@@ -11,25 +11,25 @@ logger = get_logger()
 
 
  @register_api(['openai', 'local_vllm', 'local'])
- class OpenaiPlugin(ApiPluginBase):
+ class OpenaiPlugin(DefaultApiPlugin):
      """Base of openai interface."""
 
-     def __init__(self, mode_path: str):
-         """Init the plugin
+     def __init__(self, param: Arguments):
+         """Initialize the OpenaiPlugin.
 
          Args:
-             mode_path (str): The model path, we use the tokenizer
-                 weight in the model to calculate the number of the
-                 input and output tokens.
+             param (Arguments): Configuration object containing parameters
+                 such as the tokenizer path and model details. If a tokenizer
+                 path is provided, it is used to initialize the tokenizer.
          """
-         super().__init__(model_path=mode_path)
-         if mode_path is not None:
+         super().__init__(param=param)
+         if param.tokenizer_path is not None:
              from modelscope import AutoTokenizer
-             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
+             self.tokenizer = AutoTokenizer.from_pretrained(param.tokenizer_path)
          else:
              self.tokenizer = None
 
-     def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
+     def build_request(self, messages: Union[List[Dict], str], param: Arguments = None) -> Dict:
          """Build the openai format request based on prompt, dataset
 
          Args:
@@ -42,6 +42,7 @@ class OpenaiPlugin(ApiPluginBase):
          Returns:
              Dict: The request body. None if prompt format is error.
          """
+         param = param or self.param
          try:
              if param.query_template is not None:
                  if param.query_template.startswith('@'):
@@ -54,8 +55,6 @@ class OpenaiPlugin(ApiPluginBase):
                  else:
                      query = json.loads(param.query_template)
 
-                 if 'stream' in query.keys():
-                     param.stream = query['stream']
                  # replace template messages with input messages.
                  query['messages'] = messages
              elif isinstance(messages, str):
@@ -107,7 +106,7 @@ class OpenaiPlugin(ApiPluginBase):
 
          # when stream, the last response is the full usage
          # when non-stream, the last response is the first response
-         last_response_js = json.loads(responses[-1])
+         last_response_js = responses[-1]
          if 'usage' in last_response_js and last_response_js['usage']:
              input_tokens = last_response_js['usage']['prompt_tokens']
              output_tokens = last_response_js['usage']['completion_tokens']
@@ -116,11 +115,10 @@ class OpenaiPlugin(ApiPluginBase):
              # no usage information in the response, parse the response to get the tokens
              delta_contents = {}
              for response in responses:
-                 js = json.loads(response)
-                 if 'object' in js:
-                     self.__process_response_object(js, delta_contents)
+                 if 'object' in response:
+                     self.__process_response_object(response, delta_contents)
                  else:
-                     self.__process_no_object(js, delta_contents)
+                     self.__process_no_object(response, delta_contents)
 
              input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
          return input_tokens, output_tokens

evalscope/perf/plugin/datasets/__init__.py
@@ -1,7 +1,10 @@
- from evalscope.perf.plugin.datasets.custom import CustomDatasetPlugin
- from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
- from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
- from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
- from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
- from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
- from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
+ from .base import DatasetPluginBase
+ from .custom import CustomDatasetPlugin
+ from .flickr8k import FlickrDatasetPlugin
+ from .kontext_bench import KontextDatasetPlugin
+ from .line_by_line import LineByLineDatasetPlugin
+ from .longalpaca import LongAlpacaDatasetPlugin
+ from .openqa import OpenqaDatasetPlugin
+ from .random_dataset import RandomDatasetPlugin
+ from .random_vl_dataset import RandomVLDatasetPlugin
+ from .speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
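
With this rewrite the dataset plugins, including the new KontextDatasetPlugin and RandomVLDatasetPlugin, are re-exported from the package itself, so they can be pulled in with a single import (names taken from the __init__ shown above):

# All of these names are re-exported by evalscope/perf/plugin/datasets/__init__.py
from evalscope.perf.plugin.datasets import (
    DatasetPluginBase,
    KontextDatasetPlugin,
    RandomDatasetPlugin,
    RandomVLDatasetPlugin,
)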

evalscope/perf/plugin/datasets/base.py
@@ -1,7 +1,7 @@
  import json
  import sys
  from abc import abstractmethod
- from typing import Any, Dict, Iterator, List, Tuple
+ from typing import Any, Dict, Iterator, List, Tuple, Union
 
  from evalscope.perf.arguments import Arguments
 
@@ -64,3 +64,24 @@ class DatasetPluginBase:
          data = json.loads(content)
          for item in data:
              yield item
+
+     def create_message(self, text: str, image_urls: Union[List[str], str] = None, role: str = 'user') -> Dict:
+         """Create a message with text and optional image URLs.
+
+         Args:
+             text (str): The text content of the message.
+             image_urls (List[str], optional): List of image URLs. Defaults to None.
+             role (str, optional): The role of the message sender. Defaults to "user".
+
+         Returns:
+             Dict: A dictionary representing the message.
+         """
+         if image_urls is None:
+             message = {'role': role, 'content': text}
+         else:
+             message = {'role': role, 'content': [{'type': 'text', 'text': text}]}
+             if isinstance(image_urls, str):
+                 image_urls = [image_urls]
+             for url in image_urls:
+                 message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
+         return message
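
create_message gives every dataset plugin one place to build either a plain text message or an OpenAI-style multimodal message. The standalone sketch below mirrors the logic added above, just to make the two output shapes concrete:

from typing import Dict, List, Union

def create_message(text: str, image_urls: Union[List[str], str] = None, role: str = 'user') -> Dict:
    """Standalone copy of DatasetPluginBase.create_message, for illustration only."""
    if image_urls is None:
        return {'role': role, 'content': text}
    if isinstance(image_urls, str):
        image_urls = [image_urls]
    content = [{'type': 'text', 'text': text}]
    for url in image_urls:
        content.append({'type': 'image_url', 'image_url': {'url': url}})
    return {'role': role, 'content': content}

# Text-only prompt -> content is a plain string
print(create_message('Describe the picture.'))
# {'role': 'user', 'content': 'Describe the picture.'}

# One or more image URLs -> content becomes a list of typed parts
print(create_message('Describe the picture.', image_urls=['data:image/png;base64,AAAA']))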

evalscope/perf/plugin/datasets/custom.py
@@ -19,7 +19,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
              if len(prompt) > self.query_parameters.min_prompt_length and len(
                      prompt) < self.query_parameters.max_prompt_length:
                  if self.query_parameters.apply_chat_template:
-                     yield [{'role': 'user', 'content': prompt}]
+                     message = self.create_message(prompt)
+                     yield [message]
                  else:
                      yield prompt
 

evalscope/perf/plugin/datasets/flickr8k.py
@@ -1,18 +1,9 @@
- import base64
- from io import BytesIO
- from PIL import Image
  from typing import Any, Dict, Iterator, List
 
  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.datasets.base import DatasetPluginBase
  from evalscope.perf.plugin.registry import register_dataset
-
-
- def PIL_to_base64(image: Image.Image) -> str:
-     buffered = BytesIO()
-     image.save(buffered, format='JPEG')
-     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
-     return img_str
+ from evalscope.utils.io_utils import PIL_to_base64
 
 
  @register_dataset('flickr8k')
@@ -31,21 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
          for item in dataset:
              pil_image = item['jpg']
              text = item['txt']
-             base64_iamge = PIL_to_base64(pil_image)
+             base64_image = PIL_to_base64(pil_image)
 
-             yield [{
-                 'role':
-                 'user',
-                 'content': [
-                     {
-                         'type': 'text',
-                         'text': text,
-                     },
-                     {
-                         'type': 'image_url',
-                         'image_url': {
-                             'url': f'data:image/jpeg;base64,{base64_iamge}',
-                         }
-                     },
-                 ],
-             }]
+             message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+             yield [message]

evalscope/perf/plugin/datasets/kontext_bench.py
@@ -0,0 +1,28 @@
+ from typing import Any, Dict, Iterator, List
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+ from evalscope.perf.plugin.registry import register_dataset
+ from evalscope.utils.io_utils import PIL_to_base64
+
+
+ @register_dataset('kontext_bench')
+ class KontextDatasetPlugin(DatasetPluginBase):
+     """Read dataset and return prompt.
+     Datasets: https://modelscope.cn/datasets/black-forest-labs/kontext-bench/dataPeview
+     """
+
+     def __init__(self, query_parameters: Arguments):
+         super().__init__(query_parameters)
+
+     def build_messages(self) -> Iterator[List[Dict]]:
+         from modelscope.msdatasets import MsDataset
+         dataset = MsDataset.load('black-forest-labs/kontext-bench', subset_name='default', split='test')
+
+         for item in dataset:
+             pil_image = item['image']
+             text = item['instruction']
+             base64_image = PIL_to_base64(pil_image)
+
+             message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+             yield [message]

evalscope/perf/plugin/datasets/line_by_line.py
@@ -20,6 +20,7 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
              if len(prompt) > self.query_parameters.min_prompt_length and len(
                      prompt) < self.query_parameters.max_prompt_length:
                  if self.query_parameters.apply_chat_template:
-                     yield [{'role': 'user', 'content': prompt}]
+                     message = self.create_message(prompt)
+                     yield [message]
                  else:
                      yield prompt

evalscope/perf/plugin/datasets/longalpaca.py
@@ -25,6 +25,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
              if len(prompt) > self.query_parameters.min_prompt_length and len(
                      prompt) < self.query_parameters.max_prompt_length:
                  if self.query_parameters.apply_chat_template:
-                     yield [{'role': 'user', 'content': prompt}]
+                     message = self.create_message(prompt)
+                     yield [message]
                  else:
                      yield prompt

evalscope/perf/plugin/datasets/openqa.py
@@ -30,6 +30,7 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
              if (len(prompt) > self.query_parameters.min_prompt_length
                      and len(prompt) < self.query_parameters.max_prompt_length):
                  if self.query_parameters.apply_chat_template:
-                     yield [{'role': 'user', 'content': prompt}]
+                     message = self.create_message(prompt)
+                     yield [message]
                  else:
                      yield prompt

evalscope/perf/plugin/datasets/random_dataset.py
@@ -37,12 +37,23 @@ class RandomDatasetPlugin(DatasetPluginBase):
          input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
          offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
 
+         vocab_size = self.tokenizer.vocab_size
+
          for i in range(self.number):
-             prompt_ids = ((offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size).tolist()
-             prompt = self.tokenizer.decode(self.prefix_ids + prompt_ids)
+             inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % vocab_size).tolist()
+             token_sequence = self.prefix_ids + inner_seq
+             prompt = self.tokenizer.decode(token_sequence)
+
+             # After decoding the prompt we have to encode and decode it again.
+             # This is done because in some cases N consecutive tokens
+             # give a string tokenized into != N number of tokens.
+             total_input_len = self.prefix_length + int(input_lens[i])
+             re_encoded_sequence = self.tokenizer.encode(prompt, add_special_tokens=False)[:total_input_len]
+             prompt = self.tokenizer.decode(re_encoded_sequence)
 
              if self.query_parameters.apply_chat_template:
-                 yield [{'role': 'user', 'content': prompt}]
+                 message = self.create_message(prompt)
+                 yield [message]
              else:
                  yield prompt
 
@@ -53,6 +64,6 @@ class RandomDatasetPlugin(DatasetPluginBase):
          return input_ids
 
      def get_template_len(self):
-         empty_message = [{'role': 'user', 'content': ''}]
+         empty_message = [self.create_message(text='')]
          template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
          return len(template)
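
The added re-encode step exists because decoding N random token ids and re-tokenizing the resulting string does not necessarily yield N tokens again, so the plugin truncates the re-encoded ids to the requested budget before decoding the final prompt. A small illustration of that effect with a Hugging Face tokenizer ('gpt2' is only an example model, not what evalscope uses):

import numpy as np
from transformers import AutoTokenizer  # assumes the transformers package is installed

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # example model only

target_len = 32
ids = np.random.randint(0, tokenizer.vocab_size, size=target_len).tolist()
prompt = tokenizer.decode(ids)

# Re-tokenizing the decoded text can produce a different number of tokens...
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)
print(len(ids), len(re_encoded))  # often not equal for random ids

# ...so truncating the re-encoded ids and decoding once more pins the final
# prompt close to the requested token count, as the plugin now does.
prompt = tokenizer.decode(re_encoded[:target_len])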

evalscope/perf/plugin/datasets/random_vl_dataset.py
@@ -0,0 +1,80 @@
+ import random
+ from PIL import Image, ImageDraw
+ from typing import Dict, Iterator, List
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
+ from evalscope.perf.plugin.registry import register_dataset
+ from evalscope.utils.io_utils import PIL_to_base64
+
+
+ @register_dataset('random_vl')
+ class RandomVLDatasetPlugin(RandomDatasetPlugin):
+     """Random Vision-Language Dataset Plugin for multimodal model stress testing."""
+
+     def __init__(self, query_parameters: Arguments):
+         super().__init__(query_parameters)
+
+         # Vision-language specific parameters
+         self.image_width = query_parameters.image_width
+         self.image_height = query_parameters.image_height
+         self.image_format = query_parameters.image_format
+         self.image_num = query_parameters.image_num
+
+         assert self.image_num > 0, 'image_num must be greater than 0.'
+
+     def build_messages(self) -> Iterator[List[Dict]]:
+         # Reuse parent's message generation logic
+         for messages in super().build_messages():
+             prompt = messages[0]['content'] if isinstance(messages[0], dict) else messages[0]
+
+             # Generate random images based on image_num
+             images_b64 = []
+             for _ in range(self.image_num):
+                 images_b64.append(f'data:image/png;base64,{self._generate_random_image_b64()}')
+
+             message = self.create_message(text=prompt, image_urls=images_b64)
+             yield [message]
+
+     def _generate_random_image_b64(self) -> str:
+         """Generate a random image and return as base64 string."""
+         # Create a random colored image
+         color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+         image = Image.new(self.image_format, (self.image_width, self.image_height), color)
+
+         # Add some random shapes for variety
+         draw = ImageDraw.Draw(image)
+         for _ in range(random.randint(1, 5)):
+             shape_type = random.choice(['rectangle', 'ellipse', 'line'])
+
+             # Generate two random points
+             x1 = random.randint(0, self.image_width - 1)
+             y1 = random.randint(0, self.image_height - 1)
+             x2 = random.randint(0, self.image_width - 1)
+             y2 = random.randint(0, self.image_height - 1)
+
+             # Ensure proper coordinate ordering (x1 <= x2, y1 <= y2)
+             if x1 > x2:
+                 x1, x2 = x2, x1
+             if y1 > y2:
+                 y1, y2 = y2, y1
+
+             # Ensure we have at least a 1-pixel difference
+             if x1 == x2:
+                 x2 = min(x1 + 1, self.image_width - 1)
+             if y1 == y2:
+                 y2 = min(y1 + 1, self.image_height - 1)
+
+             coords = [x1, y1, x2, y2]
+
+             shape_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+
+             if shape_type == 'rectangle':
+                 draw.rectangle(coords, fill=shape_color)
+             elif shape_type == 'ellipse':
+                 draw.ellipse(coords, fill=shape_color)
+             else:
+                 draw.line(coords, fill=shape_color, width=random.randint(1, 5))
+
+         # Convert to base64
+         return PIL_to_base64(image, format='PNG')
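
RandomVLDatasetPlugin pairs each random text prompt with image_num synthetic Pillow images embedded as base64 data URLs. A standalone sketch of just the image-generation step, without the evalscope wiring (the size and the single rectangle are arbitrary choices):

import base64
import random
from io import BytesIO
from PIL import Image, ImageDraw

def random_png_data_url(width: int = 64, height: int = 64) -> str:
    """Build a small random image and return it as a base64 data URL."""
    color = tuple(random.randint(0, 255) for _ in range(3))
    image = Image.new('RGB', (width, height), color)
    draw = ImageDraw.Draw(image)
    # One random rectangle is enough to give the payload some variety.
    x1, x2 = sorted(random.sample(range(width), 2))
    y1, y2 = sorted(random.sample(range(height), 2))
    draw.rectangle([x1, y1, x2, y2], fill=tuple(random.randint(0, 255) for _ in range(3)))
    buffered = BytesIO()
    image.save(buffered, format='PNG')
    return 'data:image/png;base64,' + base64.b64encode(buffered.getvalue()).decode('utf-8')

print(random_png_data_url()[:64], '...')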

evalscope/perf/plugin/registry.py
@@ -1,23 +1,25 @@
- from typing import Any, List, Type, Union
+ from typing import TYPE_CHECKING, Any, List, Type, Union
 
+ if TYPE_CHECKING:
+     from .api import ApiPluginBase
+     from .datasets import DatasetPluginBase
 
- class PluginRegistry:
-
-     def __init__(self):
-         self._registry = {}
 
-     def register(self, name, cls):
-         self._registry[name] = cls
-         return cls
+ class PluginRegistry:
+     _registry = {}
 
-     def get_class(self, name):
-         return self._registry[name]
+     @classmethod
+     def register(cls, name, plugin_cls):
+         cls._registry[name] = plugin_cls
+         return plugin_cls
 
-     def all_classes(self):
-         return list(self._registry.keys())
+     @classmethod
+     def get_class(cls, name):
+         return cls._registry[name]
 
-     def __call__(self, name: str) -> Any:
-         return self.get_class(name)
+     @classmethod
+     def all_classes(cls):
+         return list(cls._registry.keys())
 
 
  def register_dataset(name: Union[str, List[str]]):
@@ -50,5 +52,23 @@ def register_api(name: Union[str, List[str]]):
      return class_decorator
 
 
- DatasetRegistry = PluginRegistry()
- ApiRegistry = PluginRegistry()
+ class DatasetRegistry(PluginRegistry):
+     """Registry for dataset plugins."""
+     _registry = {}
+
+     @classmethod
+     def get_class(cls, name: str) -> Type['DatasetPluginBase']:
+         if name not in cls._registry:
+             raise ValueError(f"Dataset plugin '{name}' is not registered.")
+         return cls._registry[name]
+
+
+ class ApiRegistry(PluginRegistry):
+     """Registry for API plugins."""
+     _registry = {}
+
+     @classmethod
+     def get_class(cls, name: str) -> Type['ApiPluginBase']:
+         if name not in cls._registry:
+             raise ValueError(f"API plugin '{name}' is not registered.")
+         return cls._registry[name]
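
The registries now keep class-level state, and DatasetRegistry / ApiRegistry each hold their own _registry while raising ValueError for unknown names instead of a bare KeyError. A rough usage sketch, assuming register_dataset forwards to DatasetRegistry.register as its name suggests (the 'my_dataset' plugin is hypothetical):

from evalscope.perf.plugin.registry import DatasetRegistry, register_dataset
from evalscope.perf.plugin.datasets.base import DatasetPluginBase

@register_dataset('my_dataset')  # hypothetical plugin name, for illustration only
class MyDatasetPlugin(DatasetPluginBase):

    def build_messages(self):
        yield [self.create_message('hello from my_dataset')]

# Lookup goes through class-level state, so no registry instance is passed around.
plugin_cls = DatasetRegistry.get_class('my_dataset')
print(plugin_cls is MyDatasetPlugin)  # True

# Unknown names now fail with a descriptive error.
try:
    DatasetRegistry.get_class('does_not_exist')
except ValueError as e:
    print(e)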

evalscope/perf/utils/benchmark_util.py
@@ -20,25 +20,24 @@ class BenchmarkData:
      # late init
      query_latency: float = 0.0
      first_chunk_latency: float = 0.0
-     n_chunks: int = 0
-     n_chunks_time: float = 0.0
      max_gpu_memory_cost = 0
      time_per_output_token: float = 0.0
+     inter_chunk_latency: List[float] = field(default_factory=list)
 
      prompt_tokens = None
      completion_tokens = None
 
-     def _calculate_query_stream_metric(self) -> Tuple[float, int, float]:
+     def _calculate_query_stream_metric(self) -> None:
          self.query_latency = self.completed_time - self.start_time
+         # only for stream responses
          if len(self.chunk_times) > 1:
              self.first_chunk_latency = self.chunk_times[0] - self.start_time
-             self.n_chunks = len(self.chunk_times) - 2  # remove last and first chunk
-             self.n_chunks_time = self.chunk_times[-2] - self.chunk_times[0]
+             # remove the first chunk time from the total latency
+             self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (
+                 self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
+             self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
          else:
              self.first_chunk_latency = self.query_latency
-             self.n_chunks = 1
-             self.n_chunks_time = self.query_latency
-         self.time_per_output_token = self.n_chunks_time / self.n_chunks if self.n_chunks != 0 else 0.0
 
      def _calculate_tokens(self, api_plugin):
          self.prompt_tokens, self.completion_tokens = \
@@ -63,10 +62,9 @@ class Metrics:
      AVERAGE_LATENCY = 'Average latency (s)'
      AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
      AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+     AVERAGE_INTER_TOKEN_LATENCY = 'Average inter-token latency (s)'
      AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
      AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
-     AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
-     AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
 
 
  @dataclass
@@ -76,25 +74,23 @@ class BenchmarkMetrics:
      n_failed_queries: int = 0
      total_first_chunk_latency: float = 0.0
      total_latency: float = 0.0
-     n_total_chunks: int = 0
      n_total_prompt_tokens: int = 0
      n_total_completion_tokens: int = 0
-     total_chunks_time: float = 0.0
      start_time: Optional[float] = None
      total_time: float = 1.0
      n_total_queries: int = 0
      n_time_per_output_token: float = 0.0
+     n_total_inter_token_latency: List[float] = field(default_factory=list)
 
      avg_first_chunk_latency: float = -1
      avg_latency: float = -1
-     n_avg_chunks: float = -1
-     avg_chunk_time: float = -1
      avg_prompt_tokens: float = -1
      avg_completion_tokens: float = -1
      avg_input_token_per_seconds: float = -1
      avg_output_token_per_seconds: float = -1
      avg_total_token_per_seconds: float = -1
      avg_time_per_token: float = -1
+     avg_inter_token_latency: float = -1
      qps: float = -1
 
      def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
@@ -113,9 +109,8 @@
              benchmark_data._calculate_query_stream_metric()
              self.total_latency += benchmark_data.query_latency
              self.total_first_chunk_latency += benchmark_data.first_chunk_latency
-             self.n_total_chunks += benchmark_data.n_chunks
-             self.total_chunks_time += benchmark_data.n_chunks_time
              self.n_time_per_output_token += benchmark_data.time_per_output_token
+             self.n_total_inter_token_latency += benchmark_data.inter_chunk_latency
          else:
              self.n_failed_queries += 1
 
@@ -127,8 +122,6 @@
          try:
              self.avg_first_chunk_latency = self.total_first_chunk_latency / self.n_succeed_queries
              self.avg_latency = self.total_latency / self.n_succeed_queries
-             self.n_avg_chunks = self.n_total_chunks / self.n_succeed_queries
-             self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
              self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
              self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
              self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
@@ -136,6 +129,8 @@
              self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
                                                  + self.n_total_completion_tokens) / self.total_time
              self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
+             self.avg_inter_token_latency = sum(self.n_total_inter_token_latency) / len(
+                 self.n_total_inter_token_latency) if self.n_total_inter_token_latency else 0.0
              self.qps = self.n_succeed_queries / self.total_time
          except ZeroDivisionError as e:
              logger.exception(e)
@@ -154,9 +149,8 @@
              Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
              Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
              Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+             Metrics.AVERAGE_INTER_TOKEN_LATENCY: round(self.avg_inter_token_latency, default_ndigits),
              Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
              Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
-             Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
-             Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
          }
          return message
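
In place of the removed chunk-count bookkeeping, each request now records time_per_output_token (the span after the first chunk divided by completion_tokens - 1) and inter_chunk_latency (gaps between consecutive chunk arrivals), and the report averages the pooled gaps as 'Average inter-token latency (s)'. A small worked example of both formulas (all numbers made up):

# Hypothetical timestamps (seconds) for one streamed request.
start_time = 0.00
chunk_times = [0.30, 0.35, 0.42, 0.50]   # arrival time of each SSE chunk
completed_time = 0.50
completion_tokens = 21

query_latency = completed_time - start_time          # 0.50
first_chunk_latency = chunk_times[0] - start_time    # 0.30 (time to first token)

# Time per output token: everything after the first chunk, spread over the
# remaining completion tokens.
time_per_output_token = (query_latency - first_chunk_latency) / (completion_tokens - 1)
print(round(time_per_output_token, 4))               # 0.01

# Inter-chunk latency: gaps between consecutive chunks; these lists are pooled
# across requests and averaged for 'Average inter-token latency (s)'.
inter_chunk_latency = [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
print([round(x, 2) for x in inter_chunk_latency])     # [0.05, 0.07, 0.08]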