evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/data_adapter.py +9 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/utils.py +1 -0
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +5 -3
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/completion_parsers.py +7 -0
- evalscope/metrics/llm_judge.py +6 -5
- evalscope/metrics/metrics.py +19 -7
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +2 -0
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +13 -0
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +2 -2
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/utils/io_utils.py +10 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
- tests/cli/test_all.py +18 -2
- tests/cli/test_run.py +25 -37
- tests/perf/test_perf.py +29 -2
- evalscope/models/model.py +0 -189
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/api/default_api.py
@@ -0,0 +1,105 @@
+import aiohttp
+import json
+from http import HTTPStatus
+from typing import Any, AsyncGenerator, Dict, List, Tuple
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.api.base import ApiPluginBase
+from evalscope.perf.utils.local_server import ServerSentEvent
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class DefaultApiPlugin(ApiPluginBase):
+    """Default implementation of API plugin with common HTTP handling methods."""
+
+    def __init__(self, param: Arguments):
+        super().__init__(param)
+
+    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Process the HTTP request and handle the response.
+
+        Args:
+            client_session: The aiohttp client session
+            url: The request URL
+            headers: The request headers
+            body: The request body
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
+        try:
+            headers = {'Content-Type': 'application/json', **headers}
+            data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
+            async with client_session.request('POST', url=url, data=data, headers=headers) as response:
+                async for result in self._handle_response(response):
+                    yield result
+        except Exception as e:
+            logger.error(f'Error in process_request: {e}')
+            yield (True, None, str(e))
+
+    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Handle streaming response from server-sent events.
+
+        Args:
+            response: The aiohttp response object containing a stream
+
+        Yields:
+            Tuple[bool, int, Any]: (is_error, status_code, data)
+        """
+        try:
+            async for chunk_bytes in response.content:
+                chunk_bytes = chunk_bytes.strip()
+                if not chunk_bytes:
+                    continue
+                chunk_bytes = chunk_bytes.decode('utf-8')
+                # NOTE: SSE comments (often used as pings) start with a colon.
+                # These are not JSON data payload and should be skipped.
+                if chunk_bytes.startswith(':'):
+                    continue
+
+                chunk = chunk_bytes.removeprefix('data: ')
+
+                if chunk != '[DONE]':
+                    data = json.loads(chunk)
+
+                    yield False, response.status, data
+
+        except Exception as e:
+            logger.error(f'Error in _handle_stream: {e}')
+            yield True, response.status, str(e)
+
+    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Handle the HTTP response based on content type and status.
+
+        Args:
+            response: The aiohttp response object
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
+        response_status = response.status
+        response_content_type = response.content_type
+        content_type_json = 'application/json'
+        content_type_stream = 'text/event-stream'
+        is_success = (response_status == HTTPStatus.OK)
+
+        if is_success:
+            # Handle successful response with 'text/event-stream' content type
+            if content_type_stream in response_content_type:
+                async for is_error, response_status, content in self._handle_stream(response):
+                    yield (is_error, response_status, content)
+            # Handle successful response with 'application/json' content type
+            elif content_type_json in response_content_type:
+                content = await response.json()
+                yield (False, response_status, json.dumps(content, ensure_ascii=False))
+            # Handle other successful responses
+            else:
+                content = await response.read()
+                yield (False, response_status, content.decode('utf-8'))
+        else:
+            # error is always in JSON format
+            error = await response.json()
+            yield (True, response_status, json.dumps(error, ensure_ascii=False))
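For readers unfamiliar with the server-sent-events handling that _handle_stream implements, the following standalone aiohttp sketch follows the same pattern; the endpoint URL and request payload are hypothetical, and this is not the evalscope class itself.

import asyncio
import json

import aiohttp


async def read_sse(url: str, body: dict):
    # Mirror of the stream handling above: skip blank lines and ':' comments,
    # strip the 'data: ' prefix, and stop decoding JSON at '[DONE]'.
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=body) as resp:
            async for raw in resp.content:
                line = raw.strip().decode('utf-8')
                if not line or line.startswith(':'):
                    continue
                chunk = line.removeprefix('data: ')
                if chunk != '[DONE]':
                    yield json.loads(chunk)


async def main():
    # Hypothetical OpenAI-compatible endpoint; adjust to your own server.
    body = {'model': 'my-model', 'messages': [{'role': 'user', 'content': 'hi'}], 'stream': True}
    async for event in read_sse('http://127.0.0.1:8000/v1/chat/completions', body):
        delta = event.get('choices', [{}])[0].get('delta', {})
        print(delta.get('content', ''), end='')


if __name__ == '__main__':
    asyncio.run(main())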
evalscope/perf/plugin/api/openai_api.py
@@ -1,9 +1,9 @@
 import json
 import os
-from typing import Any, Dict,
+from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
-from evalscope.perf.plugin.api.
+from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
 from evalscope.utils.logger import get_logger
 
@@ -11,25 +11,25 @@ logger = get_logger()
 
 
 @register_api(['openai', 'local_vllm', 'local'])
-class OpenaiPlugin(
+class OpenaiPlugin(DefaultApiPlugin):
     """Base of openai interface."""
 
-    def __init__(self,
-        """
+    def __init__(self, param: Arguments):
+        """Initialize the OpenaiPlugin.
 
         Args:
-
-
-
+            param (Arguments): Configuration object containing parameters
+                such as the tokenizer path and model details. If a tokenizer
+                path is provided, it is used to initialize the tokenizer.
         """
-        super().__init__(
-        if
+        super().__init__(param=param)
+        if param.tokenizer_path is not None:
             from modelscope import AutoTokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(
+            self.tokenizer = AutoTokenizer.from_pretrained(param.tokenizer_path)
         else:
            self.tokenizer = None
 
-    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments = None) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
@@ -42,6 +42,7 @@ class OpenaiPlugin(ApiPluginBase):
         Returns:
             Dict: The request body. None if prompt format is error.
         """
+        param = param or self.param
        try:
            if param.query_template is not None:
                if param.query_template.startswith('@'):
@@ -54,8 +55,6 @@ class OpenaiPlugin(ApiPluginBase):
                else:
                    query = json.loads(param.query_template)
 
-                    if 'stream' in query.keys():
-                        param.stream = query['stream']
                    # replace template messages with input messages.
                    query['messages'] = messages
            elif isinstance(messages, str):
@@ -107,7 +106,7 @@ class OpenaiPlugin(ApiPluginBase):
 
        # when stream, the last response is the full usage
        # when non-stream, the last response is the first response
-        last_response_js =
+        last_response_js = responses[-1]
        if 'usage' in last_response_js and last_response_js['usage']:
            input_tokens = last_response_js['usage']['prompt_tokens']
            output_tokens = last_response_js['usage']['completion_tokens']
@@ -116,11 +115,10 @@ class OpenaiPlugin(ApiPluginBase):
        # no usage information in the response, parse the response to get the tokens
        delta_contents = {}
        for response in responses:
-
-
-                self.__process_response_object(js, delta_contents)
+            if 'object' in response:
+                self.__process_response_object(response, delta_contents)
            else:
-                self.__process_no_object(
+                self.__process_no_object(response, delta_contents)
 
        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
        return input_tokens, output_tokens
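As a small illustration of the query_template path kept in these hunks, the template is parsed as JSON and its messages field is then overwritten with the messages produced by the dataset plugin; the template contents below are made up.

import json

# Hypothetical template; with the change above, a 'stream' key in the template
# no longer overrides param.stream.
query_template = '{"model": "my-model", "stream": true, "messages": []}'
messages = [{'role': 'user', 'content': 'hello'}]

query = json.loads(query_template)
query['messages'] = messages  # replace template messages with input messages
print(json.dumps(query, ensure_ascii=False))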
evalscope/perf/plugin/datasets/__init__.py
@@ -1,7 +1,10 @@
-from
-from
-from
-from
-from
-from
-from
+from .base import DatasetPluginBase
+from .custom import CustomDatasetPlugin
+from .flickr8k import FlickrDatasetPlugin
+from .kontext_bench import KontextDatasetPlugin
+from .line_by_line import LineByLineDatasetPlugin
+from .longalpaca import LongAlpacaDatasetPlugin
+from .openqa import OpenqaDatasetPlugin
+from .random_dataset import RandomDatasetPlugin
+from .random_vl_dataset import RandomVLDatasetPlugin
+from .speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/base.py
@@ -1,7 +1,7 @@
 import json
 import sys
 from abc import abstractmethod
-from typing import Any, Dict, Iterator, List, Tuple
+from typing import Any, Dict, Iterator, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 
@@ -64,3 +64,24 @@ class DatasetPluginBase:
            data = json.loads(content)
            for item in data:
                yield item
+
+    def create_message(self, text: str, image_urls: Union[List[str], str] = None, role: str = 'user') -> Dict:
+        """Create a message with text and optional image URLs.
+
+        Args:
+            text (str): The text content of the message.
+            image_urls (List[str], optional): List of image URLs. Defaults to None.
+            role (str, optional): The role of the message sender. Defaults to "user".
+
+        Returns:
+            Dict: A dictionary representing the message.
+        """
+        if image_urls is None:
+            message = {'role': role, 'content': text}
+        else:
+            message = {'role': role, 'content': [{'type': 'text', 'text': text}]}
+            if isinstance(image_urls, str):
+                image_urls = [image_urls]
+            for url in image_urls:
+                message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
+        return message
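To make the new helper concrete, the messages it builds follow the OpenAI chat format; the standalone sketch below reproduces the same shapes (it mirrors the method added above rather than importing the evalscope class).

from typing import Dict, List, Union


def create_message(text: str, image_urls: Union[List[str], str] = None, role: str = 'user') -> Dict:
    # Same logic as DatasetPluginBase.create_message in the hunk above.
    if image_urls is None:
        return {'role': role, 'content': text}
    if isinstance(image_urls, str):
        image_urls = [image_urls]
    content = [{'type': 'text', 'text': text}]
    for url in image_urls:
        content.append({'type': 'image_url', 'image_url': {'url': url}})
    return {'role': role, 'content': content}


print(create_message('Hello'))
# {'role': 'user', 'content': 'Hello'}
print(create_message('Describe this image.', 'data:image/jpeg;base64,...'))
# {'role': 'user', 'content': [{'type': 'text', 'text': 'Describe this image.'},
#                              {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,...'}}]}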
evalscope/perf/plugin/datasets/custom.py
@@ -19,7 +19,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
            if len(prompt) > self.query_parameters.min_prompt_length and len(
                    prompt) < self.query_parameters.max_prompt_length:
                if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                else:
                    yield prompt
 
evalscope/perf/plugin/datasets/flickr8k.py
@@ -1,18 +1,9 @@
-import base64
-from io import BytesIO
-from PIL import Image
 from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
-
-
-def PIL_to_base64(image: Image.Image) -> str:
-    buffered = BytesIO()
-    image.save(buffered, format='JPEG')
-    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
-    return img_str
+from evalscope.utils.io_utils import PIL_to_base64
 
 
 @register_dataset('flickr8k')
@@ -31,21 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
        for item in dataset:
            pil_image = item['jpg']
            text = item['txt']
-
+            base64_image = PIL_to_base64(pil_image)
 
-
-
-                'user',
-                'content': [
-                    {
-                        'type': 'text',
-                        'text': text,
-                    },
-                    {
-                        'type': 'image_url',
-                        'image_url': {
-                            'url': f'data:image/jpeg;base64,{base64_iamge}',
-                        }
-                    },
-                ],
-            }]
+            message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+            yield [message]
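The inline JPEG helper removed here now lives in evalscope/utils/io_utils.py (the +10 lines in the file list). Its exact implementation is not shown in this diff; a plausible sketch, combining the removed code with the format argument used by the new random_vl plugin, would be:

import base64
from io import BytesIO

from PIL import Image


def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
    # Assumed shape of the relocated helper: encode a PIL image to base64 (no data-URI prefix).
    buffered = BytesIO()
    image.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


img = Image.new('RGB', (64, 64), (200, 30, 30))
print(PIL_to_base64(img, format='PNG')[:32] + '...')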
evalscope/perf/plugin/datasets/kontext_bench.py
@@ -0,0 +1,28 @@
+from typing import Any, Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('kontext_bench')
+class KontextDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    Datasets: https://modelscope.cn/datasets/black-forest-labs/kontext-bench/dataPeview
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        from modelscope.msdatasets import MsDataset
+        dataset = MsDataset.load('black-forest-labs/kontext-bench', subset_name='default', split='test')
+
+        for item in dataset:
+            pil_image = item['image']
+            text = item['instruction']
+            base64_image = PIL_to_base64(pil_image)
+
+            message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+            yield [message]
evalscope/perf/plugin/datasets/line_by_line.py
@@ -20,6 +20,7 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
            if len(prompt) > self.query_parameters.min_prompt_length and len(
                    prompt) < self.query_parameters.max_prompt_length:
                if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                else:
                    yield prompt
evalscope/perf/plugin/datasets/longalpaca.py
@@ -25,6 +25,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
            if len(prompt) > self.query_parameters.min_prompt_length and len(
                    prompt) < self.query_parameters.max_prompt_length:
                if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                else:
                    yield prompt
evalscope/perf/plugin/datasets/openqa.py
@@ -30,6 +30,7 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
            if (len(prompt) > self.query_parameters.min_prompt_length
                    and len(prompt) < self.query_parameters.max_prompt_length):
                if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                else:
                    yield prompt
evalscope/perf/plugin/datasets/random_dataset.py
@@ -37,12 +37,23 @@ class RandomDatasetPlugin(DatasetPluginBase):
        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
 
+        vocab_size = self.tokenizer.vocab_size
+
        for i in range(self.number):
-
-
+            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % vocab_size).tolist()
+            token_sequence = self.prefix_ids + inner_seq
+            prompt = self.tokenizer.decode(token_sequence)
+
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            total_input_len = self.prefix_length + int(input_lens[i])
+            re_encoded_sequence = self.tokenizer.encode(prompt, add_special_tokens=False)[:total_input_len]
+            prompt = self.tokenizer.decode(re_encoded_sequence)
 
            if self.query_parameters.apply_chat_template:
-
+                message = self.create_message(prompt)
+                yield [message]
            else:
                yield prompt
 
@@ -53,6 +64,6 @@ class RandomDatasetPlugin(DatasetPluginBase):
        return input_ids
 
    def get_template_len(self):
-        empty_message = [
+        empty_message = [self.create_message(text='')]
        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
        return len(template)
evalscope/perf/plugin/datasets/random_vl_dataset.py
@@ -0,0 +1,80 @@
+import random
+from PIL import Image, ImageDraw
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('random_vl')
+class RandomVLDatasetPlugin(RandomDatasetPlugin):
+    """Random Vision-Language Dataset Plugin for multimodal model stress testing."""
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+        # Vision-language specific parameters
+        self.image_width = query_parameters.image_width
+        self.image_height = query_parameters.image_height
+        self.image_format = query_parameters.image_format
+        self.image_num = query_parameters.image_num
+
+        assert self.image_num > 0, 'image_num must be greater than 0.'
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        # Reuse parent's message generation logic
+        for messages in super().build_messages():
+            prompt = messages[0]['content'] if isinstance(messages[0], dict) else messages[0]
+
+            # Generate random images based on image_num
+            images_b64 = []
+            for _ in range(self.image_num):
+                images_b64.append(f'data:image/png;base64,{self._generate_random_image_b64()}')
+
+            message = self.create_message(text=prompt, image_urls=images_b64)
+            yield [message]
+
+    def _generate_random_image_b64(self) -> str:
+        """Generate a random image and return as base64 string."""
+        # Create a random colored image
+        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+        image = Image.new(self.image_format, (self.image_width, self.image_height), color)
+
+        # Add some random shapes for variety
+        draw = ImageDraw.Draw(image)
+        for _ in range(random.randint(1, 5)):
+            shape_type = random.choice(['rectangle', 'ellipse', 'line'])
+
+            # Generate two random points
+            x1 = random.randint(0, self.image_width - 1)
+            y1 = random.randint(0, self.image_height - 1)
+            x2 = random.randint(0, self.image_width - 1)
+            y2 = random.randint(0, self.image_height - 1)
+
+            # Ensure proper coordinate ordering (x1 <= x2, y1 <= y2)
+            if x1 > x2:
+                x1, x2 = x2, x1
+            if y1 > y2:
+                y1, y2 = y2, y1
+
+            # Ensure we have at least a 1-pixel difference
+            if x1 == x2:
+                x2 = min(x1 + 1, self.image_width - 1)
+            if y1 == y2:
+                y2 = min(y1 + 1, self.image_height - 1)
+
+            coords = [x1, y1, x2, y2]
+
+            shape_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+
+            if shape_type == 'rectangle':
+                draw.rectangle(coords, fill=shape_color)
+            elif shape_type == 'ellipse':
+                draw.ellipse(coords, fill=shape_color)
+            else:
+                draw.line(coords, fill=shape_color, width=random.randint(1, 5))
+
+        # Convert to base64
+        return PIL_to_base64(image, format='PNG')
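Since every request embeds its images as base64 data URIs, payload size grows quickly with the image dimensions and image_num. A rough standalone estimate (illustrative sizes only, not the plugin's defaults):

import base64
import os
from io import BytesIO

from PIL import Image

# Worst-case-ish payload: random pixels barely compress, so the PNG stays large.
noise = Image.frombytes('RGB', (512, 512), os.urandom(512 * 512 * 3))
buf = BytesIO()
noise.save(buf, format='PNG')
print(f'~{len(base64.b64encode(buf.getvalue())) / 1024:.0f} KiB of base64 for one 512x512 noise image')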
evalscope/perf/plugin/registry.py
@@ -1,23 +1,25 @@
-from typing import Any, List, Type, Union
+from typing import TYPE_CHECKING, Any, List, Type, Union
 
+if TYPE_CHECKING:
+    from .api import ApiPluginBase
+    from .datasets import DatasetPluginBase
 
-class PluginRegistry:
-
-    def __init__(self):
-        self._registry = {}
 
-
-
-        return cls
+class PluginRegistry:
+    _registry = {}
 
-
-
+    @classmethod
+    def register(cls, name, plugin_cls):
+        cls._registry[name] = plugin_cls
+        return plugin_cls
 
-
-
+    @classmethod
+    def get_class(cls, name):
+        return cls._registry[name]
 
-
-
+    @classmethod
+    def all_classes(cls):
+        return list(cls._registry.keys())
 
 
 def register_dataset(name: Union[str, List[str]]):
@@ -50,5 +52,23 @@ def register_api(name: Union[str, List[str]]):
    return class_decorator
 
 
-DatasetRegistry
-
+class DatasetRegistry(PluginRegistry):
+    """Registry for dataset plugins."""
+    _registry = {}
+
+    @classmethod
+    def get_class(cls, name: str) -> Type['DatasetPluginBase']:
+        if name not in cls._registry:
+            raise ValueError(f"Dataset plugin '{name}' is not registered.")
+        return cls._registry[name]
+
+
+class ApiRegistry(PluginRegistry):
+    """Registry for API plugins."""
+    _registry = {}
+
+    @classmethod
+    def get_class(cls, name: str) -> Type['ApiPluginBase']:
+        if name not in cls._registry:
+            raise ValueError(f"API plugin '{name}' is not registered.")
+        return cls._registry[name]
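A quick illustration of why each subclass redefines _registry = {}: without it, item assignment through a classmethod mutates the dict inherited from PluginRegistry, so all registries would share one namespace. The subclasses below are hypothetical, not the evalscope ones.

class PluginRegistry:
    _registry = {}

    @classmethod
    def register(cls, name, plugin_cls):
        cls._registry[name] = plugin_cls
        return plugin_cls


class SharedRegistry(PluginRegistry):
    pass  # no own _registry: writes land in PluginRegistry._registry


class OwnRegistry(PluginRegistry):
    _registry = {}  # isolated, like DatasetRegistry and ApiRegistry above


SharedRegistry.register('a', object)
OwnRegistry.register('b', object)
print(PluginRegistry._registry)  # {'a': <class 'object'>}
print(OwnRegistry._registry)     # {'b': <class 'object'>}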
evalscope/perf/utils/benchmark_util.py
@@ -20,25 +20,24 @@ class BenchmarkData:
    # late init
    query_latency: float = 0.0
    first_chunk_latency: float = 0.0
-    n_chunks: int = 0
-    n_chunks_time: float = 0.0
    max_gpu_memory_cost = 0
    time_per_output_token: float = 0.0
+    inter_chunk_latency: List[float] = field(default_factory=list)
 
    prompt_tokens = None
    completion_tokens = None
 
-    def _calculate_query_stream_metric(self) ->
+    def _calculate_query_stream_metric(self) -> None:
        self.query_latency = self.completed_time - self.start_time
+        # only for stream responses
        if len(self.chunk_times) > 1:
            self.first_chunk_latency = self.chunk_times[0] - self.start_time
-
-            self.
+            # remove the first chunk time from the total latency
+            self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (
+                self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
+            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
        else:
            self.first_chunk_latency = self.query_latency
-            self.n_chunks = 1
-            self.n_chunks_time = self.query_latency
-        self.time_per_output_token = self.n_chunks_time / self.n_chunks if self.n_chunks != 0 else 0.0
 
    def _calculate_tokens(self, api_plugin):
        self.prompt_tokens, self.completion_tokens = \
@@ -63,10 +62,9 @@ class Metrics:
    AVERAGE_LATENCY = 'Average latency (s)'
    AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
    AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+    AVERAGE_INTER_TOKEN_LATENCY = 'Average inter-token latency (s)'
    AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
    AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
-    AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
-    AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
 
 
 @dataclass
@@ -76,25 +74,23 @@ class BenchmarkMetrics:
    n_failed_queries: int = 0
    total_first_chunk_latency: float = 0.0
    total_latency: float = 0.0
-    n_total_chunks: int = 0
    n_total_prompt_tokens: int = 0
    n_total_completion_tokens: int = 0
-    total_chunks_time: float = 0.0
    start_time: Optional[float] = None
    total_time: float = 1.0
    n_total_queries: int = 0
    n_time_per_output_token: float = 0.0
+    n_total_inter_token_latency: List[float] = field(default_factory=list)
 
    avg_first_chunk_latency: float = -1
    avg_latency: float = -1
-    n_avg_chunks: float = -1
-    avg_chunk_time: float = -1
    avg_prompt_tokens: float = -1
    avg_completion_tokens: float = -1
    avg_input_token_per_seconds: float = -1
    avg_output_token_per_seconds: float = -1
    avg_total_token_per_seconds: float = -1
    avg_time_per_token: float = -1
+    avg_inter_token_latency: float = -1
    qps: float = -1
 
    def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
@@ -113,9 +109,8 @@ class BenchmarkMetrics:
            benchmark_data._calculate_query_stream_metric()
            self.total_latency += benchmark_data.query_latency
            self.total_first_chunk_latency += benchmark_data.first_chunk_latency
-            self.n_total_chunks += benchmark_data.n_chunks
-            self.total_chunks_time += benchmark_data.n_chunks_time
            self.n_time_per_output_token += benchmark_data.time_per_output_token
+            self.n_total_inter_token_latency += benchmark_data.inter_chunk_latency
        else:
            self.n_failed_queries += 1
 
@@ -127,8 +122,6 @@ class BenchmarkMetrics:
        try:
            self.avg_first_chunk_latency = self.total_first_chunk_latency / self.n_succeed_queries
            self.avg_latency = self.total_latency / self.n_succeed_queries
-            self.n_avg_chunks = self.n_total_chunks / self.n_succeed_queries
-            self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
            self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
            self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
            self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
@@ -136,6 +129,8 @@ class BenchmarkMetrics:
            self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
                                                + self.n_total_completion_tokens) / self.total_time
            self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
+            self.avg_inter_token_latency = sum(self.n_total_inter_token_latency) / len(
+                self.n_total_inter_token_latency) if self.n_total_inter_token_latency else 0.0
            self.qps = self.n_succeed_queries / self.total_time
        except ZeroDivisionError as e:
            logger.exception(e)
@@ -154,9 +149,8 @@ class BenchmarkMetrics:
            Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
            Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
            Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+            Metrics.AVERAGE_INTER_TOKEN_LATENCY: round(self.avg_inter_token_latency, default_ndigits),
            Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
            Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
-            Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
-            Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
        }
        return message