evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (100)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  4. evalscope/api/benchmark/benchmark.py +14 -0
  5. evalscope/api/dataset/dataset.py +21 -0
  6. evalscope/api/dataset/loader.py +6 -2
  7. evalscope/api/mixin/sandbox_mixin.py +32 -54
  8. evalscope/api/model/generate_config.py +6 -0
  9. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  10. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  11. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  13. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  16. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  17. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  18. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  20. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  21. evalscope/benchmarks/math_verse/__init__.py +0 -0
  22. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  23. evalscope/benchmarks/math_vision/__init__.py +0 -0
  24. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  25. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  27. evalscope/benchmarks/ner/__init__.py +0 -0
  28. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  29. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  30. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  31. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  32. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  33. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  34. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  35. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  36. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  37. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  38. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  39. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  40. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  41. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  42. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  43. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  44. evalscope/benchmarks/ocr_bench_v2/utils.py +1 -0
  45. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  46. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  47. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  48. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  49. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  50. evalscope/benchmarks/poly_math/__init__.py +0 -0
  51. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  52. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  53. evalscope/benchmarks/pope/__init__.py +0 -0
  54. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  55. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  56. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  57. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  58. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  59. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  60. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  61. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  62. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  63. evalscope/benchmarks/zerobench/__init__.py +0 -0
  64. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  65. evalscope/constants.py +4 -0
  66. evalscope/evaluator/evaluator.py +72 -79
  67. evalscope/metrics/math_parser.py +14 -0
  68. evalscope/metrics/metric.py +1 -1
  69. evalscope/models/utils/openai.py +4 -0
  70. evalscope/perf/arguments.py +24 -4
  71. evalscope/perf/benchmark.py +74 -89
  72. evalscope/perf/http_client.py +31 -16
  73. evalscope/perf/main.py +15 -2
  74. evalscope/perf/plugin/api/base.py +9 -7
  75. evalscope/perf/plugin/api/custom_api.py +13 -58
  76. evalscope/perf/plugin/api/default_api.py +179 -79
  77. evalscope/perf/plugin/api/openai_api.py +4 -3
  78. evalscope/perf/plugin/datasets/base.py +21 -0
  79. evalscope/perf/plugin/datasets/custom.py +2 -3
  80. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  81. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  82. evalscope/perf/plugin/datasets/openqa.py +2 -4
  83. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  84. evalscope/perf/utils/benchmark_util.py +36 -22
  85. evalscope/perf/utils/db_util.py +14 -19
  86. evalscope/perf/utils/local_server.py +0 -44
  87. evalscope/perf/utils/log_utils.py +21 -6
  88. evalscope/report/__init__.py +2 -1
  89. evalscope/run.py +4 -0
  90. evalscope/utils/function_utils.py +195 -12
  91. evalscope/utils/io_utils.py +74 -0
  92. evalscope/utils/logger.py +49 -17
  93. evalscope/utils/ner.py +377 -0
  94. evalscope/version.py +2 -2
  95. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/METADATA +235 -363
  96. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/RECORD +100 -55
  97. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  98. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  99. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
  100. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -0

evalscope/perf/plugin/api/default_api.py
@@ -1,24 +1,68 @@
 import aiohttp
 import json
-from http import HTTPStatus
-from typing import Any, AsyncGenerator, Dict, List, Tuple
+import sys
+import time
+import traceback
+from typing import Any, Dict
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
-from evalscope.perf.utils.local_server import ServerSentEvent
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
+class StreamedResponseHandler:
+    """Handles streaming HTTP responses by accumulating chunks until complete
+    messages are available."""
+
+    def __init__(self):
+        self.buffer = ''
+
+    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
+        """Add a chunk of bytes to the buffer and return any complete
+        messages."""
+        chunk_str = chunk_bytes.decode('utf-8')
+        self.buffer += chunk_str
+
+        messages = []
+
+        # Split by double newlines (SSE message separator)
+        while '\n\n' in self.buffer:
+            message, self.buffer = self.buffer.split('\n\n', 1)
+            message = message.strip()
+            if message:
+                messages.append(message)
+
+        # if self.buffer is not empty, check if it is a complete message
+        # by removing data: prefix and check if it is a valid JSON
+        if self.buffer.startswith('data: '):
+            message_content = self.buffer.removeprefix('data: ').strip()
+            if message_content == '[DONE]':
+                messages.append(self.buffer.strip())
+                self.buffer = ''
+            elif message_content:
+                try:
+                    json.loads(message_content)
+                    messages.append(self.buffer.strip())
+                    self.buffer = ''
+                except json.JSONDecodeError:
+                    # Incomplete JSON, wait for more chunks.
+                    pass
+
+        return messages
+
+
 class DefaultApiPlugin(ApiPluginBase):
     """Default implementation of API plugin with common HTTP handling methods."""
 
     def __init__(self, param: Arguments):
         super().__init__(param)
 
-    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -27,79 +71,135 @@ class DefaultApiPlugin(ApiPluginBase):
             headers: The request headers
             body: The request body
 
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, response_data)
-        """
-        try:
-            headers = {'Content-Type': 'application/json', **headers}
-            data = json.dumps(body, ensure_ascii=False) # serialize to JSON
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:
-                async for result in self._handle_response(response):
-                    yield result
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, None, str(e))
-
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
-        """Handle streaming response from server-sent events.
-
-        Args:
-            response: The aiohttp response object containing a stream
-
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, data)
+        Returns:
+            BenchmarkData: Aggregated benchmarking data for the request/response.
         """
+        headers = {'Content-Type': 'application/json', **headers}
+        data = json.dumps(body, ensure_ascii=False) # serialize to JSON
+
+        output = BenchmarkData()
+        ttft = 0.0
+        generated_text = ''
+        st = time.perf_counter()
+        output.start_time = st
+        output.request = data
+        most_recent_timestamp = st
         try:
-            async for chunk_bytes in response.content:
-                chunk_bytes = chunk_bytes.strip()
-                if not chunk_bytes:
-                    continue
-                chunk_bytes = chunk_bytes.decode('utf-8')
-                # NOTE: SSE comments (often used as pings) start with a colon.
-                # These are not JSON data payload and should be skipped.
-                if chunk_bytes.startswith(':'):
-                    continue
-
-                chunk = chunk_bytes.removeprefix('data: ')
-
-                if chunk != '[DONE]':
-                    data = json.loads(chunk)
-
-                    yield False, response.status, data
-
-        except Exception as e:
-            logger.error(f'Error in _handle_stream: {e}')
-            yield True, response.status, str(e)
-
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
-        """Handle the HTTP response based on content type and status.
-
-        Args:
-            response: The aiohttp response object
-
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, response_data)
-        """
-        response_status = response.status
-        response_content_type = response.content_type
-        content_type_json = 'application/json'
-        content_type_stream = 'text/event-stream'
-        is_success = (response_status == HTTPStatus.OK)
-
-        if is_success:
-            # Handle successful response with 'text/event-stream' content type
-            if content_type_stream in response_content_type:
-                async for is_error, response_status, content in self._handle_stream(response):
-                    yield (is_error, response_status, content)
-            # Handle successful response with 'application/json' content type
-            elif content_type_json in response_content_type:
-                content = await response.json()
-                yield (False, response_status, content)
-            # Handle other successful responses
-            else:
-                content = await response.read()
-                yield (False, response_status, content.decode('utf-8'))
-        else:
-            # error is always in JSON format
-            error = await response.json()
-            yield (True, response_status, error)
+            async with client_session.post(url=url, data=data, headers=headers) as response:
+                content_type = response.headers.get('Content-Type', '')
+                if response.status == 200:
+                    # Handle streaming responses (SSE)
+                    if 'text/event-stream' in content_type:
+                        handler = StreamedResponseHandler()
+                        async for chunk_bytes in response.content.iter_any():
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            messages = handler.add_chunk(chunk_bytes)
+                            for message in messages:
+                                # NOTE: SSE comments (often used as pings) start with
+                                # a colon. These are not JSON data payload and should
+                                # be skipped.
+                                if message.startswith(':'):
+                                    continue
+
+                                chunk = message.removeprefix('data: ')
+
+                                if chunk != '[DONE]':
+                                    timestamp = time.perf_counter()
+                                    data = json.loads(chunk)
+
+                                    if choices := data.get('choices'):
+                                        content = choices[0]['delta'].get('content')
+                                        # First token
+                                        if ttft == 0.0:
+                                            ttft = timestamp - st
+                                            output.first_chunk_latency = ttft
+
+                                        # Decoding phase
+                                        else:
+                                            output.inter_chunk_latency.append(timestamp - most_recent_timestamp)
+
+                                        generated_text += content or ''
+                                        output.response_messages.append(data)
+                                    elif usage := data.get('usage'):
+                                        output.prompt_tokens = usage.get('prompt_tokens')
+                                        output.completion_tokens = usage.get('completion_tokens')
+
+                                    most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.completed_time = most_recent_timestamp
+                        output.query_latency = most_recent_timestamp - st
+
+                    # Handle non-stream JSON responses
+                    elif 'application/json' in content_type or 'application/' in content_type:
+                        payload: Any
+                        try:
+                            payload = await response.json()
+                        except Exception:
+                            # Fallback to text if JSON parsing fails
+                            payload = await response.text()
+
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        # For non-stream, first chunk equals full latency
+                        output.first_chunk_latency = output.query_latency
+
+                        if isinstance(payload, dict):
+                            # Extract generated text from choices
+                            text = ''
+                            if choices := payload.get('choices'):
+                                first = choices[0] if choices else {}
+                                # Chat Completions format
+                                msg = first.get('message') or {}
+                                if isinstance(msg, dict) and msg.get('content') is not None:
+                                    text = msg.get('content') or ''
+                                else:
+                                    # Legacy Completions format
+                                    text = first.get('text') or ''
+                                generated_text = text
+
+                            # Extract usage if provided
+                            if usage := payload.get('usage'):
+                                output.prompt_tokens = usage.get('prompt_tokens')
+                                output.completion_tokens = usage.get('completion_tokens')
+
+                            output.response_messages.append(payload)
+                        else:
+                            generated_text = str(payload)
+
+                        output.generated_text = generated_text
+                        output.success = True
+
+                    else:
+                        # Unknown successful content-type: read as text
+                        raw = await response.text()
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        output.first_chunk_latency = output.query_latency
+                        output.generated_text = raw
+                        output.response_messages.append(raw)
+                        output.success = True
+                else:
+                    # Try to parse structured error, fallback to reason/text
+                    try:
+                        err_payload = await response.json()
+                        output.error = json.dumps(err_payload, ensure_ascii=False)
+                    except Exception:
+                        try:
+                            output.error = await response.text()
+                        except Exception:
+                            output.error = response.reason or ''
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = ''.join(traceback.format_exception(*exc_info))
+            logger.error(output.error)
+
+        return output
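
The new `StreamedResponseHandler` releases an SSE message only when it is complete: either a `'\n\n'` separator has arrived, the buffered `data:` payload parses as JSON, or it is the `[DONE]` sentinel. A minimal sketch of that buffering behaviour (illustrative only; the import path follows the file list above):

```python
# Illustrative only: exercises the SSE buffering introduced in this release.
# Import path assumed from evalscope/perf/plugin/api/default_api.py above.
from evalscope.perf.plugin.api.default_api import StreamedResponseHandler

handler = StreamedResponseHandler()

# A payload split across two network reads: nothing is emitted until the
# buffered text parses as complete JSON (or a '\n\n' separator arrives).
assert handler.add_chunk(b'data: {"choices": [{"delta": {"con') == []
msgs = handler.add_chunk(b'tent": "Hi"}}]}\n\n')
assert msgs == ['data: {"choices": [{"delta": {"content": "Hi"}}]}']

# The terminal sentinel is emitted even without a trailing separator.
assert handler.add_chunk(b'data: [DONE]') == ['data: [DONE]']
```

In `process_request` the handler is fed `response.content.iter_any()` chunks, so a payload split across reads is decoded only once it is complete.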

evalscope/perf/plugin/api/openai_api.py
@@ -102,7 +102,7 @@ class OpenaiPlugin(DefaultApiPlugin):
             payload.update(param.extra_args)
         return payload
 
-    def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> tuple[int, int]:
         """Parser responses and return number of request and response tokens.
         Only one response for non-stream, multiple responses for stream.
         """
@@ -180,7 +180,7 @@ class OpenaiPlugin(DefaultApiPlugin):
         )
         return input_tokens, output_tokens
 
-    def _count_input_tokens(self, request: Dict) -> int:
+    def _count_input_tokens(self, request_str: str) -> int:
         """Count the number of input tokens in the request.
 
         This method handles different types of requests and calculates tokens for:
@@ -188,13 +188,14 @@
         - Images in multimodal messages (converted to patch tokens)
 
         Args:
-            request (Dict): The request dictionary containing either 'messages' for chat
+            request_str (str): The request json str containing either 'messages' for chat
                 completion or 'prompt' for text completion.
 
         Returns:
            int: The total number of input tokens including text and image tokens.
        """
        input_tokens = 0
+        request = json.loads(request_str)
        if 'messages' in request:
            input_content = self.tokenizer.apply_chat_template(
                request['messages'], tokenize=True, add_generation_prompt=True

evalscope/perf/plugin/datasets/base.py
@@ -15,6 +15,11 @@ class DatasetPluginBase:
             dataset_path (str, optional): The input dataset path. Defaults to None.
         """
         self.query_parameters = query_parameters
+        if query_parameters.tokenizer_path:
+            from modelscope import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(query_parameters.tokenizer_path, trust_remote_code=True)
+        else:
+            self.tokenizer = None
 
     def __next__(self):
         for item in self.build_messages():
@@ -85,3 +90,19 @@ class DatasetPluginBase:
         for url in image_urls:
             message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
         return message
+
+    def check_prompt_length(self, prompt: str) -> Tuple[bool, int]:
+        """Check if the prompt length is within the specified range.
+
+        Args:
+            prompt (str): The input prompt string.
+
+        Returns:
+            Tuple[bool, int]: A tuple containing a boolean indicating whether the prompt is valid and its length.
+        """
+        if self.tokenizer is None:
+            prompt_length = len(prompt)
+        else:
+            prompt_length = len(self.tokenizer.encode(prompt))
+        is_valid = self.query_parameters.min_prompt_length <= prompt_length <= self.query_parameters.max_prompt_length
+        return is_valid, prompt_length
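
The new `check_prompt_length` means the dataset plugins below filter prompts by token count when `--tokenizer-path` is set (falling back to character count otherwise), and the bounds are now inclusive. A standalone sketch of the same check (hypothetical helper for illustration, not the evalscope API):

```python
from typing import Tuple


def check_prompt_length(prompt: str, min_len: int, max_len: int, tokenizer=None) -> Tuple[bool, int]:
    """Mirrors DatasetPluginBase.check_prompt_length: token count when a
    tokenizer is available, character count otherwise (inclusive bounds)."""
    length = len(tokenizer.encode(prompt)) if tokenizer is not None else len(prompt)
    return min_len <= length <= max_len, length


# Without a tokenizer the bounds apply to characters.
ok, n = check_prompt_length('How tall is Mount Everest?', min_len=10, max_len=1024)
print(ok, n)  # True 26
```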

evalscope/perf/plugin/datasets/custom.py
@@ -16,9 +16,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                prompt
-            ) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/line_by_line.py
@@ -17,9 +17,8 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                prompt
-            ) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/longalpaca.py
@@ -22,9 +22,8 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
         ds = self.dataset_json_list(self.query_parameters.dataset_path)
         for item in ds:
             prompt = item['instruction'].strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                prompt
-            ) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/openqa.py
@@ -27,10 +27,8 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
             prompt = item['question'].strip()
-            if (
-                len(prompt) > self.query_parameters.min_prompt_length
-                and len(prompt) < self.query_parameters.max_prompt_length
-            ):
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/random_dataset.py
@@ -12,11 +12,9 @@ class RandomDatasetPlugin(DatasetPluginBase):
     """
 
     def __init__(self, query_parameters: Arguments):
+        assert query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer-path`.' # noqa: E501
         super().__init__(query_parameters)
-        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.' # noqa: E501
 
-        from modelscope import AutoTokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
         self.prefix_length = self.query_parameters.prefix_length
         self.prefix_ids = self.get_random_inputs(self.prefix_length)
         self.template_len = self.get_template_len()

evalscope/perf/utils/benchmark_util.py
@@ -1,4 +1,3 @@
-import time
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple
 
@@ -10,7 +9,7 @@ logger = get_logger()
 
 @dataclass
 class BenchmarkData:
-    request: Any = None
+    request: str = None # json serialized request body
     start_time: float = 0.0
     completed_time: float = 0.0
     chunk_times: List[float] = field(default_factory=list)
@@ -24,24 +23,26 @@ class BenchmarkData:
     time_per_output_token: float = 0.0
     inter_chunk_latency: List[float] = field(default_factory=list)
 
-    prompt_tokens = None
-    completion_tokens = None
-
-    def _calculate_query_stream_metric(self) -> None:
-        self.query_latency = self.completed_time - self.start_time
-        # only for stream responses
-        if len(self.chunk_times) > 1:
-            self.first_chunk_latency = self.chunk_times[0] - self.start_time
-            # remove the first chunk time from the total latency
-            self.time_per_output_token = (self.query_latency - self.first_chunk_latency
-                                          ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
-            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
-        else:
-            self.first_chunk_latency = self.query_latency
+    # response content
+    generated_text: str = ''
+    error: Optional[str] = None
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
 
     def _calculate_tokens(self, api_plugin):
-        self.prompt_tokens, self.completion_tokens = \
-            api_plugin.parse_responses(self.response_messages, request=self.request)
+        if self.prompt_tokens is None or self.completion_tokens is None:
+            self.prompt_tokens, self.completion_tokens = api_plugin.parse_responses(
+                self.response_messages, request=self.request
+            )
+
+        # Calculate time per output token
+        if self.completion_tokens and self.completion_tokens > 1:
+            # tpot = (latency - ttft) / (output_len - 1)
+            self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (self.completion_tokens - 1)
+
+        # Ensure inter-chunk latency is available (compute from chunk_times if needed)
+        if not self.inter_chunk_latency and self.chunk_times:
+            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
 
     def update_gpu_usage(self):
         if check_import('torch', raise_warning=False):
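
`BenchmarkData` now keeps the server-reported `usage` counts when they are present and only falls back to `parse_responses`; time per output token is then derived from the request timeline. With illustrative numbers:

```python
# Illustrative numbers only: tpot = (latency - ttft) / (output_len - 1)
query_latency = 2.40        # seconds from request start to last chunk
first_chunk_latency = 0.35  # time to first token (TTFT)
completion_tokens = 42

time_per_output_token = (query_latency - first_chunk_latency) / (completion_tokens - 1)
print(f'{time_per_output_token * 1000:.1f} ms/token')  # 50.0 ms/token
```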
@@ -79,6 +80,7 @@ class BenchmarkMetrics:
     n_total_prompt_tokens: int = 0
     n_total_completion_tokens: int = 0
     start_time: Optional[float] = None
+    last_completed_time: Optional[float] = None
     total_time: float = 1.0
     n_total_queries: int = 0
     n_time_per_output_token: float = 0.0
@@ -97,9 +99,6 @@ class BenchmarkMetrics:
 
     def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
         self.n_total_queries += 1
-        if self.start_time is None:
-            self.start_time = benchmark_data.start_time
-        self.total_time = time.perf_counter() - self.start_time
 
         if benchmark_data.success:
             self.n_succeed_queries += 1
@@ -108,7 +107,6 @@ class BenchmarkMetrics:
             self.n_total_prompt_tokens += benchmark_data.prompt_tokens
             self.n_total_completion_tokens += benchmark_data.completion_tokens
 
-            benchmark_data._calculate_query_stream_metric()
             self.total_latency += benchmark_data.query_latency
             self.total_first_chunk_latency += benchmark_data.first_chunk_latency
             self.n_time_per_output_token += benchmark_data.time_per_output_token
@@ -117,6 +115,22 @@
             self.n_failed_queries += 1
 
         self.calculate_averages()
+        self.update_total_time(benchmark_data)
+
+    def update_total_time(self, benchmark_data: BenchmarkData):
+        # Use the earliest start_time seen so far
+        if self.start_time is None:
+            self.start_time = benchmark_data.start_time
+        else:
+            self.start_time = min(self.start_time, benchmark_data.start_time)
+        # Track the latest completion time
+        if self.last_completed_time is None:
+            self.last_completed_time = benchmark_data.completed_time
+        else:
+            self.last_completed_time = max(self.last_completed_time, benchmark_data.completed_time)
+        # Compute total_time from request lifecycle timestamps to avoid consumer overhead
+        if self.start_time is not None and self.last_completed_time is not None:
+            self.total_time = max(self.last_completed_time - self.start_time, 0.0)
 
     def calculate_averages(self):
         if self.n_succeed_queries == 0:
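
`total_time` is now the span between the earliest `start_time` and the latest `completed_time` recorded on the requests themselves, instead of `time.perf_counter()` sampled when the metrics consumer runs. A small illustration with made-up timestamps:

```python
# Made-up (start_time, completed_time) pairs as they would appear in BenchmarkData.
requests = [
    (100.0, 102.5),
    (100.2, 101.9),
    (100.4, 103.1),
]

start = min(s for s, _ in requests)  # earliest request start
end = max(c for _, c in requests)    # latest completion
total_time = max(end - start, 0.0)
print(round(total_time, 1))  # 3.1 -> throughput = n_total_queries / total_time
```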

evalscope/perf/utils/db_util.py
@@ -19,7 +19,7 @@ logger = get_logger()
 class DatabaseColumns:
     REQUEST = 'request'
     START_TIME = 'start_time'
-    CHUNK_TIMES = 'chunk_times'
+    INTER_TOKEN_LATENCIES = 'inter_token_latencies'
     SUCCESS = 'success'
     RESPONSE_MESSAGES = 'response_messages'
     COMPLETED_TIME = 'completed_time'
@@ -60,7 +60,7 @@ def create_result_table(cursor):
         f'''CREATE TABLE IF NOT EXISTS result(
             {DatabaseColumns.REQUEST} TEXT,
             {DatabaseColumns.START_TIME} REAL,
-            {DatabaseColumns.CHUNK_TIMES} TEXT,
+            {DatabaseColumns.INTER_TOKEN_LATENCIES} TEXT,
             {DatabaseColumns.SUCCESS} INTEGER,
             {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
             {DatabaseColumns.COMPLETED_TIME} REAL,
@@ -75,15 +75,15 @@
 
 
 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
-    request = encode_data(benchmark_data.request)
-    chunk_times = json.dumps(benchmark_data.chunk_times)
+    request = benchmark_data.request
+    inter_token_latencies = json.dumps(benchmark_data.inter_chunk_latency)
     response_messages = encode_data(benchmark_data.response_messages)
 
     # Columns common to both success and failure cases
     common_columns = (
         request,
         benchmark_data.start_time,
-        chunk_times,
+        inter_token_latencies,
         benchmark_data.success,
         response_messages,
         benchmark_data.completed_time,
@@ -96,7 +96,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
             benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
         )
         query = f"""INSERT INTO result(
-            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
             {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
             {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
             {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
@@ -105,7 +105,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
         cursor.execute(query, common_columns + additional_columns)
     else:
         query = f"""INSERT INTO result(
-            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
             {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
         ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)
@@ -173,20 +173,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     :param result_db_path: Path to the SQLite database file.
     :return: Dictionary of percentiles for various metrics.
     """
-
-    def inter_token_latencies(chunk_times_json: str) -> List[float]:
-        try:
-            chunk_times = json.loads(chunk_times_json)
-            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
-        except (json.JSONDecodeError, TypeError) as e:
-            logger.error(f'Error parsing chunk times: {e}')
-            return []
-
-    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES}, {DatabaseColumns.SUCCESS},
                     {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
                     {DatabaseColumns.PROMPT_TOKENS},
                     {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
-                    FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
+                    FROM result WHERE {DatabaseColumns.SUCCESS}=1''' # noqa: E501
 
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]
 
@@ -202,7 +193,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     # Prepare data for each metric
     inter_token_latencies_all = []
    for row in rows:
-        inter_token_latencies_all.extend(inter_token_latencies(row[col_indices[DatabaseColumns.CHUNK_TIMES]]))
+        try:
+            itl = json.loads(row[col_indices[DatabaseColumns.INTER_TOKEN_LATENCIES]]) or []
+            inter_token_latencies_all.extend(itl)
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing inter token latencies: {e}')
 
     metrics = {
         PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
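
Because the result table now stores each request's inter-token latencies as a JSON array (the `inter_token_latencies` column) rather than raw chunk timestamps, downstream analysis only needs to parse and flatten that column. A sketch of reading it back (the database filename is an assumption; the table and column names come from the diff above):

```python
# Illustrative only: aggregate inter-token latencies from the result database.
import json
import sqlite3

con = sqlite3.connect('benchmark_data.db')  # filename is an assumption
rows = con.execute('SELECT inter_token_latencies FROM result WHERE success=1').fetchall()

itl_all = []
for (itl_json,) in rows:
    try:
        itl_all.extend(json.loads(itl_json) or [])
    except (json.JSONDecodeError, TypeError):
        continue

if itl_all:
    itl_all.sort()
    p99 = itl_all[int(0.99 * (len(itl_all) - 1))]  # rough nearest-rank percentile
    print(f'P99 inter-token latency: {p99 * 1000:.1f} ms')
```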