evalscope 0.6.0rc0__py3-none-any.whl → 0.7.0__py3-none-any.whl
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
- evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
- evalscope/backend/rag_eval/utils/clip.py +149 -0
- evalscope/backend/rag_eval/utils/embedding.py +183 -0
- evalscope/backend/rag_eval/utils/llm.py +72 -0
- evalscope/backend/rag_eval/utils/tools.py +63 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
- evalscope/benchmarks/ceval/samples.jsonl +1 -0
- evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
- evalscope/benchmarks/mmlu/samples.jsonl +5 -0
- evalscope/benchmarks/race/samples.jsonl +5 -0
- evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
- evalscope/cli/start_perf.py +8 -11
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
- evalscope/metrics/rouge_metric.py +30 -15
- evalscope/perf/arguments.py +179 -0
- evalscope/perf/benchmark.py +245 -0
- evalscope/perf/http_client.py +127 -711
- evalscope/perf/main.py +35 -0
- evalscope/perf/plugin/__init__.py +2 -0
- evalscope/perf/plugin/api/__init__.py +3 -0
- evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
- evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
- evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
- evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
- evalscope/perf/plugin/datasets/__init__.py +6 -0
- evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
- evalscope/perf/plugin/datasets/custom.py +21 -0
- evalscope/perf/plugin/datasets/flickr8k.py +51 -0
- evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
- evalscope/perf/plugin/datasets/longalpaca.py +28 -0
- evalscope/perf/plugin/datasets/openqa.py +38 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
- evalscope/perf/plugin/registry.py +54 -0
- evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
- evalscope/perf/utils/benchmark_util.py +135 -0
- evalscope/perf/utils/chat_service.py +252 -0
- evalscope/perf/utils/db_util.py +200 -0
- evalscope/perf/utils/handler.py +46 -0
- evalscope/perf/utils/local_server.py +139 -0
- evalscope/registry/config/cfg_arena.yaml +77 -0
- evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
- evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
- evalscope/registry/config/cfg_single.yaml +78 -0
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
- evalscope/registry/data/qa_browser/battle.jsonl +634 -0
- evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
- evalscope/registry/data/question.jsonl +80 -0
- evalscope/third_party/longbench_write/README.md +118 -0
- evalscope/third_party/longbench_write/default_task.json +27 -0
- evalscope/third_party/longbench_write/default_task.yaml +24 -0
- evalscope/third_party/toolbench_static/README.md +118 -0
- evalscope/third_party/toolbench_static/config_default.json +15 -0
- evalscope/third_party/toolbench_static/config_default.yaml +12 -0
- evalscope/third_party/toolbench_static/requirements.txt +2 -0
- evalscope/utils/logger.py +18 -20
- evalscope/utils/utils.py +41 -42
- evalscope/version.py +2 -2
- evalscope-0.7.0.dist-info/LICENSE +203 -0
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/METADATA +162 -103
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/RECORD +107 -32
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
- tests/cli/__init__.py +1 -0
- tests/cli/test_run.py +76 -0
- tests/perf/__init__.py +1 -0
- tests/perf/test_perf.py +96 -0
- tests/rag/__init__.py +0 -0
- tests/rag/test_clip_benchmark.py +85 -0
- tests/rag/test_mteb.py +136 -0
- tests/rag/test_ragas.py +120 -0
- tests/swift/__init__.py +1 -0
- tests/swift/test_run_swift_eval.py +146 -0
- tests/swift/test_run_swift_vlm_eval.py +128 -0
- tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
- tests/test_run_all.py +12 -0
- tests/vlm/__init__.py +1 -0
- tests/vlm/test_vlmeval.py +59 -0
- evalscope/perf/_logging.py +0 -32
- evalscope/perf/datasets/longalpaca_12k.py +0 -20
- evalscope/perf/datasets/openqa.py +0 -22
- evalscope/perf/plugin_registry.py +0 -35
- evalscope/perf/query_parameters.py +0 -42
- evalscope/perf/server_sent_event.py +0 -43
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
- /evalscope/{perf/datasets → backend/rag_eval/utils}/__init__.py +0 -0
- /evalscope/{preprocess/tokenizers → perf/utils}/__init__.py +0 -0
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
- {evalscope/preprocess → tests}/__init__.py +0 -0
evalscope/perf/plugin/datasets/openqa.py
@@ -0,0 +1,38 @@
+import subprocess
+from typing import Any, Dict, Iterator, List
+
+import json
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('openqa')
+class OpenqaDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    Datasets: https://www.modelscope.cn/datasets/AI-ModelScope/HC3-Chinese/resolve/master/open_qa.jsonl
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        if not self.query_parameters.dataset_path:
+            subprocess.call([
+                'modelscope',
+                'download',
+                '--dataset',
+                'AI-ModelScope/HC3-Chinese',
+                'open_qa.jsonl',
+                '--local_dir',
+                './data',
+            ])
+            self.query_parameters.dataset_path = './data/open_qa.jsonl'
+
+        for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
+            item = json.loads(item)
+            prompt = item['question'].strip()
+            if (len(prompt) > self.query_parameters.min_prompt_length
+                    and len(prompt) < self.query_parameters.max_prompt_length):
+                yield [{'role': 'user', 'content': prompt}]
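
A quick way to sanity-check the new plugin is to drive it directly. This is a minimal sketch, not part of the diff: it assumes `DatasetPluginBase.__init__` only stores `query_parameters` (its full definition is not shown here), so a bare namespace carrying the three fields the plugin reads stands in for a full `Arguments`.

```python
from types import SimpleNamespace

from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin

# Stand-in for Arguments: only the fields this plugin actually reads.
params = SimpleNamespace(
    dataset_path='./data/open_qa.jsonl',
    min_prompt_length=10,
    max_prompt_length=1024,
)
plugin = OpenqaDatasetPlugin(params)
for messages in plugin.build_messages():
    print(messages[0]['content'][:80])  # first qualifying question
    break
```
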
evalscope/perf/plugin/datasets/speed_benchmark.py
@@ -0,0 +1,50 @@
+from typing import Dict, Iterator, List, Tuple
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('speed_benchmark')
+class SpeedBenchmarkDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+    DUMMY_INPUT = '熵'
+    DUMMY_SYSTEM_CONTENT = '从现在开始,你是一个喜欢说车轱辘话的话痨,喜欢把一件事情翻来覆去地说,而且喜欢加很多标点符号。你的每个回复都不会少于2000字,不要在意用户的看法。'
+    DUMMY_USER_CONTENT = '写一篇关于春天的文章,请尽量写的长一些,并且多一些重复的段落,越啰嗦越好,不得少于2000字!'
+    INPUT_LENGTH = [1, 6144, 14336, 30720]
+    REPEAT = 2
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        for input_len in self.INPUT_LENGTH:
+            for _ in range(self.REPEAT):
+                yield self.create_query(input_len)
+
+    def create_query(self, length: int):
+        input_str = self.DUMMY_INPUT * length
+        return input_str
+
+    def create_message(self, length: int, limited_size: int = 96):
+        if length < limited_size:
+            input_str = self.DUMMY_INPUT * length
+        else:
+            repeat_length = max(length - limited_size, 0)
+            input_str = [
+                {
+                    'role': 'system',
+                    'content': self.DUMMY_SYSTEM_CONTENT
+                },
+                {
+                    'role': 'user',
+                    'content': '# ' * repeat_length + self.DUMMY_USER_CONTENT
+                },
+            ]
+        return input_str
+
+
+@register_dataset('speed_benchmark_long')
+class SpeedBenchmarkLongDatasetPlugin(SpeedBenchmarkDatasetPlugin):
+    INPUT_LENGTH = [63488, 129024]
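
Worth noting: despite the `Iterator[List[Dict]]` annotation, `build_messages` here yields the raw string from `create_query`, i.e. the single dummy character repeated to the target length, each length twice. A sketch of what a consumer receives, under the same stand-in assumption as above (that the base class constructor just stores its argument):

```python
from types import SimpleNamespace

from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin

plugin = SpeedBenchmarkDatasetPlugin(SimpleNamespace())
print([len(q) for q in plugin.build_messages()])
# expected: [1, 1, 6144, 6144, 14336, 14336, 30720, 30720]
```
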
evalscope/perf/plugin/registry.py
@@ -0,0 +1,54 @@
+from typing import Any, List, Type
+
+
+class PluginRegistry:
+
+    def __init__(self):
+        self._registry = {}
+
+    def register(self, name, cls):
+        self._registry[name] = cls
+        return cls
+
+    def get_class(self, name):
+        return self._registry[name]
+
+    def all_classes(self):
+        return list(self._registry.keys())
+
+    def __call__(self, name: str) -> Any:
+        return self.get_class(name)
+
+
+def register_dataset(name: str | List[str]):
+
+    def class_decorator(cls: Type):
+        if isinstance(name, str):
+            DatasetRegistry.register(name, cls)
+        elif isinstance(name, list):
+            for n in name:
+                DatasetRegistry.register(n, cls)
+        else:
+            raise TypeError('name must be a string or a list of strings')
+        return cls
+
+    return class_decorator
+
+
+def register_api(name: str | List[str]):
+
+    def class_decorator(cls: Type):
+        if isinstance(name, str):
+            ApiRegistry.register(name, cls)
+        elif isinstance(name, list):
+            for n in name:
+                ApiRegistry.register(n, cls)
+        else:
+            raise TypeError('name must be a string or a list of strings')
+        return cls
+
+    return class_decorator
+
+
+DatasetRegistry = PluginRegistry()
+ApiRegistry = PluginRegistry()
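
The registry module is self-contained, so its contract can be shown directly: `register_dataset` accepts a single name or a list of aliases, and the `DatasetRegistry` instance is callable for lookup. A small sketch with a hypothetical plugin class:

```python
from evalscope.perf.plugin.registry import DatasetRegistry, register_dataset

@register_dataset(['my_data', 'my_data_alias'])
class MyDatasetPlugin:  # hypothetical; real plugins subclass DatasetPluginBase
    pass

assert DatasetRegistry('my_data') is MyDatasetPlugin
assert DatasetRegistry.get_class('my_data_alias') is MyDatasetPlugin
# Lists registered names; 'openqa' etc. appear once their modules are imported.
print(DatasetRegistry.all_classes())
```
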
evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py}
@@ -1,12 +1,15 @@
-import sqlite3
 import base64
 import pickle
+import sqlite3
+
 import json
-
+
+result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
-    FROM result WHERE success='
-
+    FROM result WHERE success='1'"
+
+# how to save base64.b64encode(pickle.dumps(benchmark_data["request"])).decode("ascii"),
 with con:
     rows = con.execute(query_sql).fetchall()
     if len(rows) > 0:
@@ -20,5 +23,8 @@ with con:
             response_content = ''
             for response in responses:
                 response = json.loads(response)
+                if not response['choices']:
+                    continue
                 response_content += response['choices'][0]['delta']['content']
-        print('prompt: %s, tokens: %s, completion: %s, tokens: %s' %
+            print('prompt: %s, tokens: %s, completion: %s, tokens: %s' %
+                  (request['messages'][0]['content'], row[2], response_content, row[3]))
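
The comment kept in the script records how the `request` column was written (`base64.b64encode(pickle.dumps(...)).decode("ascii")`), so reading it back reverses those two steps. The script's own decode step falls outside the hunks shown; this helper is a sketch of what that saved format implies:

```python
import base64
import pickle

def decode_column(value: str):
    """Reverse of base64.b64encode(pickle.dumps(obj)).decode('ascii')."""
    return pickle.loads(base64.b64decode(value))

# e.g., for each row of the result table:
# request = decode_column(row[0])
# responses = decode_column(row[1])
```
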
evalscope/perf/utils/benchmark_util.py
@@ -0,0 +1,135 @@
+import time
+from dataclasses import dataclass, field
+from typing import Any, List, Optional, Tuple
+
+import torch
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@dataclass
+class BenchmarkData:
+    request: Any = None
+    start_time: float = field(default_factory=time.perf_counter)
+    completed_time: float = 0.0
+    chunk_times: List[float] = field(default_factory=list)
+    success: bool = False
+    response_messages: List[Any] = field(default_factory=list)
+
+    # late init
+    query_latency: float = 0.0
+    first_chunk_latency: float = 0.0
+    n_chunks: int = 0
+    n_chunks_time: float = 0.0
+    max_gpu_memory_cost = 0
+
+    prompt_tokens = None
+    completion_tokens = None
+
+    def _calculate_query_stream_metric(self) -> Tuple[float, int, float]:
+        self.query_latency = self.completed_time - self.start_time
+        if len(self.chunk_times) > 1:
+            self.first_chunk_latency = self.chunk_times[0] - self.start_time
+            self.n_chunks = len(self.chunk_times) - 2
+            self.n_chunks_time = self.chunk_times[-2] - self.chunk_times[0]
+        else:
+            self.first_chunk_latency = self.query_latency
+            self.n_chunks = 1
+            self.n_chunks_time = self.query_latency
+
+    def _calculate_tokens(self, api_plugin):
+        self.prompt_tokens, self.completion_tokens = \
+            api_plugin.parse_responses(self.response_messages, request=self.request)
+
+    def update_gpu_usage(self):
+        total_memory = 0
+        for i in range(torch.cuda.device_count()):
+            total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
+        self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
+
+
+@dataclass
+class BenchmarkMetrics:
+    concurrency: int = 0
+    n_succeed_queries: int = 0
+    n_failed_queries: int = 0
+    total_first_chunk_latency: float = 0.0
+    total_latency: float = 0.0
+    n_total_chunks: int = 0
+    n_total_prompt_tokens: int = 0
+    n_total_completion_tokens: int = 0
+    total_chunks_time: float = 0.0
+    start_time: Optional[float] = None
+    total_time: float = 1.0
+    n_total_queries: int = 0
+
+    avg_first_chunk_latency: float = -1
+    avg_latency: float = -1
+    n_avg_chunks: float = -1
+    avg_chunk_time: float = -1
+    avg_prompt_tokens: float = -1
+    avg_completion_tokens: float = -1
+    avg_token_per_seconds: float = -1
+    avg_time_per_token: float = -1
+    qps: float = -1
+
+    def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
+        self.n_total_queries += 1
+        if self.start_time is None:
+            self.start_time = benchmark_data.start_time
+        self.total_time = time.perf_counter() - self.start_time
+
+        if benchmark_data.success:
+            self.n_succeed_queries += 1
+
+            benchmark_data._calculate_tokens(api_plugin)
+            self.n_total_prompt_tokens += benchmark_data.prompt_tokens
+            self.n_total_completion_tokens += benchmark_data.completion_tokens
+
+            benchmark_data._calculate_query_stream_metric()
+            self.total_latency += benchmark_data.query_latency
+            self.total_first_chunk_latency += benchmark_data.first_chunk_latency
+            self.n_total_chunks += benchmark_data.n_chunks
+            self.total_chunks_time += benchmark_data.n_chunks_time
+        else:
+            self.n_failed_queries += 1
+
+        self.calculate_averages()
+
+    def calculate_averages(self):
+        if self.n_succeed_queries == 0:
+            return
+        try:
+            self.avg_first_chunk_latency = self.total_first_chunk_latency / self.n_succeed_queries
+            self.avg_latency = self.total_latency / self.n_succeed_queries
+            self.n_avg_chunks = self.n_total_chunks / self.n_succeed_queries
+            self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
+            self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
+            self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
+            self.avg_token_per_seconds = self.n_total_completion_tokens / self.total_time
+            self.avg_time_per_token = self.total_time / self.n_total_completion_tokens
+            self.qps = self.n_succeed_queries / self.total_time
+        except ZeroDivisionError as e:
+            logger.exception(e)
+            return
+
+    def create_message(self, default_ndigits=3):
+        message = {
+            'Time taken for tests (senconds)': round(self.total_time, default_ndigits),
+            'Number of concurrency': self.concurrency,
+            'Total requests': int(self.n_total_queries),
+            'Succeed requests': self.n_succeed_queries,
+            'Failed requests': self.n_failed_queries,
+            'Average QPS': round(self.qps, default_ndigits),
+            'Average latency (s)': round(self.avg_latency, default_ndigits),
+            'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
+            'Average time per output token (s)': round(self.avg_time_per_token, 5),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
+            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
+            'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
+            'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+        }
+        return message
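
One completed request flows through the accumulator like this. A sketch under stated assumptions: `ApiPluginStub` is hypothetical and stands in for a real API plugin's `parse_responses`, and `torch` must be importable since the module imports it at top level.

```python
from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics

class ApiPluginStub:  # hypothetical stand-in for an API plugin
    def parse_responses(self, responses, request=None):
        return 12, 34  # (prompt_tokens, completion_tokens)

data = BenchmarkData(request={'messages': [{'role': 'user', 'content': 'hi'}]})
t0 = data.start_time
data.chunk_times = [t0 + 0.1, t0 + 0.3, t0 + 0.5]  # three streamed chunks
data.completed_time = t0 + 0.5
data.success = True

metrics = BenchmarkMetrics(concurrency=1)
metrics.update_metrics(data, ApiPluginStub())
print(metrics.create_message())  # the summary dict built above
```
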
evalscope/perf/utils/chat_service.py
@@ -0,0 +1,252 @@
+import os
+import time
+from contextlib import contextmanager
+from functools import partial
+from threading import Thread
+from typing import List, Literal, Optional, Union
+
+import torch
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+from pydantic import BaseModel, Field
+from transformers import TextIteratorStreamer
+
+
+class Usage(BaseModel):
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    total_tokens: int = 0
+
+
+class ModelCard(BaseModel):
+    id: str
+    object: str = 'model'
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = 'owner'
+    root: Optional[str] = None
+    parent: Optional[str] = None
+    permission: Optional[list] = None
+
+
+class ModelList(BaseModel):
+    object: str = 'list'
+    data: List[ModelCard] = []
+
+
+class ChatMessage(BaseModel):
+    role: Literal['user', 'assistant', 'system']
+    content: str
+
+
+class DeltaMessage(BaseModel):
+    role: Optional[Literal['user', 'assistant', 'system']] = None
+    content: Optional[str] = None
+
+
+class ChatCompletionRequest(BaseModel):
+    model: str
+    messages: List[ChatMessage] | str
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    max_tokens: Optional[int] = 2048
+    min_tokens: Optional[int] = None
+    stream: Optional[bool] = False
+
+
+class ChatCompletionResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    finish_reason: Literal['stop', 'length']
+
+
+class ChatCompletionResponseStreamChoice(BaseModel):
+    index: int
+    delta: DeltaMessage
+    finish_reason: Optional[Literal['stop', 'length']]
+
+
+class ChatCompletionResponse(BaseModel):
+    model: str
+    object: Literal['chat.completion', 'chat.completion.chunk']
+    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
+    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
+    usage: Optional[Usage]
+
+
+class TextCompletionRequest(BaseModel):
+    model: str
+    prompt: str
+    temperature: Optional[float] = None
+    max_tokens: Optional[int] = 2048
+    min_tokens: Optional[int] = None
+
+
+class TextCompletionResponseChoice(BaseModel):
+    index: int
+    text: str
+    finish_reason: Literal['stop', 'length']
+
+
+class TextCompletionResponse(BaseModel):
+    model: str
+    object: Literal['text_completion']
+    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
+    choices: List[TextCompletionResponseChoice]
+    usage: Optional[Usage]
+
+
+class ChatService:
+
+    def __init__(self, model_path, attn_implementation):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            device_map='auto',
+            torch_dtype='auto',
+            attn_implementation=attn_implementation,
+        ).eval()
+        self.streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True)
+        self.model_id = os.path.basename(model_path)
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    def count_tokens(self, text: str) -> int:
+        # Use the tokenizer to count the number of tokens
+        return len(self.tokenizer.encode(text, add_special_tokens=False))
+
+    async def list_models(self):
+        model_card = ModelCard(id=self.model_id)
+        return ModelList(data=[model_card])
+
+    async def _chat(self, request: ChatCompletionRequest):
+        formatted_prompt, inputs, prompt_tokens = self._prepare_chat_inputs(request)
+        outputs = self.model.generate(
+            **inputs,
+            max_new_tokens=request.max_tokens,
+            min_new_tokens=request.min_tokens,
+            temperature=request.temperature,
+        )
+        outputs = outputs[0][prompt_tokens:]  # remove prompt
+        completion_tokens = len(outputs)
+        response = self.tokenizer.decode(outputs, skip_special_tokens=True)
+
+        choice_data = ChatCompletionResponseChoice(
+            index=0,
+            message=ChatMessage(role='assistant', content=response),
+            finish_reason='stop',
+        )
+        return ChatCompletionResponse(
+            model=self.model_id,
+            choices=[choice_data],
+            object='chat.completion',
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            ),
+        )
+
+    async def _text_completion(self, request: TextCompletionRequest):
+        inputs, prompt_tokens = self._prepare_text_inputs(request)
+        outputs = self.model.generate(
+            **inputs,
+            max_new_tokens=request.max_tokens,
+            min_new_tokens=request.min_tokens,
+            temperature=request.temperature,
+        )
+        outputs = outputs[0][prompt_tokens:]  # remove prompt
+        completion_tokens = len(outputs)
+        response = self.tokenizer.decode(outputs, skip_special_tokens=True)
+
+        choice_data = TextCompletionResponseChoice(
+            index=0,
+            text=response,
+            finish_reason='stop',
+        )
+        return TextCompletionResponse(
+            model=self.model_id,
+            choices=[choice_data],
+            object='text_completion',
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            ),
+        )
+
+    def _prepare_text_inputs(self, request: TextCompletionRequest):
+        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=True).to(self.device)
+        prompt_tokens = len(inputs['input_ids'][0])
+        return inputs, prompt_tokens
+
+    def _stream_chat(self, request: ChatCompletionRequest):
+        formatted_prompt, inputs, prompt_tokens = self._prepare_chat_inputs(request)
+        completion_tokens = 0
+
+        yield self._create_initial_chunk()
+
+        generation_kwargs = dict(
+            **inputs,
+            streamer=self.streamer,
+            max_new_tokens=request.max_tokens,
+            min_new_tokens=request.min_tokens,
+            temperature=request.temperature,
+        )
+        generate_partial = partial(self.model.generate, **generation_kwargs)
+
+        with self._start_generation_thread(generate_partial):
+            for new_text in self.streamer:
+                yield self._create_chunk(new_text)
+                completion_tokens += self.count_tokens(new_text)
+
+        yield self._create_final_chunk(prompt_tokens, completion_tokens)
+        yield '[DONE]'
+
+    def _prepare_chat_inputs(self, request: ChatCompletionRequest):
+        formatted_prompt = self.tokenizer.apply_chat_template(
+            request.messages, tokenize=False, add_generation_prompt=True)
+        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
+        prompt_tokens = len(inputs['input_ids'][0])
+        return formatted_prompt, inputs, prompt_tokens
+
+    @contextmanager
+    def _start_generation_thread(self, generate_partial):
+        thread = Thread(target=generate_partial)
+        thread.start()
+        try:
+            yield
+        finally:
+            thread.join()
+
+    def _create_initial_chunk(self):
+        choice_data = ChatCompletionResponseStreamChoice(index=0, delta={'role': 'assistant'}, finish_reason=None)
+        chunk = ChatCompletionResponse(
+            model=self.model_id,
+            choices=[choice_data],
+            object='chat.completion.chunk',
+            usage=None,
+        )
+        return chunk.model_dump_json(exclude_unset=True)
+
+    def _create_chunk(self, new_text):
+        choice_data = ChatCompletionResponseStreamChoice(index=0, delta={'content': new_text}, finish_reason=None)
+        chunk = ChatCompletionResponse(
+            model=self.model_id,
+            choices=[choice_data],
+            object='chat.completion.chunk',
+            usage=None,
+        )
+        return chunk.model_dump_json(exclude_unset=True)
+
+    def _create_final_chunk(self, prompt_tokens, completion_tokens):
+        choice_data = ChatCompletionResponseStreamChoice(index=0, delta={}, finish_reason='stop')
+        chunk = ChatCompletionResponse(
+            model=self.model_id,
+            choices=[choice_data],
+            object='chat.completion.chunk',
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            ),
+        )
+        return chunk.model_dump_json(exclude_unset=True)