evalscope 0.13.1__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +4 -5
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +26 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/config.py +1 -1
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/perf/arguments.py +8 -6
- evalscope/perf/benchmark.py +31 -63
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/db_util.py +2 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/METADATA +10 -49
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/RECORD +35 -28
- tests/cli/test_all.py +33 -24
- tests/cli/test_run.py +35 -18
- tests/rag/test_ragas.py +4 -1
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
ADDED

@@ -0,0 +1,182 @@
+from collections import defaultdict
+from typing import Any, Dict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
+    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
+    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
+    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
+    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
+    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
+    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
+    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
+    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
+    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
+    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
+    'world_religions'
+]
+
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}
+
+
+@Benchmark.register(
+    name='mmlu_redux',
+    pretty_name='MMLU-Redux',
+    dataset_id='AI-ModelScope/mmlu-redux-2.0',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}', # noqa: E501
+)
+class MMLUReduxAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if self.few_shot_num > 0:
+            self.few_shot_num = 0
+            logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['choices']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt)
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
+                self.__form_options(d['choices']) + '\n'
+        return prompts
+
+    def __form_options(self, options: list):
+        option_str = 'Options are:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        answer_index = int(input_d['answer'])
+        return self.choices[answer_index]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+
+        Args:
+            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'A', extracted from get_gold_answer method.
+            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'B', extracted from parse_pred_result method.
+
+        Returns:
+            The match result. Usually a score (float) for chat/multiple-choice-questions.
+        """
+        return exact_match(gold=gold, pred=pred)
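
Note: the following is a small standalone sketch (not part of the diff) of how the registered prompt_template and the adapter's option formatting above combine into one final query. The sample record and subset name are invented for illustration.

# Standalone sketch: reproduces the prompt assembly of MMLUReduxAdapter above.
# The sample record is invented; real records come from AI-ModelScope/mmlu-redux-2.0.
PROMPT_TEMPLATE = (
    'The following are multiple choice questions (with answers) about {subset_name}. '
    'Think step by step and then finish your answer with "the answer is (X)" '
    'where X is the correct letter choice.\n{query}')
CHOICES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']


def form_options(options):
    # Mirrors MMLUReduxAdapter.__form_options: label each option with a letter.
    option_str = 'Options are:\n'
    for opt, choice in zip(options, CHOICES):
        option_str += f'({choice}): {opt}\n'
    return option_str


sample = {'question': 'What is the order of the group Z_4 x Z_2?',
          'choices': ['2', '4', '8', '12'], 'answer': 2}  # invented item
query = 'Q: ' + sample['question'] + '\n' + form_options(sample['choices']) + '\n'
print(PROMPT_TEMPLATE.format(subset_name='abstract_algebra', query=query))
print('gold:', CHOICES[int(sample['answer'])])  # as in get_gold_answer -> 'C'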
evalscope/benchmarks/simple_qa/simple_qa_adapter.py
CHANGED

@@ -126,7 +126,7 @@ class SimpleQAAdapter(DataAdapter):

     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -159,9 +159,6 @@ class SimpleQAAdapter(DataAdapter):
            review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
        """
        # zip dict answers
-        res_dict = defaultdict(list)
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)

        return super().compute_metric(res_dict, **kwargs)
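
Note: the aggregation is now delegated to compute_dict_metric on the DataAdapter base class. Judging from the removed inline loop, it presumably groups per-sample result dicts into lists per metric key, roughly as in this standalone sketch (the helper name here is illustrative, not the library API).

from collections import defaultdict


def zip_dict_metrics(review_res_list):
    # What the removed inline loop did: turn a list of per-sample dicts such as
    # {'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0} into a dict
    # of lists keyed by metric name, ready for compute_metric.
    res_dict = defaultdict(list)
    for res in review_res_list:
        for key, value in res.items():
            res_dict[key].append(value)
    return res_dict


print(zip_dict_metrics([
    {'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0},
    {'is_correct': 0, 'is_incorrect': 1, 'is_not_attempted': 0},
]))
# -> {'is_correct': [1, 0], 'is_incorrect': [0, 1], 'is_not_attempted': [0, 0]}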
evalscope/config.py
CHANGED
evalscope/metrics/llm_judge.py
CHANGED

@@ -49,7 +49,7 @@ class LLMJudge:
         """
         self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
         self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
-        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-
+        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config
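
Note: per the constructor above, the judge model now falls back to the LOCAL_LLM environment variable and then to 'gpt-4'. A minimal sketch of pointing the judge at an OpenAI-compatible endpoint via the same environment variables (all values are placeholders):

import os

# Placeholders: any OpenAI-compatible endpoint and model name can be used.
os.environ['OPENAI_API_BASE'] = 'http://127.0.0.1:8000/v1'
os.environ['OPENAI_API_KEY'] = 'EMPTY'
os.environ['LOCAL_LLM'] = 'my-local-judge-model'  # otherwise the default is now 'gpt-4'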
evalscope/models/chat_adapter.py
CHANGED

@@ -1,13 +1,13 @@
 import os
 import time
 import torch
-from typing import List, Union
+from typing import Any, Dict, List, Tuple, Union

 from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
 from evalscope.models.register import register_model_adapter
-from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning

@@ -60,7 +60,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

         return generation_config

-    def _model_generate(self,
+    def _model_generate(self,
+                        queries: List[str],
+                        system_prompts: List[str] = None,
+                        infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
         """
         Args:
             queries: The input queries.
@@ -69,6 +72,11 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         Returns:
             The prediction results.
         """
+        if system_prompts is None:
+            system_prompts = []
+        if infer_cfg is None:
+            infer_cfg = {}
+
         # Process infer_cfg
         num_return_sequences = infer_cfg.get('num_return_sequences', 1)
         if num_return_sequences > 1:
@@ -111,7 +119,9 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)

+        # Decode output
         responses = []
+        input_lengths = [len(self.tokenizer.encode(prompt)) for prompt in formatted_prompts]
         for i in range(0, len(output_ids), num_return_sequences):
             query_responses = []
             for j in range(num_return_sequences):
@@ -121,7 +131,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
                 query_responses.append(response)
             responses.append(query_responses)

-        return responses
+        return responses, input_lengths

     @torch.no_grad()
     def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
@@ -141,22 +151,33 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             queries.append(input_item['data'][0])
             system_prompts.append(input_item.get('system_prompt', None))

-        responses = self._model_generate(queries, system_prompts, infer_cfg)
+        # Run inference
+        responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)

+        # Process outputs
         results = []
-        for response in responses:
-            choices_list = [
-                ChatCompletionResponseChoice(
+        for response, input_length in zip(responses, input_lengths):
+            choices_list = []
+            completion_tokens = 0
+
+            for index, one_response in enumerate(response):
+                choice = ChatCompletionResponseChoice(
                     index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
-                for index, one_response in enumerate(response)
-            ]
+                choices_list.append(choice)
+
+                completion_tokens += len(self.tokenizer.encode(one_response))
+
+            usage = Usage(
+                prompt_tokens=input_length,
+                completion_tokens=completion_tokens,
+                total_tokens=input_length + completion_tokens)

             res_d = ChatCompletionResponse(
                 model=self.model_id,
                 choices=choices_list,
                 object='chat.completion',
                 created=int(time.time()),
-                usage=
+                usage=usage).model_dump(exclude_unset=True)

             results.append(res_d)

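
Note: the refactored predict() above now reports token usage by tokenizing the formatted prompt and each generated reply. A standalone sketch of the same accounting with a Hugging Face tokenizer (the checkpoint name is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder checkpoint

formatted_prompt = 'Hello, how are you?'
generated = ['I am fine, thank you.', 'Doing well, thanks for asking.']

prompt_tokens = len(tokenizer.encode(formatted_prompt))
completion_tokens = sum(len(tokenizer.encode(r)) for r in generated)
usage = {
    'prompt_tokens': prompt_tokens,
    'completion_tokens': completion_tokens,
    'total_tokens': prompt_tokens + completion_tokens,
}
print(usage)  # same fields as the Usage object populated above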
evalscope/perf/arguments.py
CHANGED

@@ -27,7 +27,7 @@ class Arguments:
     no_test_connection: bool = False  # Test the connection before starting the benchmark

     # Performance and parallelism
-    number:
+    number: int = 1000  # Number of requests to be made
     parallel: int = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)

@@ -60,10 +60,11 @@ class Arguments:
     seed: Optional[int] = 42  # Random seed for reproducibility
     stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
-    stream: Optional[bool] =
-    temperature:
+    stream: Optional[bool] = False  # Whether to stream the response
+    temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
     top_k: Optional[int] = None  # Top-k sampling setting for the response
+    extra_args: Optional[Dict[str, Any]] = None  # Extra arguments

     @staticmethod
     def from_args(args):
@@ -126,7 +127,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501

     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=
+    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')

@@ -161,10 +162,11 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--seed', type=int, help='The random seed', default=42)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-    parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=
-    parser.add_argument('--temperature', type=float, help='The sample temperature', default=
+    parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=False)
+    parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
+    parser.add_argument('--extra-args', type=json.loads, default='{}', help='Extra arguments, should in JSON format',)
     # yapf: enable

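
Note: a short sketch of how the new --extra-args flag is parsed. argparse applies json.loads, so the value must be a JSON object; the parser setup mirrors the add_argument call above, and the flag value is an arbitrary example.

import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument('--extra-args', type=json.loads, default='{}',
                    help='Extra arguments, should in JSON format')
args = parser.parse_args(['--extra-args', '{"repetition_penalty": 1.05, "enable_thinking": false}'])
print(args.extra_args)  # {'repetition_penalty': 1.05, 'enable_thinking': False}
# Downstream (see the OpenaiPlugin hunk below) this dict is merged into the request payload.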
evalscope/perf/benchmark.py
CHANGED

@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import List
+from typing import AsyncGenerator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -21,92 +21,68 @@ from evalscope.perf.utils.local_server import start_app
 from evalscope.utils.logger import get_logger

 logger = get_logger()
-query_send_completed_event = asyncio.Event()
+
 data_process_completed_event = asyncio.Event()


 @exception_handler
-async def
+async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
     query_generator_class = ApiRegistry(args.api)
     query_generator = query_generator_class(args.tokenizer_path)

     def load_prompt(prompt_path_or_text):
-        """Load the prompt from a file or directly from the input text."""
         if prompt_path_or_text.startswith('@'):
             with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
                 return file.read()
         return prompt_path_or_text

-    async def
-        """Dispatch a single request with optional rate limiting."""
-        await request_queue.put(request)
-        if args.rate != -1:
-            interval = np.random.exponential(1.0 / args.rate)
-            await asyncio.sleep(interval)
-
-    async def dispatch_requests_from_prompt(messages):
-        """Generate and dispatch requests based on the given prompt."""
+    async def generate_requests_from_prompt(messages):
         request = query_generator.build_request(messages, args)
-        if args.number is None:
-            await dispatch_request(request)
-            return 1
         for _ in range(args.number):
-
-        return args.number
+            yield request

-    async def
-        """Generate and dispatch requests based on the dataset."""
-        total_query_count = 0
+    async def generate_requests_from_dataset():
         message_generator_class = DatasetRegistry(args.dataset)
         message_generator = message_generator_class(args)

+        count = 0
         for messages in message_generator:
             request = query_generator.build_request(messages, args)
-            if request is None:
-
-
-
-
-                break
-
-        return total_query_count
+            if request is not None:
+                yield request
+                count += 1
+                if args.number and count >= args.number:
+                    break

-    # Load prompt or dataset and dispatch requests accordingly
     if args.prompt:
         prompt = load_prompt(args.prompt)
         messages = [{'role': 'user', 'content': prompt}]
-
+        generator = generate_requests_from_prompt(messages)
     elif args.dataset:
-
+        generator = generate_requests_from_dataset()
     else:
         raise Exception('Either prompt or dataset is required!')

-
+    async for request in generator:
+        yield request
+        if args.rate != -1:
+            interval = np.random.exponential(1.0 / args.rate)
+            await asyncio.sleep(interval)


 @exception_handler
-async def
-
-
+async def send_request(
+    semaphore: asyncio.Semaphore,
+    request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
 ):
-
-
-
-        try:
-            # Attempt to get a request from the queue with a timeout
-            request = await asyncio.wait_for(request_queue.get(), timeout=0.0001)
-            request_queue.task_done()
-        except asyncio.TimeoutError:
-            # If timeout, continue to the next iteration
-            continue
-
-        # Initialize benchmark data for the current request
+    async with semaphore:
+        client = AioHttpClient(args)
+        async with client:
            benchmark_data = BenchmarkData(request=request)
            collected_messages = []
            try:
-                # Send the request and process the response
                async for is_error, state_code, response_data in client.post(request):
                    if is_error or state_code != HTTPStatus.OK:
                        logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
@@ -124,7 +100,6 @@ async def send_requests_worker(
                logger.exception(e)
                logger.error(f'Request query: {request} exception')
            finally:
-                # Record completion time and collected messages
                benchmark_data.completed_time = time.perf_counter()
                benchmark_data.response_messages = collected_messages
                await benchmark_data_queue.put(benchmark_data)
@@ -152,7 +127,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,

    collected_benchmark_data = []

-    with tqdm(desc='Processing') as pbar:
+    with tqdm(desc='Processing', total=args.number) as pbar:
        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
            try:
                # Attempt to get benchmark data from the queue with a timeout
@@ -216,39 +191,32 @@ async def benchmark(args: Arguments) -> None:
    add_signal_handlers(loop)

    # init queue
-    request_queue = asyncio.Queue()
    benchmark_data_queue = asyncio.Queue()

    # reset event
-    query_send_completed_event.clear()
    data_process_completed_event.clear()

+    semaphore = asyncio.Semaphore(args.parallel)
+
    async def create_send_request_tasks():
        tasks: List[asyncio.Task] = []
-        for
-            task = asyncio.create_task(
+        async for request in get_requests(args):
+            task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
            tasks.append(task)
        return tasks

    async def run_tasks():
        await start_server(args)

-        dispatch_task = asyncio.create_task(dispatch_requests_worker(request_queue, args))
        statistic_benchmark_metric_task = asyncio.create_task(
            statistic_benchmark_metric_worker(benchmark_data_queue, args))
        send_request_tasks = await create_send_request_tasks()

-        expected_number_of_queries = await dispatch_task
-        await request_queue.join()
-        query_send_completed_event.set()
-
        await asyncio.gather(*send_request_tasks, return_exceptions=True)
        await benchmark_data_queue.join()
        data_process_completed_event.set()

        metrics, result_db_path = await statistic_benchmark_metric_task
-        summary_result(args, metrics,
-
-        await asyncio.sleep(0.250)
+        summary_result(args, metrics, result_db_path)

    await run_tasks()
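
Note: the refactor replaces the old request-queue/worker model with an async generator that yields requests plus one task per request, bounded by an asyncio.Semaphore sized to --parallel. A self-contained sketch of that pattern (not evalscope code; names and timings are illustrative):

import asyncio


async def get_requests(n):
    # Stand-in for the request generator: yields one request dict per query.
    for i in range(n):
        yield {'id': i}


async def send_request(semaphore, request, results):
    async with semaphore:          # at most `parallel` requests in flight
        await asyncio.sleep(0.01)  # stand-in for the HTTP call
        await results.put(request['id'])


async def main(parallel=4, number=10):
    semaphore = asyncio.Semaphore(parallel)
    results = asyncio.Queue()
    tasks = [
        asyncio.create_task(send_request(semaphore, request, results))
        async for request in get_requests(number)
    ]
    await asyncio.gather(*tasks)
    print([results.get_nowait() for _ in range(results.qsize())])


asyncio.run(main())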
evalscope/perf/plugin/api/openai_api.py
CHANGED

@@ -70,7 +70,7 @@ class OpenaiPlugin(ApiPluginBase):
     def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
         payload['model'] = param.model
         if param.max_tokens is not None:
-            payload['
+            payload['max_completion_tokens'] = param.max_tokens
         if param.min_tokens is not None:
             payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
@@ -94,9 +94,11 @@ class OpenaiPlugin(ApiPluginBase):
             payload['top_p'] = param.top_p
         if param.top_k is not None:
             payload['top_k'] = param.top_k
+        if param.extra_args is not None:
+            payload.update(param.extra_args)
         return payload

-    def parse_responses(self, responses, request: Any = None, **kwargs) ->
+    def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
        """Parser responses and return number of request and response tokens.
        Only one response for non-stream, multiple responses for stream.
        """
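
Note: a quick sketch of the resulting payload assembly, assuming the values below are arbitrary examples: the token limit is now sent under 'max_completion_tokens', and any --extra-args dict is merged in last.

payload = {'model': 'my-model', 'messages': [{'role': 'user', 'content': 'hello'}]}
max_tokens = 256
extra_args = {'repetition_penalty': 1.05}  # e.g. parsed from --extra-args

payload['max_completion_tokens'] = max_tokens  # key used after this change
payload.update(extra_args)                     # new: user-supplied extras override/extend the payload
print(payload)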
evalscope/perf/plugin/datasets/speed_benchmark.py
CHANGED

@@ -3,6 +3,9 @@ from typing import Dict, Iterator, List, Tuple
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()


 @register_dataset('speed_benchmark')
@@ -18,6 +21,14 @@ class SpeedBenchmarkDatasetPlugin(DatasetPluginBase):
     def __init__(self, query_parameters: Arguments):
         super().__init__(query_parameters)

+        url = self.query_parameters.url
+        if url.endswith('v1/chat/completions'):
+            logger.warning(
+                'The API URL is not set correctly for `speed_benchmark`. Using `v1/completions` instead of `v1/chat/completions` by system.'  # noqa
+            )
+            url = url.replace('v1/chat/completions', 'v1/completions')
+            self.query_parameters.url = url
+
     def build_messages(self) -> Iterator[List[Dict]]:
         for input_len in self.INPUT_LENGTH:
             for _ in range(self.REPEAT):
evalscope/perf/utils/db_util.py
CHANGED

@@ -194,12 +194,12 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     return results


-def summary_result(args: Arguments, metrics: BenchmarkMetrics,
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

     data = metrics.create_message()
-    data.update({'Expected number of requests':
+    data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
     write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))

     # Print summary in a table
evalscope/version.py
CHANGED