evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (157)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/arguments.py CHANGED
@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_WORK_DIR
+from evalscope.utils import BaseArgument
 
 
 @dataclass
-class Arguments:
+class Arguments(BaseArgument):
     # Model and API
     model: str  # Model name or path
     model_id: Optional[str] = None  # Model identifier
@@ -30,6 +31,7 @@ class Arguments:
     number: Union[int, List[int]] = 1000  # Number of requests to be made
     parallel: Union[int, List[int]] = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
+    sleep_interval: int = 5  # Sleep interval between performance runs, in seconds
 
     # Logging and debugging
     log_every_n_query: int = 10  # Log every N queries
@@ -48,6 +50,11 @@ class Arguments:
     prompt: Optional[str] = None  # The prompt text
     query_template: Optional[str] = None  # Template for the query
     apply_chat_template: Optional[bool] = None  # Whether to apply chat template
+    # random vl settings
+    image_width: int = 224  # Width of the image for random VL dataset
+    image_height: int = 224  # Height of the image for random VL dataset
+    image_format: str = 'RGB'  # Image format for random VL dataset
+    image_num: int = 1  # Number of images for random VL dataset
 
     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -69,15 +76,6 @@ class Arguments:
     top_k: Optional[int] = None  # Top-k sampling setting for the response
     extra_args: Optional[Dict[str, Any]] = None  # Extra arguments
 
-    @staticmethod
-    def from_args(args):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-        return Arguments(**args_dict)
-
     def __post_init__(self):
         # Set the default headers
         self.headers = self.headers or {}  # Default to empty dictionary
@@ -108,12 +106,6 @@ class Arguments:
             self.parallel
         ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
 
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return self.__dict__
-
 
 class ParseKVAction(argparse.Action):
 
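The removed from_args, __str__, and to_dict helpers appear to move onto the new shared BaseArgument base class (added in evalscope/utils/argument_utils.py) that Arguments now inherits from. A plausible sketch of that base class, inferred only from the methods dropped above; the real implementation may differ:

# Hypothetical reconstruction of BaseArgument, inferred from this diff.
import json
from dataclasses import dataclass
from typing import Any, Dict


@dataclass
class BaseArgument:

    @classmethod
    def from_args(cls, args):
        # Convert an argparse Namespace to kwargs, dropping None values
        args_dict = {k: v for k, v in vars(args).items() if v is not None}
        args_dict.pop('func', None)  # compat with subcommand CLI parsers
        return cls(**args_dict)

    def to_dict(self) -> Dict[str, Any]:
        return self.__dict__

    def __str__(self):
        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)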
@@ -156,6 +148,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1')  # noqa: E501
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
+    parser.add_argument(
+        '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501
 
     # Logging and debugging
     parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
@@ -172,6 +166,11 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
     parser.add_argument(
         '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt')  # noqa: E501
+    # random vl settings
+    parser.add_argument('--image-width', type=int, default=224, help='Width of the image for random VL dataset')
+    parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
+    parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
+    parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
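Together, these options make multi-run and vision-language load tests configurable in one place. A minimal sketch of a run exercising them; the run_perf_benchmark entry point and the 'random_vl' dataset name are assumptions not shown in this diff:

# Sketch only: drives the new sleep_interval and image_* options added above.
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark  # assumed entry point

args = Arguments(
    model='qwen-vl-plus',
    url='http://127.0.0.1:8000/v1/chat/completions',
    api='openai',
    dataset='random_vl',   # hypothetical name for the new random VL dataset plugin
    image_width=512,       # new in 0.17.x
    image_height=512,
    image_num=2,
    number=[100, 200],     # two runs...
    parallel=[10, 20],     # ...at two concurrency levels
    sleep_interval=10,     # new: pause 10 s between runs instead of the fixed 5 s
)
run_perf_benchmark(args)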
evalscope/perf/benchmark.py CHANGED
@@ -6,15 +6,18 @@ import sqlite3
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import AsyncGenerator, Dict, List, Tuple
-
-from evalscope.perf.arguments import Arguments
-from evalscope.perf.http_client import AioHttpClient, test_connection
-from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
-from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
-from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
-from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
+
 from evalscope.utils.logger import get_logger
+from .arguments import Arguments
+from .http_client import AioHttpClient, test_connection
+from .plugin import ApiRegistry, DatasetRegistry
+from .utils.benchmark_util import BenchmarkData, BenchmarkMetrics
+from .utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, load_prompt, summary_result
+from .utils.handler import add_signal_handlers, exception_handler
+
+if TYPE_CHECKING:
+    from .plugin import ApiPluginBase, DatasetPluginBase
 
 logger = get_logger()
 
@@ -22,28 +25,22 @@ data_process_completed_event = asyncio.Event()
 
 
 @exception_handler
-async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
-    query_generator_class = ApiRegistry(args.api)
-    query_generator = query_generator_class(args.tokenizer_path)
-
-    def load_prompt(prompt_path_or_text):
-        if prompt_path_or_text.startswith('@'):
-            with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
-                return file.read()
-        return prompt_path_or_text
-
-    async def generate_requests_from_prompt(messages):
-        request = query_generator.build_request(messages, args)
+async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGenerator[dict, None]:
+
+    async def generate_requests_from_prompt():
+        prompt = load_prompt(args.prompt)
+        messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
+        request = api_plugin.build_request(messages)
         for _ in range(args.number):
             yield request
 
     async def generate_requests_from_dataset():
-        message_generator_class = DatasetRegistry(args.dataset)
+        message_generator_class = DatasetRegistry.get_class(args.dataset)
         message_generator = message_generator_class(args)
 
         dataset_messages = []
         try:
-            for messages in message_generator:
+            for messages in message_generator.build_messages():
                 dataset_messages.append(messages)
         except StopIteration:
             pass
@@ -56,7 +53,7 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
 
         while count < args.number:
             messages = dataset_messages[dataset_index]
-            request = query_generator.build_request(messages, args)
+            request = api_plugin.build_request(messages)
             if request is not None:
                 yield request
                 count += 1
@@ -64,13 +61,11 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
             dataset_index = (dataset_index + 1) % len(dataset_messages)
 
     if args.prompt:
-        prompt = load_prompt(args.prompt)
-        messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
-        generator = generate_requests_from_prompt(messages)
+        generator = generate_requests_from_prompt()
     elif args.dataset:
         generator = generate_requests_from_dataset()
     else:
-        raise Exception('Either prompt or dataset is required!')
+        raise ValueError('Either prompt or dataset is required!')
 
     async for request in generator:
         yield request
@@ -85,9 +80,10 @@ async def send_request(
     request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
+    api_plugin: 'ApiPluginBase',
 ):
     async with semaphore:
-        client = AioHttpClient(args)
+        client = AioHttpClient(args, api_plugin)
         async with client:
             benchmark_data = BenchmarkData(request=request)
             benchmark_data.start_time = time.perf_counter()
@@ -95,7 +91,8 @@ async def send_request(
             try:
                 async for is_error, state_code, response_data in client.post(request):
                     if is_error or state_code != HTTPStatus.OK:
-                        logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
+                        error_msg = str(response_data) if response_data else 'Unknown error'
+                        logger.error(f'Request: {request} failed, state_code: {state_code}, data: {error_msg}')
                         benchmark_data.success = False
                         break
                     if response_data:
@@ -116,12 +113,9 @@ async def send_request(
 
 
 @exception_handler
-async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments):
+async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
     metrics = BenchmarkMetrics(concurrency=args.parallel)
 
-    api_plugin_class = ApiRegistry(args.api)
-    api_plugin = api_plugin_class(args.tokenizer_path)
-
     result_db_path = get_result_db_path(args)
 
     collected_benchmark_data = []
@@ -172,8 +166,8 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:
 
 
 @exception_handler
-async def connect_test(args: Arguments) -> bool:
-    if (not args.no_test_connection) and (not await test_connection(args)):
+async def connect_test(args: Arguments, api_plugin) -> bool:
+    if (not args.no_test_connection) and (not await test_connection(args, api_plugin)):
         raise TimeoutError('Test connection failed')
 
 
@@ -183,19 +177,24 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     loop = asyncio.get_running_loop()
     add_signal_handlers(loop)
 
+    # Create API plugin instance for request/response processing
+    api_plugin_class = ApiRegistry.get_class(args.api)
+    api_plugin = api_plugin_class(args)
+
     # init queue
     benchmark_data_queue = asyncio.Queue()
    # reset event
     data_process_completed_event.clear()
     # test connection
-    await connect_test(args)
+    await connect_test(args, api_plugin)
     # start statistic benchmark metric
-    statistic_benchmark_metric_task = asyncio.create_task(statistic_benchmark_metric(benchmark_data_queue, args))
+    statistic_benchmark_metric_task = asyncio.create_task(
+        statistic_benchmark_metric(benchmark_data_queue, args, api_plugin))
     # start send request
     semaphore = asyncio.Semaphore(args.parallel)
     send_request_tasks: List[asyncio.Task] = []
-    async for request in get_requests(args):
-        task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
+    async for request in get_requests(args, api_plugin):
+        task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, api_plugin))
         send_request_tasks.append(task)
 
     await asyncio.gather(*send_request_tasks, return_exceptions=True)
evalscope/perf/http_client.py CHANGED
@@ -1,13 +1,13 @@
 import aiohttp
 import asyncio
-import json
 import time
-from http import HTTPStatus
-from typing import AsyncGenerator, Dict, List, Tuple
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
-from evalscope.perf.arguments import Arguments
-from evalscope.perf.utils.local_server import ServerSentEvent
 from evalscope.utils.logger import get_logger
+from .arguments import Arguments
+
+if TYPE_CHECKING:
+    from .plugin.api.base import ApiPluginBase
 
 logger = get_logger()
 
@@ -17,95 +17,48 @@ class AioHttpClient:
 
     def __init__(
         self,
         args: Arguments,
+        api_plugin: 'ApiPluginBase',
     ):
         self.url = args.url
         self.headers = {'user-agent': 'modelscope_bench', **(args.headers or {})}
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
+        self.api_plugin = api_plugin
         self.client = aiohttp.ClientSession(
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             trace_configs=[self._create_trace_config()] if args.debug else [])
 
-    def _create_trace_config(self):
-        trace_config = aiohttp.TraceConfig()
-        trace_config.on_request_start.append(self.on_request_start)
-        trace_config.on_request_chunk_sent.append(self.on_request_chunk_sent)
-        trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
-        return trace_config
-
     async def __aenter__(self):
         pass
 
     async def __aexit__(self, exc_type, exc, tb):
         await self.client.close()
 
-    async def _handle_stream(self, response: aiohttp.ClientResponse):
-        is_error = False
-        async for line in response.content:
-            line = line.decode('utf8').rstrip('\n\r')
-            sse_msg = ServerSentEvent.decode(line)
-            if sse_msg:
-                logger.debug(f'Response recevied: {line}')
-                if sse_msg.event == 'error':
-                    is_error = True
-                if sse_msg.data:
-                    if sse_msg.data.startswith('[DONE]'):
-                        break
-                    yield is_error, response.status, sse_msg.data
-
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
-        response_status = response.status
-        response_content_type = response.content_type
-        content_type_json = 'application/json'
-        content_type_event_stream = 'text/event-stream'
-        is_success = response_status == HTTPStatus.OK
-
-        if is_success:
-            # Handle successful response with 'text/event-stream' content type
-            if content_type_event_stream in response_content_type:
-                async for is_error, response_status, content in self._handle_stream(response):
-                    yield (is_error, response_status, content)
-            # Handle successful response with 'application/json' content type
-            elif content_type_json in response_content_type:
-                content = await response.json()
-                if content.get('object') == 'error':
-                    yield (True, content.get('code'), content.get('message'))  # DashScope
-                else:
-                    yield (False, response_status, json.dumps(content, ensure_ascii=False))
-            # Handle other successful responses
-            else:
-                content = await response.read()
-                yield (False, response_status, content)
-        else:
-            # Handle error response with 'application/json' content type
-            if content_type_json in response_content_type:
-                error = await response.json()
-                yield (True, response_status, json.dumps(error, ensure_ascii=False))
-            # Handle error response with 'text/event-stream' content type
-            elif content_type_event_stream in response_content_type:
-                async for _, _, data in self._handle_stream(response):
-                    error = json.loads(data)
-                    yield (True, response_status, json.dumps(error, ensure_ascii=False))
-            # Handle other error responses
-            else:
-                msg = await response.read()
-                yield (True, response_status, msg.decode('utf-8'))
+    def _create_trace_config(self):
+        """Create trace configuration for debugging."""
+        trace_config = aiohttp.TraceConfig()
+        trace_config.on_request_start.append(self.on_request_start)
+        trace_config.on_request_chunk_sent.append(self.on_request_chunk_sent)
+        trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
+        return trace_config
 
     async def post(self, body):
-        headers = {'Content-Type': 'application/json', **self.headers}
+        """Send POST request and delegate response handling to API plugin.
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
         try:
-            data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
-            async with self.client.request('POST', url=self.url, data=data, headers=headers) as response:
-                async for rsp in self._handle_response(response):
-                    yield rsp
-        except asyncio.TimeoutError:
+            # Delegate the request processing to the API plugin
+            async for result in self.api_plugin.process_request(self.client, self.url, self.headers, body):
+                yield result
+        except asyncio.TimeoutError as e:
             logger.error(
-                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longger timeout.'  # noqa: E501
+                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longer timeout.'  # noqa: E501
             )
-            yield (True, None, 'Timeout')
+            yield (True, None, str(e))
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
-            yield (True, None, e)
+            yield (True, None, str(e))
 
     @staticmethod
     async def on_request_start(session, context, params: aiohttp.TraceRequestStartParams):
@@ -136,25 +89,16 @@ class AioHttpClient:
         logger.debug(f'Request received: <{method=}, {url=}, {truncated_chunk=}>')
 
 
-async def test_connection(args: Arguments) -> bool:
+async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
     is_error = True
     start_time = time.perf_counter()
 
     async def attempt_connection():
-        client = AioHttpClient(args)
+        client = AioHttpClient(args, api_plugin)
         async with client:
-            if args.apply_chat_template:
-                request = {
-                    'messages': [{
-                        'role': 'user',
-                        'content': 'hello'
-                    }],
-                    'model': args.model,
-                    'max_tokens': 10,
-                    'stream': args.stream
-                }
-            else:
-                request = {'prompt': 'hello', 'model': args.model, 'max_tokens': 10}
+            messages = [{'role': 'user', 'content': 'hello'}] if args.apply_chat_template else 'hello'
+            request = api_plugin.build_request(messages)
+
             async for is_error, state_code, response_data in client.post(request):
                 return is_error, state_code, response_data
 
evalscope/perf/main.py CHANGED
@@ -9,7 +9,7 @@ from argparse import Namespace
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
-from evalscope.utils.utils import seed_everything
+from evalscope.utils.model_utils import seed_everything
 from .arguments import Arguments, parse_args
 from .benchmark import benchmark
 from .utils.db_util import get_output_path
@@ -57,8 +57,8 @@ def run_multi_benchmark(args: Arguments, output_path: str = None):
         results.append(metrics_result)
         # Sleep between runs to avoid overwhelming the server
         if i < len(number_list) - 1:
-            logger.info('Sleeping for 5 seconds before the next run...')
-            time.sleep(5)
+            logger.info(f'Sleeping for {args.sleep_interval} seconds before the next run...')
+            time.sleep(args.sleep_interval)
     # Analyze results
     print_summary(results, args.model_id)
     return results
evalscope/perf/plugin/__init__.py CHANGED
@@ -1,2 +1,3 @@
-from evalscope.perf.plugin.api import *
-from evalscope.perf.plugin.datasets import *
+from .api import *
+from .datasets import *
+from .registry import ApiRegistry, DatasetRegistry
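With the registries re-exported from the plugin package, lookups now go through get_class rather than calling the registry object directly, as benchmark.py does above. A small sketch of that pattern; the registry keys 'openai' and 'openqa' are assumed from the defaults in arguments.py:

# Minimal sketch of the new lookup pattern; keys are assumptions.
from evalscope.perf.plugin import ApiRegistry, DatasetRegistry

api_plugin_class = ApiRegistry.get_class('openai')          # e.g. OpenaiPlugin
dataset_plugin_class = DatasetRegistry.get_class('openqa')  # e.g. the openqa dataset plugin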
evalscope/perf/plugin/api/__init__.py CHANGED
@@ -1,3 +1,4 @@
-from evalscope.perf.plugin.api.custom_api import CustomPlugin
-from evalscope.perf.plugin.api.dashscope_api import DashScopeApiPlugin
-from evalscope.perf.plugin.api.openai_api import OpenaiPlugin
+from .base import ApiPluginBase
+from .custom_api import CustomPlugin
+from .dashscope_api import DashScopeApiPlugin
+from .openai_api import OpenaiPlugin
evalscope/perf/plugin/api/base.py CHANGED
@@ -1,16 +1,18 @@
+import aiohttp
 from abc import abstractmethod
-from typing import Any, Dict, List, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
 
 
 class ApiPluginBase:
 
-    def __init__(self, model_path: str) -> None:
-        self.model_path = model_path
+    def __init__(self, param: Arguments) -> None:
+        self.param = param
+        self.model_path = param.tokenizer_path
 
     @abstractmethod
-    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
+    def build_request(self, messages: List[Dict], param: Arguments = None) -> Dict:
         """Build a api request body.
 
         Args:
@@ -39,6 +41,22 @@ class ApiPluginBase:
         """
         raise NotImplementedError
 
+    @abstractmethod
+    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Process the HTTP request and handle the response.
+
+        Args:
+            client_session: The aiohttp client session
+            url: The request URL
+            headers: The request headers
+            body: The request body
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
+        raise NotImplementedError
+
     @staticmethod
     def replace_values(input_json: Any, model: str, prompt: str):
         if isinstance(input_json, dict):
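The new process_request hook means a custom API plugin now owns both request construction and response handling, while AioHttpClient keeps timeouts and error logging. A minimal sketch of a conforming plugin; the class itself is hypothetical, and only the two abstract methods from this diff are implemented:

# Hypothetical plugin sketched against the ApiPluginBase contract shown above.
import aiohttp
from http import HTTPStatus
from typing import AsyncGenerator, Dict, Tuple

from evalscope.perf.arguments import Arguments
from evalscope.perf.plugin.api.base import ApiPluginBase


class SimpleJsonPlugin(ApiPluginBase):
    """Sends the request body as JSON and yields the raw response text."""

    def __init__(self, param: Arguments) -> None:
        super().__init__(param)  # keeps param and param.tokenizer_path as model_path

    def build_request(self, messages, param: Arguments = None) -> Dict:
        # messages comes from a dataset plugin: a list of chat messages or a plain string
        p = param or self.param
        return {'model': p.model, 'messages': messages, 'stream': p.stream}

    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
        # Non-streaming POST; yield a single (is_error, status_code, response_data) tuple
        request_headers = {'Content-Type': 'application/json', **headers}
        async with client_session.post(url, json=body, headers=request_headers) as response:
            text = await response.text()
            yield (response.status != HTTPStatus.OK, response.status, text)

AioHttpClient.post drives this coroutine, so parsing stays in the plugin while connection handling stays in the client.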