evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/metrics/llm_judge.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional
 
+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
 from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger
 
@@ -109,20 +110,31 @@ class LLMJudge:
             config=GenerateConfig(**self.generation_config),
         )
 
-    def judge(
+    def judge(
+        self,
+        prompt: str = '',
+        system_prompt: Optional[str] = None,
+        messages: Optional[List[ChatMessage]] = None
+    ) -> str:
         """
+        Generate a response from the LLM based on the provided prompt and context.
+        If messages is provided, it will be used as the input context.
+
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
+            messages (List[ChatMessage], optional): A list of chat messages to include in the evaluation
        Returns:
            str: The response from the LLM
        """
-
-
-
-
-
-        input_messages
+        # parse messages
+        if messages is not None:
+            input_messages = messages
+        else:
+            system_content = system_prompt or self.system_prompt
+            input_messages = [ChatMessageUser(content=prompt)]
+            if system_content:
+                input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
             # Send request using ServerModelAdapter
             response = self.model.generate(input_messages)
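A hedged sketch of calling the widened judge() signature shown above; the judge instance construction is assumed and not part of this diff.

from evalscope.api.messages import ChatMessageSystem, ChatMessageUser

def run_judge(judge):  # `judge` is an already-configured LLMJudge instance
    # 1.0.0-style call: plain prompt, optional system prompt
    verdict_a = judge.judge(prompt='Is the answer "4" correct for "2 + 2"?')

    # new in 1.0.2: pass a prebuilt message list, bypassing prompt/system_prompt
    msgs = [
        ChatMessageSystem(content='You are a strict grader.'),
        ChatMessageUser(content='Grade the answer "4" for "2 + 2 = ?".'),
    ]
    verdict_b = judge.judge(messages=msgs)
    return verdict_a, verdict_b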
evalscope/metrics/metric.py
CHANGED
@@ -6,11 +6,19 @@ from evalscope.api.registry import register_aggregation, register_metric
 from .metrics import mean
 
 
+def normalize_text(text: str) -> str:
+    """Normalize text by lowering case and stripping whitespace."""
+    return text.strip().lower()
+
+
 @register_metric(name='exact_match')
 class ExactMatch(Metric):
 
     def apply(self, predictions, references):
-        return [
+        return [
+            float(normalize_text(prediction) == normalize_text(reference))
+            for prediction, reference in zip(predictions, references)
+        ]
 
 
 @register_metric(name='acc')
@@ -202,6 +210,9 @@ class Mean(Aggregator):
 
     name = 'mean'
 
+    def agg_func(self, values: List[float]) -> float:
+        return mean(values)
+
     def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
         """Aggregate scores by computing the mean for each metric.
 
@@ -230,7 +241,7 @@ class Mean(Aggregator):
             if values:  # Only process non-empty value lists
                 aggregated_scores.append(
                     AggScore(
-                        score=
+                        score=self.agg_func(values),
                         metric_name=metric_name,
                         aggregation_name=self.name,
                         num=len(values),
@@ -241,6 +252,20 @@ class Mean(Aggregator):
         return aggregated_scores
 
 
+@register_aggregation(name='clipped_mean')
+class ClippedMean(Mean):
+
+    name = 'clipped_mean'
+
+    def __init__(self, clip_min: float = 0.0, clip_max: float = 1.0):
+        self.clip_min = clip_min
+        self.clip_max = clip_max
+
+    def agg_func(self, values: List[float]) -> float:
+        clipped_values = min(max(mean(values), self.clip_min), self.clip_max)
+        return clipped_values
+
+
 @register_aggregation(name='pass_at_k')
 class PassAtK(Aggregator):
 
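A minimal standalone sketch of the clipped_mean aggregation added above: the mean is computed first and then clamped into [clip_min, clip_max]; the sample values are illustrative only.

def clipped_mean(values, clip_min=0.0, clip_max=1.0):
    # mean first, then clamp into the configured range
    m = sum(values) / len(values)
    return min(max(m, clip_min), clip_max)

print(clipped_mean([0.5, 2.0, 3.5]))  # raw mean 2.0 is clamped to 1.0
print(clipped_mean([0.2, 0.4]))       # raw mean 0.3 already lies inside [0, 1]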
evalscope/models/image_edit_model.py
ADDED
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import importlib
+import time
+import torch
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ImageEditAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+        device_map = collect_model_arg('device_map')
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+        self.device = device_map or get_device()
+
+        self.pipeline_cls = collect_model_arg('pipeline_cls')
+        # default to DiffusionPipeline if not specified
+        if self.pipeline_cls is None:
+            if 'qwen' in model_name.lower():
+                self.pipeline_cls = 'QwenImageEditPipeline'
+            else:
+                logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                raise ValueError('Invalid pipeline class.')
+
+        model_name_or_path = model_path or model_name
+
+        # from modelscope import pipeline_cls
+        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+        logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+        self.model = module.from_pretrained(
+            model_name_or_path,
+            torch_dtype=self.torch_dtype,
+            **model_args,
+        )
+
+        self.model.to(self.device)
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.num_inference_steps is not None:
+            kwargs['num_inference_steps'] = config.num_inference_steps
+        kwargs.update(config.model_extra)
+
+        # assume the first text as prompt
+        content = input[0].content
+        assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+            'Invalid content types, expected (ContentText, ContentImage)'
+
+        prompt = content[0].text
+        input_image_base64 = content[1].image
+        input_image = base64_to_PIL(input_image_base64)
+        # get the first image as output
+        output = self.model(image=input_image, prompt=prompt, **kwargs)
+        image = output.images[0]
+
+        image_base64 = PIL_to_base64(image)
+
+        return ModelOutput(
+            model=self.model_name,
+            choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+            time=time.time(),
+        )
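A hedged sketch of constructing the new ImageEditAPI directly; the model name and argument values below are illustrative assumptions, and in normal use the class is obtained through the image_editing model API registered in model_apis.py (next section).

from evalscope.models.image_edit_model import ImageEditAPI

# model_path, precision/torch_dtype, device_map and pipeline_cls are the
# model_args collected by __init__; anything else is forwarded to from_pretrained.
api = ImageEditAPI(
    model_name='Qwen/Qwen-Image-Edit',  # assumed name; 'qwen' in the name selects QwenImageEditPipeline
    precision='bfloat16',               # mapped to torch.bfloat16 via DTYPE_MAP
    device_map='cuda',                  # if omitted, get_device() chooses the device
)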
evalscope/models/model_apis.py
CHANGED
@@ -1,6 +1,7 @@
 from evalscope.api.model import ModelAPI
 from evalscope.api.registry import register_model_api
 from evalscope.utils.deprecation_utils import deprecated
+from evalscope.utils.import_utils import check_import
 
 
 @register_model_api(name='mock_llm')
@@ -27,6 +28,8 @@ def server() -> type[ModelAPI]:
 
 @register_model_api(name='llm_ckpt')
 def llm_ckpt() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -35,6 +38,8 @@ def llm_ckpt() -> type[ModelAPI]:
 @register_model_api(name='checkpoint')
 @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
 def checkpoint() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -42,6 +47,23 @@ def checkpoint() -> type[ModelAPI]:
 
 @register_model_api(name='text2image')
 def text2image() -> type[ModelAPI]:
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='text2image')
+
     from .text2image_model import Text2ImageAPI
 
     return Text2ImageAPI
+
+
+@register_model_api(name='image_editing')
+def image_editing() -> type[ModelAPI]:
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='image_editing')
+
+    from .image_edit_model import ImageEditAPI
+
+    return ImageEditAPI
evalscope/models/openai_compatible.py
CHANGED
@@ -48,6 +48,9 @@ class OpenAICompatibleAPI(ModelAPI):
         self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
         assert self.base_url, f'Base URL for {model_name} not found'
 
+        # remove trailing slash from base_url
+        self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
         # create http client
         self.client = OpenAI(
             api_key=self.api_key,
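A small standalone sketch of the normalization added above, using only standard string methods; the example URL is illustrative, and str.removesuffix requires Python 3.9+.

base_url = 'https://api.example.com/v1/chat/completions/'
normalized = base_url.rstrip('/').removesuffix('/chat/completions')
print(normalized)  # https://api.example.com/v1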
evalscope/models/text2image_model.py
CHANGED
@@ -107,8 +107,8 @@ class Text2ImageAPI(ModelAPI):
             kwargs['num_inference_steps'] = config.num_inference_steps
         if config.guidance_scale is not None:
             kwargs['guidance_scale'] = config.guidance_scale
-
-
+        # update with extra model parameters
+        kwargs.update(config.model_extra)
 
         # assume the first text as prompt
         prompt = input[0].text
evalscope/models/utils/openai.py
CHANGED
@@ -104,10 +104,9 @@ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartPa
         )
     elif content.type == 'audio':
         audio_data_uri = file_as_data_uri(content.audio)
-        audio_data = audio_data_uri.split('base64,')[1]
 
         return ChatCompletionContentPartInputAudioParam(
-            type='input_audio', input_audio=dict(data=
+            type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
         )
 
     else:
@@ -209,7 +208,7 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
     return params
 
 
-def openai_assistant_content(message: ChatMessageAssistant) -> str:
+def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
     # In agent bridge scenarios, we could encounter concepts such as reasoning and
     # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
     # choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +219,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     else:
         content = ''
         for c in message.content:
-            if c.type == 'reasoning':
+            if c.type == 'reasoning' and include_reasoning:
                 attribs = ''
                 if c.signature is not None:
                     attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +238,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     return content
 
 
-def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
+def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
     oai_choices: List[Choice] = []
 
     for index, choice in enumerate(choices):
-
+        # Handle content
+        content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
+
+        # Handle tool calls
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
evalscope/perf/arguments.py
CHANGED
@@ -55,6 +55,7 @@ class Arguments(BaseArgument):
     image_height: int = 224  # Height of the image for random VL dataset
     image_format: str = 'RGB'  # Image format for random VL dataset
     image_num: int = 1  # Number of images for random VL dataset
+    image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation
 
     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -171,6 +172,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
     parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
     parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+    parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
evalscope/perf/benchmark.py
CHANGED
@@ -42,6 +42,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
     try:
         for messages in message_generator.build_messages():
             dataset_messages.append(messages)
+            if len(dataset_messages) >= args.number:
+                break
     except StopIteration:
         pass
 
evalscope/perf/plugin/api/base.py
CHANGED
@@ -43,7 +43,7 @@ class ApiPluginBase:
 
     @abstractmethod
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -53,7 +53,7 @@ class ApiPluginBase:
            body: The request body
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         raise NotImplementedError
 
evalscope/perf/plugin/api/default_api.py
CHANGED
@@ -18,7 +18,7 @@ class DefaultApiPlugin(ApiPluginBase):
         super().__init__(param)
 
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -28,7 +28,7 @@ class DefaultApiPlugin(ApiPluginBase):
            body: The request body
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         try:
             headers = {'Content-Type': 'application/json', **headers}
@@ -40,7 +40,7 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in process_request: {e}')
             yield (True, None, str(e))
 
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int,
+    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Handle streaming response from server-sent events.
 
         Args:
@@ -71,14 +71,14 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in _handle_stream: {e}')
             yield True, response.status, str(e)
 
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int,
+    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Handle the HTTP response based on content type and status.
 
         Args:
            response: The aiohttp response object
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         response_status = response.status
         response_content_type = response.content_type
@@ -94,7 +94,7 @@ class DefaultApiPlugin(ApiPluginBase):
            # Handle successful response with 'application/json' content type
            elif content_type_json in response_content_type:
                content = await response.json()
-               yield (False, response_status,
+               yield (False, response_status, content)
            # Handle other successful responses
            else:
                content = await response.read()
@@ -102,4 +102,4 @@ class DefaultApiPlugin(ApiPluginBase):
        else:
            # error is always in JSON format
            error = await response.json()
-           yield (True, response_status,
+           yield (True, response_status, error)
evalscope/perf/plugin/api/openai_api.py
CHANGED
@@ -1,10 +1,13 @@
 import json
+import math
 import os
+from collections import defaultdict
 from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.io_utils import base64_to_PIL
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -113,7 +116,7 @@ class OpenaiPlugin(DefaultApiPlugin):
             return input_tokens, output_tokens
 
         # no usage information in the response, parse the response to get the tokens
-        delta_contents =
+        delta_contents = defaultdict(list)
         for response in responses:
             if 'object' in response:
                 self.__process_response_object(response, delta_contents)
@@ -123,41 +126,46 @@ class OpenaiPlugin(DefaultApiPlugin):
         input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
         return input_tokens, output_tokens
 
-    def __process_response_object(self,
-        if
-
+    def __process_response_object(self, response, delta_contents):
+        if not response.get('choices'):
+            return
+        if response['object'] == 'chat.completion':
+            for choice in response['choices']:
                 delta_contents[choice['index']] = [choice['message']['content']]
-        elif
-            for choice in
-
-
-
+        elif response['object'] == 'text_completion':
+            for choice in response['choices']:
+                if 'text' in choice and 'index' in choice:
+                    delta_contents[choice['index']].append(choice['text'])
+        elif response['object'] == 'chat.completion.chunk':
+            for choice in response['choices']:
                 if 'delta' in choice and 'index' in choice:
                     delta = choice['delta']
                     idx = choice['index']
                     if 'content' in delta:
-
-                        delta_contents.setdefault(idx, []).append(delta_content)
+                        delta_contents[idx].append(delta['content'])
 
-    def __process_no_object(self,
+    def __process_no_object(self, response, delta_contents):
         # assume the response is a single choice
-
+        if not response.get('choices'):
+            return
+        for choice in response['choices']:
             if 'delta' in choice:
                 delta = choice['delta']
                 idx = choice['index']
                 if 'content' in delta:
-
-                    delta_contents.setdefault(idx, []).append(delta_content)
+                    delta_contents[idx].append(delta['content'])
             else:
                 delta_contents[choice['index']] = [choice['message']['content']]
 
-    def __calculate_tokens_from_content(self, request,
+    def __calculate_tokens_from_content(self, request, content):
         input_tokens = output_tokens = 0
         if self.tokenizer is not None:
-
+            # Calculate input tokens
+            input_tokens += self._count_input_tokens(request)
+            for idx, choice_contents in content.items():
                 full_response_content = ''.join(choice_contents)
-
-                output_tokens +=
+                # Calculate output tokens
+                output_tokens += self._count_output_tokens(full_response_content)
         else:
             raise ValueError(
                 'Error: Unable to retrieve usage information\n\n'
@@ -171,3 +179,59 @@ class OpenaiPlugin(DefaultApiPlugin):
                 'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
             )
         return input_tokens, output_tokens
+
+    def _count_input_tokens(self, request: Dict) -> int:
+        """Count the number of input tokens in the request.
+
+        This method handles different types of requests and calculates tokens for:
+        - Text content in messages or prompts
+        - Images in multimodal messages (converted to patch tokens)
+
+        Args:
+            request (Dict): The request dictionary containing either 'messages' for chat
+                completion or 'prompt' for text completion.
+
+        Returns:
+            int: The total number of input tokens including text and image tokens.
+        """
+        input_tokens = 0
+        if 'messages' in request:
+            input_content = self.tokenizer.apply_chat_template(
+                request['messages'], tokenize=True, add_generation_prompt=True
+            )
+            input_tokens += len(input_content)
+            # handle image tokens if any
+            for message in request['messages']:
+                content = message.get('content', '')
+                if isinstance(content, str):
+                    continue
+                for cont in content:
+                    if cont['type'] == 'image_url':
+                        try:
+                            # assuming image_url is base64 string
+                            image_base64 = cont['image_url']['url']
+                            image = base64_to_PIL(image_base64)
+                            # Use math.ceil for more accurate token count when image dimensions
+                            # aren't perfectly divisible by patch size
+                            n_patches = (
+                                math.ceil(image.height / self.param.image_patch_size)
+                                * math.ceil(image.width / self.param.image_patch_size)
+                            )
+                            input_tokens += n_patches
+                        except Exception as e:
+                            logger.warning(f'Failed to process image for token counting: {e}')
+                            # Continue processing other content without failing
+        elif 'prompt' in request:
+            input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+        return input_tokens
+
+    def _count_output_tokens(self, response: str) -> int:
+        """Count the number of output tokens in the response. Only string response is supported.
+
+        Args:
+            response (str): The API response text.
+
+        Returns:
+            int: The number of output tokens.
+        """
+        return len(self.tokenizer.encode(response, add_special_tokens=False))
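The image token estimate added in _count_input_tokens above reduces to a ceil-based patch count; a minimal standalone sketch with illustrative dimensions:

import math

def image_patch_tokens(height: int, width: int, patch_size: int = 28) -> int:
    # one token per patch, rounding partially covered rows and columns up
    return math.ceil(height / patch_size) * math.ceil(width / patch_size)

print(image_patch_tokens(224, 224))  # 8 * 8 = 64
print(image_patch_tokens(230, 224))  # 9 * 8 = 72; ceil counts the partial row of patches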
evalscope/perf/plugin/datasets/flickr8k.py
CHANGED
@@ -22,7 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
         for item in dataset:
             pil_image = item['jpg']
             text = item['txt']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
             yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py
CHANGED
@@ -22,7 +22,7 @@ class KontextDatasetPlugin(DatasetPluginBase):
         for item in dataset:
             pil_image = item['image']
             text = item['instruction']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
             yield [message]
evalscope/perf/plugin/datasets/random_vl_dataset.py
CHANGED
@@ -31,7 +31,7 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
         # Generate random images based on image_num
         images_b64 = []
         for _ in range(self.image_num):
-            images_b64.append(
+            images_b64.append(self._generate_random_image_b64())
 
         message = self.create_message(text=prompt, image_urls=images_b64)
         yield [message]
@@ -77,4 +77,4 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
             draw.line(coords, fill=shape_color, width=random.randint(1, 5))
 
         # Convert to base64
-        return PIL_to_base64(image, format='PNG')
+        return PIL_to_base64(image, format='PNG', add_header=True)
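The dataset plugins above now call PIL_to_base64 with add_header=True. A hedged sketch of what that implies for the return value; the exact data-URI prefix is an assumption, not shown in this diff:

from PIL import Image
from evalscope.utils.io_utils import PIL_to_base64

img = Image.new('RGB', (64, 64), color='white')
b64 = PIL_to_base64(img, format='PNG', add_header=True)
# expected to resemble 'data:image/png;base64,iVBORw0K...' rather than the bare payload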