evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +35 -0
- evalscope/api/benchmark/meta.py +6 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/state.py +12 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +47 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +0 -1
- evalscope/api/model/generate_config.py +1 -3
- evalscope/api/model/model.py +4 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +2 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
- evalscope/benchmarks/bfcl/generation.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +72 -13
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +6 -4
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +20 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +7 -4
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/utils/benchmark_util.py +8 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/generator.py +8 -87
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +42 -1
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
- tests/benchmark/test_eval.py +30 -31
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/config.py
CHANGED
@@ -6,7 +6,7 @@ from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from evalscope.api.model import GenerateConfig
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
 from evalscope.constants import (
     DEFAULT_DATASET_CACHE_DIR,
     DEFAULT_WORK_DIR,
@@ -15,7 +15,6 @@ from evalscope.constants import (
     HubType,
     JudgeStrategy,
     ModelTask,
-    OutputType,
 )
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
 from evalscope.utils.deprecation_utils import deprecated_warning
@@ -28,51 +27,102 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Optional[str] = None
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
     model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
     model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""
 
     # Template-related arguments
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""
 
     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
-
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""
 
     # Generation configuration arguments
     generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
+    """Additional evaluation configuration parameters."""
+
     limit: Optional[Union[int, float]] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
     eval_batch_size: int = 1
+    """Batch size for evaluation processing."""
 
     # Cache and working directory arguments
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
     rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
+    """Working directory for storing evaluation results and temporary files."""
 
     # Debug and runtime mode arguments
     ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-
-
-
-
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
     judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
     judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
     analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
 
     def __post_init__(self):
         self.__init_model_and_id()
@@ -88,14 +138,15 @@ class TaskConfig(BaseArgument):
         if self.model is None:
             self.model = self.model_task
             self.eval_type = EvalType.MOCK_LLM
-        else:
-            if self.model_task == ModelTask.IMAGE_GENERATION:
-                self.eval_type = EvalType.TEXT2IMAGE
 
         # Set model_id if not provided
         if not self.model_id:
-            if self.model:
+            if isinstance(self.model, str):
                 self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
                 self.model_id = 'dummy_model'
 
@@ -113,6 +164,11 @@ class TaskConfig(BaseArgument):
                 'num_inference_steps': 50,
                 'guidance_scale': 9.0,
             }
+            if self.eval_batch_size != 1:
+                logger.warning(
+                    'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                )
+                self.eval_batch_size = 1
         elif self.model_task == ModelTask.TEXT_GENERATION:
             if self.eval_type == EvalType.CHECKPOINT:
                 self.generation_config = {
@@ -185,6 +241,9 @@ class TaskConfig(BaseArgument):
         result = copy.deepcopy(self.__dict__)
         del result['api_key']  # Do not expose api_key in the config
 
+        if isinstance(self.model, (Model, ModelAPI)):
+            result['model'] = self.model.__class__.__name__
+
         if isinstance(self.generation_config, GenerateConfig):
             result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result
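With `model` now typed as `Union[str, Model, ModelAPI]`, a task can be built around an already-constructed model object, and `model_id` is derived from that object rather than from a path. A minimal sketch of the two call styles, based only on the fields shown above (model and dataset names are placeholders):

    from evalscope.config import TaskConfig
    from evalscope.api.model import ModelAPI

    # string path, as before: model_id falls back to the basename of the path
    cfg = TaskConfig(model='Qwen/Qwen2.5-7B-Instruct', datasets=['gsm8k'])

    # preloaded API object: model_id is taken from ModelAPI.model_name
    # (for a Model instance, Model.name is used instead)
    # my_api = SomeModelAPISubclass(model_name='my-model')  # hypothetical subclass
    # cfg = TaskConfig(model=my_api, datasets=['gsm8k'])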
evalscope/constants.py
CHANGED
@@ -70,6 +70,7 @@ class EvalType:
     CHECKPOINT = 'llm_ckpt'  # native model checkpoint
     SERVICE = 'openai_api'  # model service
     TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service
 
 
 class OutputType:
@@ -127,3 +128,10 @@ class Tags:
     RETRIEVAL = 'Retrieval'
     FUNCTION_CALLING = 'FunctionCalling'
     TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'
evalscope/evaluator/evaluator.py
CHANGED
@@ -96,7 +96,9 @@ class DefaultEvaluator(Evaluator):
 
         # Process each subset (e.g., test, validation) independently
         for subset, dataset in dataset_dict.items():
-
+            if len(dataset) == 0:
+                logger.info(f'No samples found in subset: {subset}, skipping.')
+                continue
             subset_score = self.evaluate_subset(subset, dataset)
             agg_score_dict[subset] = subset_score
 
@@ -181,7 +183,7 @@ class DefaultEvaluator(Evaluator):
                 model_result = self.cache_manager.save_prediction_cache(
                     subset, task_state, self.benchmark.save_metadata
                 )
-                logger.debug(f'Model result: \n{model_result.
+                logger.debug(f'Model result: \n{model_result.pretty_print()}')
 
             except Exception as exc:
                 logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}')
@@ -261,10 +263,10 @@ class DefaultEvaluator(Evaluator):
                     sample_score=sample_score,
                     save_metadata=self.benchmark.save_metadata
                 )
-                logger.debug(f'Review result: \n{review_result.
+                logger.debug(f'Review result: \n{review_result.pretty_print()}')
 
             except Exception as exc:
-                logger.error(f'Error when review sample {task_state.sample_id}: {exc}')
+                logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}')
                 if self.task_config.ignore_errors:
                     logger.warning('Error ignored, continuing with next sample.')
                 else:
evalscope/metrics/llm_judge.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional
 
+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
 from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger
 
@@ -109,20 +110,31 @@ class LLMJudge:
             config=GenerateConfig(**self.generation_config),
         )
 
-    def judge(
+    def judge(
+        self,
+        prompt: str = '',
+        system_prompt: Optional[str] = None,
+        messages: Optional[List[ChatMessage]] = None
+    ) -> str:
         """
+        Generate a response from the LLM based on the provided prompt and context.
+        If messages is provided, it will be used as the input context.
+
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
+            messages (List[ChatMessage], optional): A list of chat messages to include in the evaluation
         Returns:
            str: The response from the LLM
         """
-
-
-
-
-
-        input_messages
+        # parse messages
+        if messages is not None:
+            input_messages = messages
+        else:
+            system_content = system_prompt or self.system_prompt
+            input_messages = [ChatMessageUser(content=prompt)]
+            if system_content:
+                input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
            # Send request using ServerModelAdapter
            response = self.model.generate(input_messages)
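The new `messages` parameter lets callers skip the internal prompt/system_prompt assembly and hand the judge model a pre-built conversation. A short sketch of both call paths, assuming an `LLMJudge` instance is already configured (constructor arguments omitted; the grading text is a placeholder):

    from evalscope.api.messages import ChatMessageSystem, ChatMessageUser

    # judge = LLMJudge(...)  # configured elsewhere

    # 1) prompt + optional system prompt, assembled internally as before
    # reply = judge.judge(prompt='Score this answer from 1 to 5: ...',
    #                     system_prompt='You are a strict grader.')

    # 2) pre-built message list, passed through to the judge model unchanged
    # reply = judge.judge(messages=[
    #     ChatMessageSystem(content='You are a strict grader.'),
    #     ChatMessageUser(content='Score this answer from 1 to 5: ...'),
    # ])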
evalscope/models/image_edit_model.py
ADDED
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import importlib
+import time
+import torch
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ImageEditAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+        device_map = collect_model_arg('device_map')
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+        self.device = device_map or get_device()
+
+        self.pipeline_cls = collect_model_arg('pipeline_cls')
+        # default to DiffusionPipeline if not specified
+        if self.pipeline_cls is None:
+            if 'qwen' in model_name.lower():
+                self.pipeline_cls = 'QwenImageEditPipeline'
+            else:
+                logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                raise ValueError('Invalid pipeline class.')
+
+        model_name_or_path = model_path or model_name
+
+        # from modelscope import pipeline_cls
+        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+        logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+        self.model = module.from_pretrained(
+            model_name_or_path,
+            torch_dtype=self.torch_dtype,
+            **model_args,
+        )
+
+        self.model.to(self.device)
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.num_inference_steps is not None:
+            kwargs['num_inference_steps'] = config.num_inference_steps
+        kwargs.update(config.model_extra)
+
+        # assume the first text as prompt
+        content = input[0].content
+        assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+            'Invalid content types, expected (ContentText, ContentImage)'
+
+        prompt = content[0].text
+        input_image_base64 = content[1].image
+        input_image = base64_to_PIL(input_image_base64)
+        # get the first image as output
+        output = self.model(image=input_image, prompt=prompt, **kwargs)
+        image = output.images[0]
+
+        image_base64 = PIL_to_base64(image)
+
+        return ModelOutput(
+            model=self.model_name,
+            choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+            time=time.time(),
+        )
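For reference, the `model_args` keys that `ImageEditAPI.__init__` consumes before forwarding the remainder to `from_pretrained` are `model_path`, `precision`/`torch_dtype`, `device_map`, and `pipeline_cls`. A hedged configuration sketch (the values are illustrative only):

    model_args = {
        'model_path': '/local/path/to/Qwen-Image-Edit',  # optional; model_name is used if absent
        'precision': 'bfloat16',                         # mapped through DTYPE_MAP to a torch dtype
        'device_map': 'cuda:0',                          # defaults to get_device() when omitted
        'pipeline_cls': 'QwenImageEditPipeline',         # auto-selected when 'qwen' is in the model name
    }

Any remaining keys are passed straight through to `<pipeline_cls>.from_pretrained(...)`.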
evalscope/models/model_apis.py
CHANGED
@@ -1,6 +1,7 @@
 from evalscope.api.model import ModelAPI
 from evalscope.api.registry import register_model_api
 from evalscope.utils.deprecation_utils import deprecated
+from evalscope.utils.import_utils import check_import
 
 
 @register_model_api(name='mock_llm')
@@ -27,6 +28,8 @@ def server() -> type[ModelAPI]:
 
 @register_model_api(name='llm_ckpt')
 def llm_ckpt() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True)
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -35,6 +38,8 @@ def llm_ckpt() -> type[ModelAPI]:
 @register_model_api(name='checkpoint')
 @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
 def checkpoint() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True)
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -42,6 +47,21 @@ def checkpoint() -> type[ModelAPI]:
 
 @register_model_api(name='text2image')
 def text2image() -> type[ModelAPI]:
+    check_import('torch', package='evalscope[aigc]', raise_error=True)
+    check_import('torchvision', package='evalscope[aigc]', raise_error=True)
+    check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+
     from .text2image_model import Text2ImageAPI
 
     return Text2ImageAPI
+
+
+@register_model_api(name='image_editing')
+def image_editing() -> type[ModelAPI]:
+    check_import('torch', package='evalscope[aigc]', raise_error=True)
+    check_import('torchvision', package='evalscope[aigc]', raise_error=True)
+    check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+
+    from .image_edit_model import ImageEditAPI
+
+    return ImageEditAPI
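The two `check_import` call styles used in this release suggest its contract: with `raise_error=True` it fails fast and points at the package to install, otherwise it returns a boolean. A minimal sketch of that reading (the behavioural details are inferred from usage, not confirmed against the helper's source):

    from evalscope.utils.import_utils import check_import

    # hard requirement: raise with an install hint, e.g. pip install "evalscope[aigc]"
    check_import('diffusers', package='evalscope[aigc]', raise_error=True)

    # soft requirement: branch on availability without raising
    if check_import('torch'):
        import torch  # safe to run torch-only code paths here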
evalscope/models/openai_compatible.py
CHANGED
@@ -48,6 +48,9 @@ class OpenAICompatibleAPI(ModelAPI):
         self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
         assert self.base_url, f'Base URL for {model_name} not found'
 
+        # remove trailing slash from base_url
+        self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
         # create http client
         self.client = OpenAI(
             api_key=self.api_key,
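The new normalization makes the client tolerant of both the bare API root and a fully qualified chat-completions URL. Worked examples of the exact expression added above (the host and port are placeholders):

    # 'http://127.0.0.1:8000/v1'                   -> 'http://127.0.0.1:8000/v1'
    # 'http://127.0.0.1:8000/v1/'                  -> 'http://127.0.0.1:8000/v1'
    # 'http://127.0.0.1:8000/v1/chat/completions'  -> 'http://127.0.0.1:8000/v1'
    base_url = 'http://127.0.0.1:8000/v1/chat/completions'
    base_url = base_url.rstrip('/').removesuffix('/chat/completions')

Because `rstrip('/')` runs first, even 'http://127.0.0.1:8000/v1/chat/completions/' normalizes to the same '/v1' root.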
evalscope/models/text2image_model.py
CHANGED
@@ -107,8 +107,8 @@ class Text2ImageAPI(ModelAPI):
             kwargs['num_inference_steps'] = config.num_inference_steps
         if config.guidance_scale is not None:
             kwargs['guidance_scale'] = config.guidance_scale
-
-
+        # update with extra model parameters
+        kwargs.update(config.model_extra)
 
         # assume the first text as prompt
         prompt = input[0].text
evalscope/models/utils/openai.py
CHANGED
@@ -209,7 +209,7 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
     return params
 
 
-def openai_assistant_content(message: ChatMessageAssistant) -> str:
+def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
     # In agent bridge scenarios, we could encounter concepts such as reasoning and
     # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
     # choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +220,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     else:
         content = ''
         for c in message.content:
-            if c.type == 'reasoning':
+            if c.type == 'reasoning' and include_reasoning:
                 attribs = ''
                 if c.signature is not None:
                     attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +239,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     return content
 
 
-def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
+def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
     oai_choices: List[Choice] = []
 
     for index, choice in enumerate(choices):
-
+        # Handle content
+        content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
+
+        # Handle tool calls
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
evalscope/perf/benchmark.py
CHANGED
@@ -42,6 +42,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
     try:
         for messages in message_generator.build_messages():
             dataset_messages.append(messages)
+            if len(dataset_messages) >= args.number:
+                break
     except StopIteration:
         pass
 
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -1,8 +1,8 @@
 import time
-import torch
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple
 
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -44,10 +44,13 @@ class BenchmarkData:
         api_plugin.parse_responses(self.response_messages, request=self.request)
 
     def update_gpu_usage(self):
-
-
-
-
+        if check_import('torch'):
+
+            import torch
+            total_memory = 0
+            for i in range(torch.cuda.device_count()):
+                total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
+            self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
 
 
 class Metrics:
evalscope/perf/utils/local_server.py
CHANGED
@@ -9,6 +9,7 @@ from sse_starlette.sse import EventSourceResponse
 
 from evalscope.perf.arguments import Arguments
 from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -101,6 +102,8 @@ def create_app(model, attn_implementation=None) -> FastAPI:
 def start_app(args: Arguments):
     logger.info('Starting local server, please wait...')
     if args.api == 'local':
+        check_import('torch', 'torch', raise_error=True)
+
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
 