evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py

@@ -0,0 +1,64 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = ['default']
+
+OPEN_PROMPT = (
+    'Read the picture and solve the following problem step by step.'
+    'The last line of your response should be of the form'
+    ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.\n\n'
+    '{question}\n\n'
+    'Remember to put your answer on its own line at the end in the form'
+    ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem,'
+    ' and you do not need to use a \\boxed command.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='real_world_qa',
+        pretty_name='RealWorldQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'RealWorldQA is a benchmark designed to evaluate the real-world spatial understanding capabilities of multimodal AI models, contributed by XAI. It assesses how well these models comprehend physical environments. The benchmark consists of 700+ images, each accompanied by a question and a verifiable answer. These images are drawn from real-world scenarios, including those captured from vehicles. The goal is to advance AI models\' understanding of our physical world.',  # noqa: E501
+        dataset_id='lmms-lab/RealWorldQA',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class RealWorldQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        content_list: list[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='webp', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            metadata={'image_path': record['image_path']}
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''
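As a quick illustration of the `ANSWER:` extraction rule used by `extract_answer` above (a minimal sketch; only the regex comes from the adapter, the sample prediction text is made up):

```python
import re

# Hypothetical model output for illustration.
prediction = 'The crosswalk sign is lit, so pedestrians may cross.\nANSWER: yes'

match = re.search(r'ANSWER:\s*(.*)', prediction)
answer = match.group(1).strip() if match else ''
print(answer)  # -> 'yes'
```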
evalscope/benchmarks/tau_bench/generation.py

@@ -45,7 +45,7 @@ def _patch_agent_solve(model: Model):
         input=[dict_to_chat_message(msg) for msg in messages],
         tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
     )
-    oai_res = openai_chat_choices(res.choices)
+    oai_res = openai_chat_choices(res.choices, include_reasoning=False)

     next_message = oai_res[0].message.model_dump(exclude_none=True)

evalscope/benchmarks/tau_bench/tau_bench_adapter.py

@@ -13,6 +13,7 @@ from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils import get_logger
 from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import

 logger = get_logger()

@@ -35,8 +36,8 @@ logger = get_logger()
         'api_key': 'EMPTY',
         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
         'generation_config': {
-            'temperature': 0.
-            '
+            'temperature': 0.0,
+            'max_tokens': 4096,
         }
     }
 )
@@ -46,22 +47,18 @@ class TauBenchAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-
-
-
-
-
+        check_import(
+            'tau_bench',
+            package='git+https://github.com/sierra-research/tau-bench',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )

         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')
         self.api_key = self.extra_params.get('api_key', 'EMPTY')
         self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
-        self.generation_config = self.extra_params.get(
-            'generation_config', {
-                'temperature': 0.7,
-                'max_new_tokens': 1024
-            }
-        )
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})

         self._patch_env_completion()

@@ -84,10 +81,10 @@ class TauBenchAdapter(DefaultDataAdapter):

         res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])

-        message = res.
+        message = {'role': 'assistant', 'content': res.completion}
         self.messages.append(message)
         self.total_cost = 0
-        return
+        return res.completion

         # get the current instance of TauBenchAdapter
         adapter_instance = self
@@ -114,7 +111,11 @@
         })
         # load dataset
         dataset = DictDataLoader(
-            dict_list=tasks,
+            dict_list=tasks,
+            sample_fields=self.record_to_sample,
+            limit=self.limit,
+            repeats=self.repeats,
+            shuffle=self.shuffle,
         ).load()

         data_dict[env_name] = dataset
@@ -145,15 +146,15 @@

         try:
             # Parse the prediction to get the reward
-
-            reward =
+            task_result = task_state.metadata['task_result']
+            reward = task_result.get('reward', 0.0)

             score.value = {
                 'Pass^1': float(reward),
             }
             score.explanation = f'Task completed with reward: {reward}'
             score.metadata = {
-                'task_result':
+                'task_result': task_result,
                 'env_name': task_state.metadata.get('env_name', 'unknown'),
                 'task_index': task_state.metadata.get('task_index', -1)
             }
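For reference, a sketch of the `extra_params` keys that `TauBenchAdapter.__init__` reads above. The key names and defaults are taken from the hunk; routing them through a `dataset_args`/`extra_params` mapping is an assumption about how evalscope wires adapter parameters:

```python
# Sketch only: mirrors the self.extra_params.get(...) defaults shown above.
tau_bench_extra_params = {
    'user_model': 'qwen-plus',
    'api_key': 'EMPTY',
    'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
    'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
}

# Assumed configuration layout; adjust to the actual evalscope task config.
dataset_args = {'tau_bench': {'extra_params': tau_bench_extra_params}}
```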
File without changes
evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py

@@ -16,8 +16,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='evalmuse',
+        pretty_name='EvalMuse',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='EvalMuse Text-to-Image Benchmark'
+        description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+        'and semantic alignment of finely generated images',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['EvalMuse'],
         metric_list=['FGA_BLIP2Score'],
evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py

@@ -4,7 +4,6 @@ import os
 from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.messages import ChatMessageUser
-from evalscope.api.metric.scorer import Score
 from evalscope.api.registry import get_metric, register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -15,8 +14,9 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='genai_bench',
+        pretty_name='GenAI-Bench',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='GenAI-Bench Text-to-Image Benchmark',
+        description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['GenAI-Bench-1600'],
         metric_list=['VQAScore'],
evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py

@@ -16,7 +16,7 @@ logger = get_logger()
         name='general_t2i',
         dataset_id='general_t2i',
         description='General Text-to-Image Benchmark',
-        tags=[Tags.TEXT_TO_IMAGE],
+        tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
         subset_list=['default'],
         metric_list=['PickScore'],
         few_shot_num=0,
evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py

@@ -14,8 +14,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='hpdv2',
+        pretty_name='HPD-v2',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='HPDv2 Text-to-Image Benchmark'
+        description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+        'trained on the Human Preference Dataset (HPD v2)',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['HPDv2'],
         metric_list=['HPSv2.1Score'],
@@ -41,7 +43,10 @@ class HPDv2Adapter(Text2ImageAdapter):
         return Sample(
             input=[ChatMessageUser(content=record['prompt'])],
             metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
                 'category': record.get('tags', {}).get('category', ''),
-                'tags': record.get('tags', {})
+                'tags': record.get('tags', {}),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
             }
         )
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py

@@ -37,6 +37,7 @@ TRUTHFUL_QA_PROMPT = (
         dataset_id='evalscope/truthful_qa',
         metric_list=['multi_choice_acc'],
         subset_list=['multiple_choice'],
+        shuffle_choices=True,
         few_shot_num=0,
         train_split=None,
         eval_split='validation',
@@ -55,8 +56,6 @@ class TruthfulQaAdapter(MultiChoiceAdapter):

         super().__init__(**kwargs)

-        self.shuffle_choices = True
-
         self.multiple_correct = self.extra_params.get('multiple_correct', False)
         if self.multiple_correct:
             self.prompt_template = MultipleChoiceTemplate.MULTIPLE_ANSWER
evalscope/cli/start_app.py
CHANGED

@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
         parser.set_defaults(func=subparser_func)

     def execute(self):
-
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )

         create_app(self.args)
evalscope/cli/start_perf.py
CHANGED

@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
         parser.set_defaults(func=subparser_func)

     def execute(self):
-
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )

         run_perf_benchmark(self.args)
evalscope/config.py
CHANGED

@@ -6,7 +6,7 @@ from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union

-from evalscope.api.model import GenerateConfig
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
 from evalscope.constants import (
     DEFAULT_DATASET_CACHE_DIR,
     DEFAULT_WORK_DIR,
@@ -15,10 +15,10 @@ from evalscope.constants import (
     HubType,
     JudgeStrategy,
     ModelTask,
-    OutputType,
 )
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
 from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger

@@ -28,51 +28,115 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Optional[str] = None
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
     model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
     model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""

     # Template-related arguments
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""

     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
-
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""

     # Generation configuration arguments
     generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""

     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
+    """Additional evaluation configuration parameters."""
+
     limit: Optional[Union[int, float]] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
     eval_batch_size: int = 1
+    """Batch size for evaluation processing."""

     # Cache and working directory arguments
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
     rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
+    """Working directory for storing evaluation results and temporary files."""

     # Debug and runtime mode arguments
     ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-
-
-
-
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""

     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
     judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
     judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
     analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
+
+    # Sandbox configuration arguments
+    use_sandbox: bool = False
+    """Whether to execute code in a sandboxed environment."""
+
+    sandbox_type: Optional[str] = 'docker'
+    """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+    sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+    sandbox_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for sandboxed code execution environments."""

     def __post_init__(self):
         self.__init_model_and_id()
@@ -82,20 +146,22 @@ class TaskConfig(BaseArgument):
         # Set default generation_config and model_args
         self.__init_default_generation_config()
         self.__init_default_model_args()
+        self.__init_default_sandbox_config()

     def __init_model_and_id(self):
         # Set model to DummyCustomModel if not provided
         if self.model is None:
             self.model = self.model_task
             self.eval_type = EvalType.MOCK_LLM
-        else:
-            if self.model_task == ModelTask.IMAGE_GENERATION:
-                self.eval_type = EvalType.TEXT2IMAGE

         # Set model_id if not provided
         if not self.model_id:
-            if self.model:
+            if isinstance(self.model, str):
                 self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
                 self.model_id = 'dummy_model'

@@ -113,6 +179,11 @@ class TaskConfig(BaseArgument):
                 'num_inference_steps': 50,
                 'guidance_scale': 9.0,
             }
+            if self.eval_batch_size != 1:
+                logger.warning(
+                    'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                )
+                self.eval_batch_size = 1
         elif self.model_task == ModelTask.TEXT_GENERATION:
             if self.eval_type == EvalType.CHECKPOINT:
                 self.generation_config = {
@@ -167,6 +238,14 @@ class TaskConfig(BaseArgument):
             'precision': 'torch.float16',
         }

+    def __init_default_sandbox_config(self):
+        if not self.use_sandbox:
+            return
+        check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
+
+        if not self.sandbox_type:
+            self.sandbox_type = 'docker'
+
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
             other = other.to_dict()
@@ -182,9 +261,12 @@ class TaskConfig(BaseArgument):
             logger.warning(f'Failed to dump overall task config: {e}')

     def to_dict(self):
-        result = copy.
+        result = copy.copy(self.__dict__)
         del result['api_key']  # Do not expose api_key in the config

+        if isinstance(self.model, (Model, ModelAPI)):
+            result['model'] = self.model.__class__.__name__
+
         if isinstance(self.generation_config, GenerateConfig):
             result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result
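A minimal sketch of how the new sandbox-related fields on `TaskConfig` might be set (the field names come from the hunks above; the model and dataset values are placeholders):

```python
from evalscope.config import TaskConfig

# Placeholder model/dataset choices; the sandbox fields are the ones added in 1.0.2.
task_cfg = TaskConfig(
    model='qwen-plus',                # may now also be a Model or ModelAPI instance
    datasets=['live_code_bench'],     # hypothetical benchmark choice
    use_sandbox=True,                 # __post_init__ then checks that ms_enclave is importable
    sandbox_type='docker',            # default when left unset and use_sandbox is True
    sandbox_manager_config={},        # empty -> local manager; a 'url' key selects a remote manager
)
```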
evalscope/constants.py
CHANGED

@@ -15,6 +15,7 @@ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR # compatible with old versio
 DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
     os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
 )  # ~/.cache/evalscope
+IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc


 class HubType:
@@ -70,6 +71,7 @@ class EvalType:
     CHECKPOINT = 'llm_ckpt'  # native model checkpoint
     SERVICE = 'openai_api'  # model service
     TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service


 class OutputType:
@@ -127,3 +129,12 @@ class Tags:
     RETRIEVAL = 'Retrieval'
     FUNCTION_CALLING = 'FunctionCalling'
     TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+    MULTI_LINGUAL = 'MultiLingual'
+    MULTI_TURN = 'MultiTurn'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'
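A small illustration of the constants introduced above (the usage context is assumed):

```python
from evalscope.constants import FileConstants, Tags

# New tags added in 1.0.2 alongside the existing ones.
benchmark_tags = [Tags.MULTI_MODAL, Tags.IMAGE_EDITING]

# FileConstants centralizes metadata keys such as the image path and sample id.
metadata = {FileConstants.IMAGE_PATH: '/tmp/example.png', FileConstants.ID: 'sample-0'}
```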
evalscope/evaluator/evaluator.py
CHANGED

@@ -8,8 +8,9 @@ and report generation.
 """

 import os
+import traceback
 from collections import defaultdict
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
 from tqdm import tqdm
 from typing import TYPE_CHECKING, Dict, List, Tuple, Union

@@ -17,6 +18,7 @@ from evalscope.api.dataset import Dataset, DatasetDict, Sample
 from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
 from evalscope.api.metric import AggScore, SampleScore
 from evalscope.report import Report, gen_table
+from evalscope.utils.logger import get_logger

 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
@@ -24,8 +26,6 @@ if TYPE_CHECKING:
     from evalscope.config import TaskConfig
     from evalscope.utils.io_utils import OutputsStructure

-from evalscope.utils.logger import get_logger
-
 logger = get_logger()


@@ -96,12 +96,17 @@ class DefaultEvaluator(Evaluator):

         # Process each subset (e.g., test, validation) independently
         for subset, dataset in dataset_dict.items():
-
+            if len(dataset) == 0:
+                logger.info(f'No samples found in subset: {subset}, skipping.')
+                continue
             subset_score = self.evaluate_subset(subset, dataset)
             agg_score_dict[subset] = subset_score

         # Generate the report based on aggregated scores
         report = self.get_report(agg_score_dict)
+
+        # Finalize the evaluation process
+        self.finalize()
         return report

     def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -181,10 +186,13 @@ class DefaultEvaluator(Evaluator):
             model_result = self.cache_manager.save_prediction_cache(
                 subset, task_state, self.benchmark.save_metadata
             )
-            logger.debug(f'Model result: \n{model_result.
+            logger.debug(f'Model result: \n{model_result.pretty_print()}')

         except Exception as exc:
-
+            tb_str = traceback.format_exc()
+            logger.error(
+                f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}'
+            )
             if self.task_config.ignore_errors:
                 logger.warning('Error ignored, continuing with next sample.')
             else:
@@ -251,7 +259,13 @@ class DefaultEvaluator(Evaluator):
         for future in as_completed(future_to_task_state):
             task_state = future_to_task_state[future]
             try:
-
+                try:
+                    sample_score = future.result()
+                except TimeoutError:
+                    logger.warning(
+                        f'Timeout when reviewing sample {task_state.sample_id}, setting score to zero.'
+                    )
+                    sample_score = SampleScore(sample_id=task_state.sample_id, scores={})
                 sample_score_list.append(sample_score)

                 # Save the review result to cache for future use
@@ -261,10 +275,13 @@ class DefaultEvaluator(Evaluator):
                     sample_score=sample_score,
                     save_metadata=self.benchmark.save_metadata
                 )
-                logger.debug(f'Review result: \n{review_result.
+                logger.debug(f'Review result: \n{review_result.pretty_print()}')

             except Exception as exc:
-
+                tb_str = traceback.format_exc()
+                logger.error(
+                    f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}'
+                )
                 if self.task_config.ignore_errors:
                     logger.warning('Error ignored, continuing with next sample.')
                 else:
@@ -317,7 +334,7 @@ class DefaultEvaluator(Evaluator):

         # Generate and display a summary table of results
         try:
-            report_table = gen_table(report_list=[report], add_overall_metric=
+            report_table = gen_table(report_list=[report], add_overall_metric=self.benchmark.add_overall_metric)
             logger.info(f'\n{self.benchmark_name} report table:'
                         f'\n{report_table} \n')
         except Exception:
@@ -335,3 +352,6 @@ class DefaultEvaluator(Evaluator):
         report.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
         return report
+
+    def finalize(self, *args, **kwargs):
+        self.benchmark.finalize(*args, **kwargs)
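The review loop above wraps `future.result()` so that a `TimeoutError` degrades to a zero score while any other exception is logged with its traceback. A stripped-down sketch of the same pattern (names are illustrative, not evalscope APIs):

```python
import traceback
from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed

def review_all(task_states, review_fn, max_workers=4):
    """Collect one result per task state; a TimeoutError from review_fn degrades
    to a placeholder result, anything else is logged with a full traceback."""
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(review_fn, ts): ts for ts in task_states}
        for future in as_completed(futures):
            ts = futures[future]
            try:
                try:
                    # future.result() re-raises whatever review_fn raised,
                    # including a TimeoutError from an internal deadline.
                    result = future.result()
                except TimeoutError:
                    result = None  # stands in for the zero-score fallback
                results.append(result)
            except Exception:
                # Mirrors the logger.error(...) + traceback.format_exc() above.
                print(f'Review failed for {ts}:\n{traceback.format_exc()}')
    return results
```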