evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +10 -6
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -108
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/perf/arguments.py +1 -0
- evalscope/perf/benchmark.py +1 -1
- evalscope/perf/main.py +3 -1
- evalscope/perf/plugin/api/openai_api.py +51 -47
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/perf/test_perf.py +3 -3
- tests/rag/test_mteb.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/api/openai_api.py
CHANGED

@@ -96,60 +96,64 @@ class OpenaiPlugin(ApiPluginBase):

     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
         """Parser responses and return number of request and response tokens.
-
-
+        Only one response for non-stream, multiple responses for stream.
+        """

+        # when stream, the last response is the full usage
+        # when non-stream, the last response is the first response
+        last_response_js = json.loads(responses[-1])
+        if 'usage' in last_response_js and last_response_js['usage']:
+            input_tokens = last_response_js['usage']['prompt_tokens']
+            output_tokens = last_response_js['usage']['completion_tokens']
+            return input_tokens, output_tokens

-
-            responses (List[bytes]): List of http response body, for stream output,
-                there are multiple responses, for general only one.
-            kwargs: (Any): The command line --parameter content.
-        Returns:
-            Tuple: Return number of prompt token and number of completion tokens.
-        """
-        full_response_content = ''
+        # no usage information in the response, parse the response to get the tokens
         delta_contents = {}
-        input_tokens = None
-        output_tokens = None
         for response in responses:
             js = json.loads(response)
-            if ...  [old inline parsing logic collapsed in the source view]
+            if 'object' in js:
+                self.__process_response_object(js, delta_contents)
+            else:
+                self.__process_no_object(js, delta_contents)
+
+        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
+        return input_tokens, output_tokens
+
+    def __process_response_object(self, js, delta_contents):
+        if js['object'] == 'chat.completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['message']['content']]
+        elif js['object'] == 'text_completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['text']]
+        elif js['object'] == 'chat.completion.chunk':
+            for choice in js.get('choices', []):
+                if 'delta' in choice and 'index' in choice:
+                    delta = choice['delta']
+                    idx = choice['index']
+                    if 'content' in delta:
+                        delta_content = delta['content']
+                        delta_contents.setdefault(idx, []).append(delta_content)
+
+    def __process_no_object(self, js, delta_contents):
+        # assume the response is a single choice
+        for choice in js['choices']:
+            if 'delta' in choice:
+                delta = choice['delta']
+                idx = choice['index']
+                if 'content' in delta:
+                    delta_content = delta['content']
+                    delta_contents.setdefault(idx, []).append(delta_content)
+            else:
+                delta_contents[choice['index']] = [choice['message']['content']]
+
+    def __calculate_tokens_from_content(self, request, delta_contents):
+        input_tokens = output_tokens = 0
+        if self.tokenizer is not None:
             for idx, choice_contents in delta_contents.items():
-                full_response_content = ''.join(
+                full_response_content = ''.join(choice_contents)
                 input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
                 output_tokens += len(self.tokenizer.encode(full_response_content))
-
-            input_tokens = 0
-            output_tokens = 0
+        else:
             logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')
-
         return input_tokens, output_tokens
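Net effect of the hunk above: `parse_responses` now trusts the `usage` block reported by the server (carried by the last chunk when streaming) and only falls back to counting tokens with a local tokenizer. A minimal standalone sketch of that ordering, using illustrative names (`count_request_tokens`, `prompt`) rather than evalscope's own API:

```python
import json
from typing import List, Tuple


def count_request_tokens(responses: List[str], tokenizer=None, prompt: str = '') -> Tuple[int, int]:
    """Sketch of the usage-first accounting: returns (input_tokens, output_tokens)."""
    # For streaming, the final chunk carries the full usage; for non-streaming,
    # the single response carries it.
    last = json.loads(responses[-1])
    usage = last.get('usage')
    if usage:
        return usage['prompt_tokens'], usage['completion_tokens']

    # Fallback: re-count locally (evalscope warns here unless --tokenizer-path is set).
    input_tokens = output_tokens = 0
    if tokenizer is not None:
        completion = ''.join(
            choice.get('delta', choice.get('message', {})).get('content', '')
            for chunk in map(json.loads, responses)
            for choice in chunk.get('choices', []))
        input_tokens = len(tokenizer.encode(prompt))
        output_tokens = len(tokenizer.encode(completion))
    return input_tokens, output_tokens
```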
evalscope/perf/utils/local_server.py
CHANGED

@@ -103,6 +103,7 @@ def start_app(args: Arguments):
     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
         os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
+        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
         # yapf: disable
         proc = subprocess.Popen([
             'python', '-m', 'vllm.entrypoints.openai.api_server',
evalscope/run.py
CHANGED
@@ -2,27 +2,23 @@
 """
 Run evaluation for LLMs.
 """
-import logging
 import os.path
-import torch
 from argparse import Namespace
 from datetime import datetime
 from typing import List, Optional, Union

 from evalscope.arguments import parse_args
+from evalscope.benchmarks import Benchmark, BenchmarkMeta
 from evalscope.config import TaskConfig, parse_task_config
-from evalscope.constants import
+from evalscope.constants import DEFAULT_WORK_DIR, EvalBackend
 from evalscope.evaluator import Evaluator
-from evalscope.models
-from evalscope.utils import
+from evalscope.models import LocalModel, get_local_model, initialize_model_adapter
+from evalscope.utils import seed_everything
 from evalscope.utils.io_utils import OutputsStructure, are_paths_same
 from evalscope.utils.logger import configure_logging, get_logger

 logger = get_logger()

-BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
-MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
-

 def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
     """Run evaluation task(s) based on the provided configuration."""

@@ -38,15 +34,13 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]

 def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     """Run a single evaluation task."""
-
+    if task_cfg.seed is not None:
+        seed_everything(task_cfg.seed)
     outputs = setup_work_directory(task_cfg, run_time)
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

-    task_cfg.dump_yaml(outputs.configs_dir)
-    logger.info(task_cfg)
-
     if task_cfg.eval_backend != EvalBackend.NATIVE:
-        return run_non_native_backend(task_cfg)
+        return run_non_native_backend(task_cfg, outputs)
     else:
         return evaluate_model(task_cfg, outputs)

@@ -68,7 +62,7 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     return outputs


-def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Run evaluation using a non-native backend."""
     eval_backend = task_cfg.eval_backend
     eval_config = task_cfg.eval_config

@@ -78,6 +72,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict:

     backend_manager_class = get_backend_manager_class(eval_backend)
     backend_manager = backend_manager_class(config=eval_config)
+
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
     backend_manager.run()

     return dict()

@@ -102,75 +100,48 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
     # Initialize evaluator
     eval_results = {}
-
+    base_model = get_local_model(task_cfg)
+    evaluators = []
     for dataset_name in task_cfg.datasets:
-        evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+        evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+        evaluators.append(evaluator)
+
+    # dump task_cfg to outputs.configs_dir after creating evaluators
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
+    for evaluator in evaluators:
         res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
         eval_results[dataset_name] = res_dict

     return eval_results


-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel):
     """Create an evaluator object for the specified dataset."""
-    imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
-    model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)
-
-    dataset_config = task_cfg.dataset_args.get(dataset_name, {})
-    dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
-    in_prompt_template = dataset_config.get('prompt_template', '')
-    few_shot_num = dataset_config.get('few_shot_num', None)
-    few_shot_random = dataset_config.get('few_shot_random', True)
-
-    data_adapter = imported_modules['DataAdapterClass'](
-        few_shot_num=few_shot_num,
-        few_shot_random=few_shot_random,
-        prompt_template=in_prompt_template,
-        outputs=outputs,
-    )
-    in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])

-
+    if dataset_name == 'data_collection':
+        # EvaluatorCollection is a collection of evaluators
+        from evalscope.collections import EvaluatorCollection
+        return EvaluatorCollection(task_cfg, outputs)
+
+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+
+    # update task_cfg.dataset_args
+    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

     return Evaluator(
-        dataset_name_or_path=
-        subset_list=in_subset_list,
+        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
-        use_cache=task_cfg.use_cache,
         outputs=outputs,
-
-        datasets_hub=task_cfg.dataset_hub,
-        stage=task_cfg.stage,
-        eval_type=task_cfg.eval_type,
-        overall_task_cfg=task_cfg,
+        task_cfg=task_cfg,
     )


-def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
-    """Initialize the model adapter based on the task configuration."""
-    if task_cfg.dry_run:
-        from evalscope.models.dummy_chat_model import DummyChatModel
-        return DummyChatModel(model_cfg=dict())
-    elif task_cfg.eval_type == EvalType.CUSTOM:
-        if not isinstance(task_cfg.model, CustomModel):
-            raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-        from evalscope.models.model_adapter import CustomModelAdapter
-        return CustomModelAdapter(custom_model=task_cfg.model)
-    else:
-        device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
-        model_precision = task_cfg.model_args.get('precision', torch.float16)
-        if isinstance(model_precision, str) and model_precision != 'auto':
-            model_precision = eval(model_precision)
-        return imported_modules['ModelAdapterClass'](
-            model_id=task_cfg.model,
-            model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-            device_map=device_map,
-            torch_dtype=model_precision,
-            generation_config=task_cfg.generation_config,
-            chat_template=task_cfg.chat_template)
-
-
 def main():
     args = parse_args()
     run_task(args)
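Taken together, the run.py hunks replace the old `import_module_util` plumbing with a benchmark registry plus a shared local model. A rough sketch of how the new `create_evaluator` path fits together, limited to the calls visible in the diff above (the `'mmlu'` dataset name and the `build_evaluator` wrapper are illustrative, not part of evalscope):

```python
from evalscope.benchmarks import Benchmark, BenchmarkMeta
from evalscope.evaluator import Evaluator
from evalscope.models import get_local_model, initialize_model_adapter


def build_evaluator(task_cfg, outputs, dataset_name: str = 'mmlu') -> Evaluator:
    """Illustrative wrapper mirroring create_evaluator in the diff above."""
    base_model = get_local_model(task_cfg)                  # loaded once, shared across datasets
    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)  # registry lookup replaces import_module_util
    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
    return Evaluator(
        dataset_name_or_path=benchmark.dataset_id,
        data_adapter=data_adapter,
        model_adapter=model_adapter,
        outputs=outputs,
        task_cfg=task_cfg,
    )
```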
evalscope/run_arena.py
CHANGED
@@ -10,7 +10,7 @@ from tqdm import tqdm

 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
-from evalscope.models
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_obj_from_cfg
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
evalscope/utils/__init__.py
CHANGED
evalscope/utils/chat_service.py
CHANGED
@@ -3,11 +3,10 @@ import time
 import torch
 from contextlib import contextmanager
 from functools import partial
-from modelscope import AutoModelForCausalLM, AutoTokenizer
 from pydantic import BaseModel, Field
 from threading import Thread
 from transformers import TextIteratorStreamer
-from typing import List, Literal, Optional, Union
+from typing import Any, List, Literal, Optional, Union


 class Usage(BaseModel):

@@ -66,7 +65,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 class ChatCompletionResponse(BaseModel):
     model: str
     object: Literal['chat.completion', 'chat.completion.chunk']
-    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
+    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
     usage: Optional[Usage]

@@ -96,6 +95,8 @@ class TextCompletionResponse(BaseModel):
 class ChatService:

     def __init__(self, model_path, attn_implementation):
+        from modelscope import AutoModelForCausalLM, AutoTokenizer
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
evalscope/utils/io_utils.py
CHANGED
@@ -160,3 +160,11 @@ def are_paths_same(path1, path2):
     real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2)))

     return real_path1 == real_path2
+
+
+def dict_to_json(d: dict, json_file: str):
+    """
+    Dump dict to json file.
+    """
+    with open(json_file, 'w') as f:
+        json.dump(d, f, indent=4, ensure_ascii=False)
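The new `dict_to_json` helper writes pretty-printed JSON (indent of 4, `ensure_ascii=False` so non-ASCII text stays readable). A quick usage example with an illustrative payload and output path:

```python
from evalscope.utils.io_utils import dict_to_json

# illustrative report payload and output path
dict_to_json({'model': 'qwen2.5-0.5b-instruct', 'dataset': 'gsm8k', 'score': 0.8123},
             'outputs/report.json')
```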
evalscope/utils/logger.py
CHANGED
@@ -14,6 +14,10 @@ DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else

 logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)

+# disable datasets logging
+logging.getLogger('datasets').setLevel(logging.WARNING)
+logging.getLogger('modelscope').setLevel(logging.WARNING)
+

 def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
     """Get logging logger
evalscope/utils/model_utils.py
CHANGED
@@ -1,6 +1,16 @@
+from enum import Enum
 from transformers import GenerationConfig


+class EvalBackend(Enum):
+    # NOTE: compatible with ms-swfit v2.x
+    NATIVE = 'Native'
+    OPEN_COMPASS = 'OpenCompass'
+    VLM_EVAL_KIT = 'VLMEvalKit'
+    RAG_EVAL = 'RAGEval'
+    THIRD_PARTY = 'ThirdParty'
+
+
 def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
     # Use the default values of temperature/top_p/top_k in generation_config.
     if generation_config.temperature == 0:
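For reference, the backend values added here mirror the ones run.py already checks against (`EvalBackend.NATIVE` above; the run.py imports pull the enum from `evalscope.constants`). A small illustrative lookup by value:

```python
from evalscope.utils.model_utils import EvalBackend

# look up a backend by its configured string value
backend = EvalBackend('OpenCompass')
assert backend is EvalBackend.OPEN_COMPASS
assert backend is not EvalBackend.NATIVE
```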
evalscope/utils/utils.py
CHANGED
@@ -121,7 +121,6 @@ class ResponseParser:
             f'([{options_concat}])\s?是正确答案',
             f'选项\s?([{options_concat}])\s?正确',
             f'所以答\s?([{options_concat}])',
-            f'1.\s?([{options_concat}])[.。$]?$',
             f'所以\s?([{options_concat}][.。$]?$)',
             f'所有\s?([{options_concat}][.。$]?$)',
             f'[\s,::,]([{options_concat}])[。,,\.]?$',

@@ -137,16 +136,15 @@ class ResponseParser:
             f'答案为(.*?)[{options_concat}]',
             f'固选(.*?)[{options_concat}]',
             f'答案应该是(.*?)[{options_concat}]',
-            f'[Tt]he answer is [{options_concat}]',
+            f'[Tt]he answer is \(?[{options_concat}]\)?',
             f'[Tt]he correct answer is [{options_concat}]',
             f'[Tt]he correct answer is:\n[{options_concat}]',
             f'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
-            f'[{options_concat}]',
             f'^选项\s?([{options_concat}])',
             f'^([{options_concat}])\s?选?项',
             f'(\s|^)[{options_concat}][\s。,,::\.$]',
             f'(\s|^)[{options_concat}](\s|$)',
-            f'
+            f'[{options_concat}]',
         ]

         regexes = [re.compile(pattern) for pattern in patterns]

@@ -169,6 +167,7 @@ class ResponseParser:
         """
         patterns = [
             r'[Aa]nswer:\s*(\w+)',
+            r'answer is \(?(\w+)\)?',
             r'[Tt]he correct answer is:\s*(\w+)',
             r'[Tt]he correct answer is:\n\s*(\w+)',
             r'[Tt]he correct answer is:\n\n-\s*(\w+)',

@@ -199,27 +198,6 @@ class ResponseParser:



-def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
-    """
-    Import module utility function.
-
-    Args:
-        import_path_prefix: e.g. 'evalscope.benchmarks.'
-        module_name: The module name to import. e.g. 'mmlu'
-        members_to_import: The members to import.
-            e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass']
-
-    Returns:
-        dict: imported modules map. e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...}
-    """
-    imported_modules = {}
-    module = importlib.import_module(import_path_prefix + module_name)
-    for member_name in members_to_import:
-        imported_modules[member_name] = getattr(module, member_name)
-
-    return imported_modules
-
-
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
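The two pattern changes above both target answers wrapped in parentheses, e.g. "The answer is (B)". A quick check of the updated expressions, assuming `options_concat` expands to the option letters 'ABCD':

```python
import re

options_concat = 'ABCD'  # assumption: concatenated option letters used by ResponseParser

choice_pattern = re.compile(f'[Tt]he answer is \\(?[{options_concat}]\\)?')
assert choice_pattern.search('The answer is (B).')      # parentheses are now optional

free_form_pattern = re.compile(r'answer is \(?(\w+)\)?')
assert free_form_pattern.search('So the answer is (42).').group(1) == '42'
```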
evalscope/version.py
CHANGED
{evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.8.1
+Version: 0.9.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team

@@ -84,7 +84,7 @@ Requires-Dist: transformers-stream-generator; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.9; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"

@@ -129,7 +129,7 @@ Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.9; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"

@@ -160,14 +160,16 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

 ## 📋 Contents
-- [Introduction](
-- [News](
-- [Installation](
-- [Quick Start](
+- [Introduction](#-introduction)
+- [News](#-news)
+- [Installation](#️-installation)
+- [Quick Start](#-quick-start)
 - [Evaluation Backend](#evaluation-backend)
-- [Custom Dataset Evaluation](
-- [Model Serving Performance Evaluation](
-- [Arena Mode](
+- [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+- [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+- [Arena Mode](#-arena-mode)
+- [Contribution](#️-contribution)
+- [Roadmap](#-roadmap)


 ## 📝 Introduction

@@ -181,6 +183,8 @@ The framework accommodates multiple evaluation scenarios such as end-to-end RAG
   <br>EvalScope Framework.
 </p>

+<details><summary>Framework Description</summary>
+
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.

@@ -194,13 +198,27 @@ The architecture includes the following modules:
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.

+</details>
+
+## ☎ User Groups
+
+Please scan the QR code below to join our community groups:
+
+[Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
+:-------------------------:|:-------------------------:|:-------------------------:
+<img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
+

 ## 🎉 News
+- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+<details><summary>More</summary>
+
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.

@@ -212,7 +230,7 @@ The architecture includes the following modules:
 - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
 - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.

-
+</details>

 ## 🛠️ Installation
 ### Method 1: Install Using pip

@@ -402,7 +420,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).


-## Model Serving Performance Evaluation
+## 📈 Model Serving Performance Evaluation
 A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.

 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)

@@ -427,19 +445,32 @@ Speed Benchmark Results:
 +---------------+-----------------+----------------+
 ```

-## Custom Dataset Evaluation
+## 🖊️ Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


-## Arena Mode
+## 🏟️ Arena Mode
 The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

+## 👷♂️ Contribution

+EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!

+<a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+  <table>
+    <tr>
+      <th colspan="2">
+        <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+      </th>
+    </tr>
+  </table>
+</a>

-##
+## 🔜 Roadmap
+- [ ] Support for better evaluation report visualization
+- [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation
 - [x] Agents evaluation

@@ -450,8 +481,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
 - [ ] GAIA
 - [ ] GPQA
 - [x] MBPP
-- [ ] Auto-reviewer
-- [ ] Qwen-max


 ## Star History