evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/arguments.py +3 -1
- evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +14 -17
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
- evalscope/benchmarks/benchmark.py +12 -10
- evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
- evalscope/benchmarks/data_adapter.py +82 -19
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
- evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
- evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
- evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +71 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +90 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
- evalscope/benchmarks/utils.py +43 -0
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +16 -1
- evalscope/config.py +13 -3
- evalscope/constants.py +7 -0
- evalscope/evaluator/evaluator.py +3 -1
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/metrics.py +23 -2
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +32 -6
- evalscope/models/chat_adapter.py +4 -1
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/local_model.py +3 -2
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +107 -29
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +18 -8
- evalscope/perf/http_client.py +8 -6
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +15 -8
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +429 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
- tests/cli/test_collection.py +1 -1
- tests/cli/test_run.py +151 -32
- /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/third_party/thinkbench/tools/llm.py
@@ -0,0 +1,48 @@
+import os
+from openai import OpenAI
+
+
+def request_url(llm_config, content):
+    try:
+        client = OpenAI(
+            api_key=llm_config['api_key'],
+            base_url=llm_config['base_url'],
+        )
+        completion = client.chat.completions.create(
+            model=llm_config['model_name'],
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+        return None
+
+def request_qwen(content):
+    try:
+        client = OpenAI(
+            api_key=os.getenv('DASHSCOPE_API_KEY'),
+            base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        )
+
+        completion = client.chat.completions.create(
+            model='qwen-max',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+
+
+def request_local(content):
+    try:
+        client = OpenAI(
+            api_key='EMPTY',
+            base_url='http://0.0.0.0:8801/v1',
+        )
+        completion = client.chat.completions.create(
+            model='Qwen2.5-72B-Instruct',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
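The three helpers above are thin wrappers around an OpenAI-compatible chat endpoint. A minimal usage sketch follows; the key, endpoint, and prompt are placeholders, not values taken from the diff:

```python
# Hypothetical call to request_url; the config values below are placeholders.
llm_config = {
    'api_key': 'sk-...',                                              # your API key
    'base_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',  # any OpenAI-compatible endpoint
    'model_name': 'qwen-max',
}
answer = request_url(llm_config, 'Summarize the change in one sentence.')
if answer is not None:  # request_url returns None when the request fails
    print(answer)
```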
evalscope/third_party/thinkbench/tools/utils.py
@@ -0,0 +1,13 @@
+import re
+
+
+def extract_answer(solution_text: str):
+    boxed_pattern = r'\\boxed\{([^}]*)\}'
+    matches = re.findall(boxed_pattern, solution_text)
+    if matches:
+        last_boxed_content = matches[-1]
+        number_pattern = r'-?\d+'
+        number_matches = re.findall(number_pattern, last_boxed_content)
+        if number_matches:
+            return number_matches[-1].strip()
+    return None
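The helper pulls the last \boxed{...} expression out of a solution string and returns the final number inside it. A few illustrative inputs (invented, not from the diff):

```python
# Illustrative behaviour of extract_answer; the inputs are made up.
print(extract_answer(r'Therefore the sum is \boxed{-42}.'))             # -> '-42'
print(extract_answer(r'First \boxed{7}, but finally \boxed{x = 12}.'))  # -> '12' (last box wins)
print(extract_answer('no boxed expression here'))                       # -> None
```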
evalscope/third_party/toolbench_static/llm/swift_infer.py CHANGED
@@ -1,37 +1,67 @@
-
+import os
 from dataclasses import dataclass
-from swift.llm import
-from swift.utils import seed_everything
-
-# TODO: Support custom model for swift infer
+from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template
 
+# Set the GPU environment variable
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
 @dataclass
 class SwiftInferArgs:
     model_id_or_path: str
     model_type: str
+    infer_backend: str = 'vllm'  # one of 'pt', 'vllm', 'lmdeploy'
     max_new_tokens: int = 2048
-
+    temperature: float = 0.1
+    max_batch_size: int = 16
 
 class SwiftInfer:
 
     def __init__(self, args: SwiftInferArgs):
-
-
-
-
-
-
+        # Initialize the model for the chosen inference backend
+        if args.infer_backend == 'pt':
+            self.engine: InferEngine = PtEngine(args.model_id_or_path, max_batch_size=args.max_batch_size)
+        elif args.infer_backend == 'vllm':
+            from swift.llm import VllmEngine
+            self.engine: InferEngine = VllmEngine(args.model_id_or_path, max_model_len=8192)
+        elif args.infer_backend == 'lmdeploy':
+            from swift.llm import LmdeployEngine
+            self.engine: InferEngine = LmdeployEngine(args.model_id_or_path)
+        else:
+            raise ValueError(f'Unsupported infer_backend: {args.infer_backend}')
 
-
-
-
-
-
-
+        # Basic request configuration (optional)
+        self.request_config = RequestConfig(
+            max_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            stream=False  # can be switched to True for streaming inference
+        )
 
     def predict(self, system: str, query: str, history: list):
+        # In the Swift 3.0 standard interface, messages are passed as:
+        # messages: [{"role": "system", "content": "<SYSTEM_PROMPT>"},
+        #            {"role": "user", "content": "<user question>"},
+        #            {"role": "assistant", "content": "<assistant reply>"}, ...]
+
+        messages = []
+        if system.strip():
+            messages.append({'role': 'system', 'content': system})
+
+        # Append the conversation history to the messages
+        for qa_pair in history:
+            # Each element of history is assumed to be ("user input", "model response"); adjust to your data format.
+            user_answer, model_response = qa_pair
+            messages.append({'role': 'user', 'content': user_answer})
+            messages.append({'role': 'assistant', 'content': model_response})
+
+        # Add the current user question
+        messages.append({'role': 'user', 'content': query})
+
+        infer_request = InferRequest(messages=messages)
+
+        # Run inference
+        response = self.engine.infer([infer_request], self.request_config)
 
-
+        # Extract the text result from the model response (assuming non-stream mode)
+        result_text = response[0].choices[0].message.content.strip()
 
-        return
+        return result_text
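A rough driver for the rewritten wrapper, assuming a locally available model; the model path, model_type value, and prompts below are placeholders rather than values taken from the diff:

```python
# Hypothetical usage of SwiftInfer; model path/type and prompts are placeholders.
args = SwiftInferArgs(
    model_id_or_path='Qwen/Qwen2.5-7B-Instruct',
    model_type='qwen2_5',
    infer_backend='pt',   # 'pt' avoids the optional vllm/lmdeploy dependencies
    max_new_tokens=512,
)
engine = SwiftInfer(args)
reply = engine.predict(
    system='You are a helpful assistant.',
    query='What changed in the ToolBench static evaluation?',
    history=[('Hello', 'Hi! How can I help?')],  # list of (user, assistant) pairs
)
print(reply)
```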
evalscope/utils/chat_service.py CHANGED

evalscope/utils/filters.py
@@ -0,0 +1,59 @@
+import re
+from typing import Any, Callable, Dict
+
+
+class Filter:
+    """
+    A base Filter class that implements the registry pattern
+    """
+    _registry: Dict[str, Callable[[str, Any], str]] = {}
+
+    @classmethod
+    def register(cls, name: str) -> Callable:
+        """
+        Decorator to register a new filter function
+        """
+
+        def decorator(func: Callable[[str, Any], str]) -> Callable[[str, Any], str]:
+            cls._registry[name] = func
+            return func
+
+        return decorator
+
+    @classmethod
+    def get_filter(cls, name: str) -> Callable:
+        """
+        Get a registered filter by name
+        """
+        return cls._registry.get(name)
+
+    @classmethod
+    def apply(cls, name: str, value: str, *args, **kwargs) -> str:
+        """
+        Apply a registered filter to a value
+        """
+        filter_func = cls.get_filter(name)
+        if filter_func is None:
+            raise ValueError(f'Filter {name} not found')
+        return filter_func(value, *args, **kwargs)
+
+
+@Filter.register('remove_until')
+def remove_until(value: str, marker: str) -> str:
+    """
+    Remove everything before the last occurrence of marker
+    """
+    if marker not in value:
+        return value
+    return value[value.rindex(marker) + len(marker):]
+
+
+@Filter.register('extract')
+def extract(value: str, pattern: str) -> str:
+    """
+    Extract content from string using regex pattern
+    """
+    match = re.search(pattern, value)
+    if match:
+        return match.group(0)
+    return ''
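A short sketch of how the registry is meant to be used, both for the built-in filters above and for plugging in a new one; the sample text is invented for illustration:

```python
# Apply the built-in filters registered above; the sample response text is made up.
text = 'Let me think step by step... Answer: B'
print(Filter.apply('remove_until', text, 'Answer:'))  # ' B'
print(Filter.apply('extract', text, r'[A-D]\b'))      # 'B'

# Project-specific filters plug in through the same decorator.
@Filter.register('strip')
def strip_filter(value: str) -> str:
    return value.strip()

print(Filter.apply('strip', '  B  '))  # 'B'
```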
evalscope/utils/logger.py CHANGED
@@ -12,12 +12,12 @@ detailed_formatter = logging.Formatter(detailed_format)
 simple_formatter = logging.Formatter(simple_format)
 DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
 
-logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
+logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)
 
-#
+# set logging level
 logging.getLogger('datasets').setLevel(logging.WARNING)
-logging.getLogger('modelscope').setLevel(logging.WARNING)
 logging.getLogger('httpx').setLevel(logging.WARNING)
+logging.getLogger('modelscope').setLevel(logging.ERROR)
 
 
 def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
evalscope/utils/model_utils.py CHANGED
@@ -1,5 +1,6 @@
+import os
 from enum import Enum
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional, Tuple, Union
 
 if TYPE_CHECKING:
     from transformers import GenerationConfig
@@ -22,3 +23,18 @@ def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
         generation_config.temperature = 1.
         generation_config.top_p = 1.
         generation_config.top_k = 50
+
+
+def get_device() -> str:
+    from transformers.utils import is_torch_cuda_available, is_torch_mps_available, is_torch_npu_available
+
+    if is_torch_npu_available():
+        device = 'npu'
+    elif is_torch_mps_available():
+        device = 'mps'
+    elif is_torch_cuda_available():
+        device = 'cuda'
+    else:
+        device = 'cpu'
+
+    return device
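For reference, a one-line caller of the new helper (illustrative):

```python
# Illustrative: choose where to place a locally loaded model.
device = get_device()  # 'npu', 'mps', 'cuda', or 'cpu' depending on the host
print(f'Loading model on {device}')
```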
evalscope/utils/utils.py CHANGED
@@ -101,50 +101,50 @@ class ResponseParser:
         options_concat = '|'.join([str(i) for i in options])
 
         patterns = [
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            rf'答案是?\s?([{options_concat}])',
+            rf'答案是?\s?:([{options_concat}])',
+            rf'答案是?\s?:([{options_concat}])',
+            rf'答案应该?是\s?([{options_concat}])',
+            rf'答案应该?选\s?([{options_concat}])',
+            rf'答案为\s?([{options_concat}])',
+            rf'答案选\s?([{options_concat}])',
+            rf'选择?\s?([{options_concat}])',
+            rf'故选?\s?([{options_concat}])'
+            rf'只有选?项?\s?([{options_concat}])\s?是?对',
+            rf'只有选?项?\s?([{options_concat}])\s?是?错',
+            rf'只有选?项?\s?([{options_concat}])\s?不?正确',
+            rf'只有选?项?\s?([{options_concat}])\s?错误',
+            rf'说法不?对选?项?的?是\s?([{options_concat}])',
+            rf'说法不?正确选?项?的?是\s?([{options_concat}])',
+            rf'说法错误选?项?的?是\s?([{options_concat}])',
+            rf'([{options_concat}])\s?是正确的',
+            rf'([{options_concat}])\s?是正确答案',
+            rf'选项\s?([{options_concat}])\s?正确',
+            rf'所以答\s?([{options_concat}])',
+            rf'所以\s?([{options_concat}][.。$]?$)',
+            rf'所有\s?([{options_concat}][.。$]?$)',
+            rf'[\s,::,]([{options_concat}])[。,,\.]?$',
+            rf'[\s,,::][故即]([{options_concat}])[。\.]?$',
+            rf'[\s,,::]因此([{options_concat}])[。\.]?$',
+            rf'[是为。]\s?([{options_concat}])[。\.]?$',
+            rf'因此\s?([{options_concat}])[。\.]?$',
+            rf'显然\s?([{options_concat}])[。\.]?$',
+            rf'答案是\s?(\S+)(?:。|$)',
+            rf'答案应该是\s?(\S+)(?:。|$)',
+            rf'答案为\s?(\S+)(?:。|$)',
+            rf'答案是(.*?)[{options_concat}]',
+            rf'答案为(.*?)[{options_concat}]',
+            rf'固选(.*?)[{options_concat}]',
+            rf'答案应该是(.*?)[{options_concat}]',
+            rf'[Tt]he answer is \(?[{options_concat}]\)?',
+            rf'[Tt]he correct answer is [{options_concat}]',
+            rf'[Tt]he correct answer is:\n[{options_concat}]',
+            rf'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
+            rf'^选项\s?([{options_concat}])',
+            rf'^([{options_concat}])\s?选?项',
+            rf'(\s|^)[{options_concat}][\s。,,::\.$]',
+            rf'(\s|^)[{options_concat}](\s|$)',
+            rf'[{options_concat}]',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
@@ -166,8 +166,8 @@ class ResponseParser:
             text: The text to parse.
         """
         patterns = [
-            r'[Aa]nswer:\s*(\w+)',
             r'answer is \(?(\w+)\)?',
+            r'[Aa]nswer:\s*(\w+)',
            r'[Tt]he correct answer is:\s*(\w+)',
            r'[Tt]he correct answer is:\n\s*(\w+)',
            r'[Tt]he correct answer is:\n\n-\s*(\w+)',
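To see what these patterns do, here is a standalone sketch that mirrors how the option labels are joined into a character class; the sample model response is invented:

```python
import re

# Mirrors how the parser joins option labels; the sample response is made up.
options_concat = '|'.join(['A', 'B', 'C', 'D'])
pattern = re.compile(rf'答案是?\s?([{options_concat}])')  # "the answer is X"
match = pattern.search('经过分析,答案是 B。')  # "after analysis, the answer is B."
print(match.group(1))  # 'B'
```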
evalscope/version.py CHANGED

{evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.11.0
+Version: 0.12.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -21,6 +21,7 @@ Requires-Dist: accelerate
 Requires-Dist: cachetools
 Requires-Dist: datasets<=3.2.0,>=3.0.0
 Requires-Dist: editdistance
+Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
 Requires-Dist: langdetect
@@ -58,6 +59,7 @@ Requires-Dist: accelerate; extra == "all"
 Requires-Dist: cachetools; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
 Requires-Dist: editdistance; extra == "all"
+Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
 Requires-Dist: langdetect; extra == "all"
@@ -101,10 +103,10 @@ Requires-Dist: sse-starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
 Requires-Dist: gradio==5.4.0; extra == "all"
-Requires-Dist: plotly
+Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
-Requires-Dist: plotly
+Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
 Requires-Dist: accelerate; extra == "inner"
@@ -223,7 +225,14 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
-
+
+- 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
+- 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
+- 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
+- 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
+- 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
+- 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
+- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets, refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
@@ -460,7 +469,7 @@ Then, you can use the following command to evaluate the model API service:
 ```shell
 evalscope eval \
  --model qwen2.5 \
- --api-url http://127.0.0.1:8801/v1
+ --api-url http://127.0.0.1:8801/v1 \
  --api-key EMPTY \
 --eval-type service \
 --datasets gsm8k \