evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (66)
  1. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  2. evalscope/benchmarks/data_adapter.py +9 -4
  3. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
  5. evalscope/benchmarks/hle/__init__.py +0 -0
  6. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  8. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  9. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  10. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  11. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  12. evalscope/benchmarks/utils.py +1 -0
  13. evalscope/constants.py +5 -21
  14. evalscope/evaluator/__init__.py +1 -1
  15. evalscope/evaluator/evaluator.py +5 -3
  16. evalscope/metrics/__init__.py +3 -1
  17. evalscope/metrics/completion_parsers.py +7 -0
  18. evalscope/metrics/llm_judge.py +6 -5
  19. evalscope/metrics/metrics.py +19 -7
  20. evalscope/models/__init__.py +4 -8
  21. evalscope/models/adapters/__init__.py +4 -9
  22. evalscope/models/adapters/base_adapter.py +4 -0
  23. evalscope/models/adapters/bfcl_adapter.py +2 -0
  24. evalscope/models/adapters/chat_adapter.py +3 -0
  25. evalscope/models/adapters/choice_adapter.py +4 -0
  26. evalscope/models/adapters/custom_adapter.py +7 -3
  27. evalscope/models/adapters/server_adapter.py +2 -0
  28. evalscope/models/adapters/t2i_adapter.py +3 -0
  29. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  30. evalscope/models/register.py +0 -14
  31. evalscope/perf/arguments.py +13 -0
  32. evalscope/perf/benchmark.py +38 -39
  33. evalscope/perf/http_client.py +30 -86
  34. evalscope/perf/main.py +2 -2
  35. evalscope/perf/plugin/__init__.py +3 -2
  36. evalscope/perf/plugin/api/__init__.py +4 -3
  37. evalscope/perf/plugin/api/base.py +22 -4
  38. evalscope/perf/plugin/api/custom_api.py +212 -55
  39. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  40. evalscope/perf/plugin/api/default_api.py +105 -0
  41. evalscope/perf/plugin/api/openai_api.py +17 -19
  42. evalscope/perf/plugin/datasets/__init__.py +10 -7
  43. evalscope/perf/plugin/datasets/base.py +22 -1
  44. evalscope/perf/plugin/datasets/custom.py +2 -1
  45. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  46. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  47. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  48. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  49. evalscope/perf/plugin/datasets/openqa.py +2 -1
  50. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  51. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  52. evalscope/perf/plugin/registry.py +36 -16
  53. evalscope/perf/utils/benchmark_util.py +14 -20
  54. evalscope/perf/utils/db_util.py +79 -61
  55. evalscope/utils/io_utils.py +10 -0
  56. evalscope/version.py +2 -2
  57. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
  58. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
  59. tests/cli/test_all.py +18 -2
  60. tests/cli/test_run.py +25 -37
  61. tests/perf/test_perf.py +29 -2
  62. evalscope/models/model.py +0 -189
  63. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  64. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  65. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  66. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/db_util.py CHANGED
@@ -16,6 +16,28 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()
 
 
+class DatabaseColumns:
+    REQUEST = 'request'
+    START_TIME = 'start_time'
+    CHUNK_TIMES = 'chunk_times'
+    SUCCESS = 'success'
+    RESPONSE_MESSAGES = 'response_messages'
+    COMPLETED_TIME = 'completed_time'
+    LATENCY = 'latency'
+    FIRST_CHUNK_LATENCY = 'first_chunk_latency'
+    PROMPT_TOKENS = 'prompt_tokens'
+    COMPLETION_TOKENS = 'completion_tokens'
+    MAX_GPU_MEMORY_COST = 'max_gpu_memory_cost'
+    TIME_PER_OUTPUT_TOKEN = 'time_per_output_token'
+
+
+def load_prompt(prompt_path_or_text):
+    if prompt_path_or_text.startswith('@'):
+        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
+            return file.read()
+    return prompt_path_or_text
+
+
 def encode_data(data) -> str:
     """Encodes data using base64 and pickle."""
     return base64.b64encode(pickle.dumps(data)).decode('utf-8')
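For context, a self-contained sketch of the `@`-file convention the new `load_prompt` helper implements (the helper body is copied from the hunk above so the snippet runs on its own; the prompt file name is hypothetical):

```python
# Self-contained sketch of the '@file' convention used by load_prompt above.
def load_prompt(prompt_path_or_text):
    if prompt_path_or_text.startswith('@'):
        # '@' prefix: treat the remainder as a file path and read the prompt from it
        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
            return file.read()
    # otherwise the argument is the literal prompt text
    return prompt_path_or_text


print(load_prompt('You are a helpful assistant.'))  # literal text, returned unchanged
# print(load_prompt('@system_prompt.txt'))          # would read the prompt from this (hypothetical) file
```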
@@ -34,20 +56,20 @@ def transpose_results(data):
 
 
 def create_result_table(cursor):
-    cursor.execute('''CREATE TABLE IF NOT EXISTS result(
-        request TEXT,
-        start_time REAL,
-        chunk_times TEXT,
-        success INTEGER,
-        response_messages TEXT,
-        completed_time REAL,
-        latency REAL,
-        first_chunk_latency REAL,
-        n_chunks INTEGER,
-        chunk_time REAL,
-        prompt_tokens INTEGER,
-        completion_tokens INTEGER,
-        max_gpu_memory_cost REAL)''')
+    cursor.execute(f'''CREATE TABLE IF NOT EXISTS result(
+        {DatabaseColumns.REQUEST} TEXT,
+        {DatabaseColumns.START_TIME} REAL,
+        {DatabaseColumns.CHUNK_TIMES} TEXT,
+        {DatabaseColumns.SUCCESS} INTEGER,
+        {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
+        {DatabaseColumns.COMPLETED_TIME} REAL,
+        {DatabaseColumns.LATENCY} REAL,
+        {DatabaseColumns.FIRST_CHUNK_LATENCY} REAL,
+        {DatabaseColumns.PROMPT_TOKENS} INTEGER,
+        {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
+        {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
+        {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
+    )''')
 
 
 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
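As an illustration of the pattern this hunk introduces, here is a standalone sketch (illustrative names only, not evalscope code): deriving the schema, inserts, and selects from one set of column-name constants keeps them from drifting apart.

```python
# Standalone sketch: a single source of truth for column names, reused by
# CREATE TABLE, INSERT and SELECT so the three cannot drift out of sync.
import sqlite3


class Columns:
    REQUEST = 'request'
    LATENCY = 'latency'
    COMPLETION_TOKENS = 'completion_tokens'


with sqlite3.connect(':memory:') as con:
    con.execute(f'''CREATE TABLE IF NOT EXISTS result(
        {Columns.REQUEST} TEXT,
        {Columns.LATENCY} REAL,
        {Columns.COMPLETION_TOKENS} INTEGER)''')
    con.execute(
        f'INSERT INTO result({Columns.REQUEST}, {Columns.LATENCY}, {Columns.COMPLETION_TOKENS}) VALUES (?, ?, ?)',
        ('hello', 0.42, 128))
    print(con.execute(f'SELECT {Columns.COMPLETION_TOKENS} FROM result').fetchall())  # [(128,)]
```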
@@ -67,24 +89,21 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 
     if benchmark_data.success:
         # Add additional columns for success case
-        additional_columns = (
-            benchmark_data.query_latency,
-            benchmark_data.first_chunk_latency,
-            benchmark_data.n_chunks,
-            benchmark_data.n_chunks_time,
-            benchmark_data.prompt_tokens,
-            benchmark_data.completion_tokens,
-            benchmark_data.max_gpu_memory_cost,
-        )
-        query = """INSERT INTO result(
-            request, start_time, chunk_times, success, response_messages,
-            completed_time, latency, first_chunk_latency,
-            n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
-        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+        additional_columns = (benchmark_data.query_latency, benchmark_data.first_chunk_latency,
+                              benchmark_data.prompt_tokens, benchmark_data.completion_tokens,
+                              benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token)
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
+            {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
+            {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
+            {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns + additional_columns)
     else:
-        query = """INSERT INTO result(
-            request, start_time, chunk_times, success, response_messages, completed_time
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
         ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)
 
@@ -160,44 +179,43 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
             logger.error(f'Error parsing chunk times: {e}')
             return []
 
-    query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
-                 'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1')
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+        {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
+        {DatabaseColumns.PROMPT_TOKENS},
+        {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
 
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]
 
     with sqlite3.connect(result_db_path) as con:
-        rows = con.execute(query_sql).fetchall()
+        cursor = con.cursor()
+        cursor.execute(query_sql)
+        columns = [description[0] for description in cursor.description]
+        rows = cursor.fetchall()
 
-        # Define index variables for columns
-        CHUNK_TIMES_INDEX = 1
-        LATENCY_INDEX = 4
-        FIRST_CHUNK_LATENCY_INDEX = 5
-        CHUNK_TIME_INDEX = 7
-        PROMPT_TOKENS_INDEX = 8
-        COMPLETION_TOKENS_INDEX = 9
+        # Create column index mapping
+        col_indices = {col: idx for idx, col in enumerate(columns)}
 
         # Prepare data for each metric
         inter_token_latencies_all = []
         for row in rows:
-            inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+            inter_token_latencies_all.extend(inter_token_latencies(row[col_indices[DatabaseColumns.CHUNK_TIMES]]))
 
         metrics = {
-            PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+            PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
             PercentileMetrics.ITL:
                 inter_token_latencies_all,
-            PercentileMetrics.TPOT:
-                [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
-                 for row in rows],
-            PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
-            PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
-            PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+            PercentileMetrics.TPOT: [row[col_indices[DatabaseColumns.TIME_PER_OUTPUT_TOKEN]] for row in rows],
+            PercentileMetrics.LATENCY: [row[col_indices[DatabaseColumns.LATENCY]] for row in rows],
+            PercentileMetrics.INPUT_TOKENS: [row[col_indices[DatabaseColumns.PROMPT_TOKENS]] for row in rows],
+            PercentileMetrics.OUTPUT_TOKENS: [row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] for row in rows],
             PercentileMetrics.OUTPUT_THROUGHPUT:
-                [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
-                 for row in rows],
-            PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
-                                                  / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
-                                                 for row in rows]
+                [(row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] / row[col_indices[DatabaseColumns.LATENCY]])
+                 if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows],
+            PercentileMetrics.TOTAL_THROUGHPUT:
+                [((row[col_indices[DatabaseColumns.PROMPT_TOKENS]] + row[col_indices[DatabaseColumns.COMPLETION_TOKENS]])
+                  / row[col_indices[DatabaseColumns.LATENCY]])
+                 if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows]
         }
 
         # Calculate percentiles for each metric
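The key change in this hunk is looking columns up by name via `cursor.description` instead of hard-coded positional indices. A standalone sketch of that pattern (illustrative table and values, not evalscope code):

```python
# Column positions are derived from cursor.description, so reordering the
# SELECT list cannot silently break the metric extraction.
import sqlite3

with sqlite3.connect(':memory:') as con:
    con.execute('CREATE TABLE result(latency REAL, completion_tokens INTEGER)')
    con.execute('INSERT INTO result VALUES (2.0, 100)')
    cursor = con.cursor()
    cursor.execute('SELECT latency, completion_tokens FROM result')
    col_indices = {desc[0]: idx for idx, desc in enumerate(cursor.description)}
    rows = cursor.fetchall()
    throughput = [row[col_indices['completion_tokens']] / row[col_indices['latency']]
                  if row[col_indices['latency']] > 0 else float('nan') for row in rows]
    print(throughput)  # [50.0]
```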
@@ -237,18 +255,18 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
 
 
 def speed_benchmark_result(result_db_path: str):
-    query_sql = """
+    query_sql = f"""
         SELECT
-            prompt_tokens,
-            ROUND(AVG(completion_tokens / latency), 2) AS avg_completion_token_per_second,
-            ROUND(AVG(max_gpu_memory_cost), 2)
+            {DatabaseColumns.PROMPT_TOKENS},
+            ROUND(AVG({DatabaseColumns.COMPLETION_TOKENS} / {DatabaseColumns.LATENCY}), 2) AS avg_completion_token_per_second,
+            ROUND(AVG({DatabaseColumns.MAX_GPU_MEMORY_COST}), 2)
         FROM
             result
         WHERE
-            success = 1 AND latency > 0
+            {DatabaseColumns.SUCCESS} = 1 AND {DatabaseColumns.LATENCY} > 0
         GROUP BY
-            prompt_tokens
-    """
+            {DatabaseColumns.PROMPT_TOKENS}
+    """  # noqa: E501
 
     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
evalscope/utils/io_utils.py CHANGED
@@ -1,3 +1,4 @@
+import base64
 import csv
 import hashlib
 import json
@@ -5,6 +6,8 @@ import jsonlines as jsonl
 import os
 import re
 import yaml
+from io import BytesIO
+from PIL import Image
 
 from evalscope.constants import DumpMode
 from evalscope.utils.logger import get_logger
@@ -266,3 +269,10 @@ def get_valid_list(input_list, candidate_list):
     """
     return [i for i in input_list if i in candidate_list], \
            [i for i in input_list if i not in candidate_list]
+
+
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+    buffered = BytesIO()
+    image.save(buffered, format=format)
+    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    return img_str
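A hedged sketch of how an image encoded with the new `PIL_to_base64` helper can be embedded in an OpenAI-style multimodal message (the helper body is copied from the hunk above so the snippet runs on its own; the data-URL message layout is the common OpenAI chat format, not something this diff defines):

```python
import base64
from io import BytesIO

from PIL import Image


def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
    # copied from the hunk above for a self-contained example
    buffered = BytesIO()
    image.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


image = Image.new('RGB', (64, 64), color=(120, 30, 30))  # dummy image for illustration
message = {
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'Describe this image.'},
        {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{PIL_to_base64(image)}'}},
    ],
}
print(message['content'][1]['image_url']['url'][:48] + '...')
```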
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.17.0'
-__release_datetime__ = '2025-07-04 17:00:00'
+__version__ = '0.17.1'
+__release_datetime__ = '2025-07-18 17:00:00'
{evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA CHANGED
@@ -1,19 +1,20 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.17.0
+Version: 0.17.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
 Author-email: contact@modelscope.cn
+License: Apache License 2.0
 Keywords: python,llm,evaluation
 Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
-Requires-Python: >=3.8
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: accelerate
@@ -94,7 +95,7 @@ Requires-Dist: rich; extra == "all"
 Requires-Dist: sse-starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: uvicorn; extra == "all"
-Requires-Dist: gradio>=5.4.0; extra == "all"
+Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
@@ -103,19 +104,26 @@ Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 Requires-Dist: torchvision; extra == "all"
 Requires-Dist: bfcl-eval; extra == "all"
-Requires-Dist: dotenv; extra == "all"
 Requires-Dist: human-eval; extra == "all"
 Requires-Dist: pytest; extra == "all"
 Requires-Dist: pytest-cov; extra == "all"
+Requires-Dist: python-dotenv; extra == "all"
 Provides-Extra: app
-Requires-Dist: gradio>=5.4.0; extra == "app"
+Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: dev
 Requires-Dist: bfcl-eval; extra == "dev"
-Requires-Dist: dotenv; extra == "dev"
 Requires-Dist: human-eval; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: python-dotenv; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: docutils>=0.16.0; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Requires-Dist: recommonmark; extra == "docs"
+Requires-Dist: sphinx>=5.3.0; extra == "docs"
+Requires-Dist: sphinx-design; extra == "docs"
+Requires-Dist: sphinxawesome-theme; extra == "docs"
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
 Provides-Extra: perf
@@ -176,16 +184,17 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 - [Basic Parameter](#basic-parameter)
 - [Output Results](#output-results)
 - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
-- [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+- [🌐 Evaluation of Model API](#-evaluation-of-model-api)
 - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
-- [Parameter](#parameter)
-- [Evaluation Backend](#evaluation-backend)
+- [Parameter Description](#parameter-description)
+- [🧪 Other Evaluation Backends](#-other-evaluation-backends)
 - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
 - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
-- [🏟️ Arena Mode](#️-arena-mode)
+- [⚔️ Arena Mode](#️-arena-mode)
 - [👷‍♂️ Contribution](#️-contribution)
+- [📚 Citation](#-citation)
 - [🔜 Roadmap](#-roadmap)
-- [Star History](#star-history)
+- [Star History](#-star-history)
 
 
 ## 📝 Introduction
@@ -249,7 +258,9 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
-
+- 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+- 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+- 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
 - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
 - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
 - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -261,6 +272,8 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+<details><summary>More</summary>
+
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -270,8 +283,6 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
 - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
 - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
-<details><summary>More</summary>
-
 - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -367,33 +378,31 @@ evalscope eval \
 
 When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:
 
-**Using Python Dictionary**
+**Using `TaskConfig`**
 
 ```python
-from evalscope.run import run_task
+from evalscope import run_task, TaskConfig
 
-task_cfg = {
-    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-    'datasets': ['gsm8k', 'arc'],
-    'limit': 5
-}
+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)
 
 run_task(task_cfg=task_cfg)
 ```
-
 <details><summary>More Startup Methods</summary>
 
-**Using `TaskConfig`**
+**Using Python Dictionary**
 
 ```python
 from evalscope.run import run_task
-from evalscope.config import TaskConfig
 
-task_cfg = TaskConfig(
-    model='Qwen/Qwen2.5-0.5B-Instruct',
-    datasets=['gsm8k', 'arc'],
-    limit=5
-)
+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}
 
 run_task(task_cfg=task_cfg)
 ```
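The paragraph in this hunk also notes that `run_task` accepts a YAML or JSON task file; a minimal hedged sketch (the file path is hypothetical, the import follows the new `from evalscope import run_task` form shown above):

```python
# Hedged sketch: run_task also accepts a task file path per the README text.
# 'my_task.yaml' is a hypothetical file containing keys such as model, datasets, limit.
from evalscope import run_task

run_task(task_cfg='my_task.yaml')
```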
@@ -496,7 +505,7 @@ To create a public link, set `share=True` in `launch()`.
 
 For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
 
-## 🌐 Evaluation of Specified Model API
+## 🌐 Evaluation of Model API
 
 Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
 
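A hedged Python sketch of the service-evaluation setup the paragraph above describes (the parameter names `api_url`, `api_key`, and `eval_type` are taken from that description and assumed to map onto `TaskConfig` fields; model name and endpoint are examples only):

```python
# Hedged sketch: evaluating an already-deployed OpenAI-compatible endpoint
# by setting eval_type to 'service'. Values below are illustrative.
from evalscope import run_task, TaskConfig

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',         # model name served by the endpoint (example)
    api_url='http://127.0.0.1:8801/v1',  # OpenAI-compatible service address (example)
    api_key='EMPTY',                     # or a real key for hosted services
    eval_type='service',
    datasets=['gsm8k'],
    limit=5,
)
run_task(task_cfg=task_cfg)
```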
@@ -547,7 +556,7 @@ evalscope eval \
 
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
 
-## Evaluation Backend
+## 🧪 Other Evaluation Backends
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
 - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -620,6 +629,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </table>
 </a>
 
+## 📚 Citation
+
+```bibtex
+@misc{evalscope_2024,
+    title={{EvalScope}: Evaluation Framework for Large Models},
+    author={ModelScope Team},
+    year={2024},
+    url={https://github.com/modelscope/evalscope}
+}
+```
+
 ## 🔜 Roadmap
 - [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
@@ -635,6 +655,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] MBPP
 
 
-## Star History
+## Star History
 
 [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)