evalscope 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (48)
  1. evalscope/arguments.py +10 -0
  2. evalscope/backend/rag_eval/utils/llm.py +1 -1
  3. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  5. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  6. evalscope/benchmarks/data_adapter.py +4 -2
  7. evalscope/benchmarks/drop/__init__.py +0 -0
  8. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  9. evalscope/benchmarks/drop/utils.py +59 -0
  10. evalscope/benchmarks/general_qa/general_qa_adapter.py +8 -4
  11. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  12. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  13. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
  14. evalscope/benchmarks/tool_bench/utils.py +202 -0
  15. evalscope/benchmarks/utils.py +3 -2
  16. evalscope/benchmarks/winogrande/__init__.py +0 -0
  17. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  18. evalscope/collections/evaluator.py +76 -26
  19. evalscope/config.py +46 -15
  20. evalscope/evaluator/evaluator.py +48 -14
  21. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  22. evalscope/metrics/llm_judge.py +3 -3
  23. evalscope/metrics/rouge_metric.py +11 -13
  24. evalscope/models/adapters/chat_adapter.py +51 -34
  25. evalscope/models/adapters/server_adapter.py +15 -19
  26. evalscope/perf/arguments.py +14 -5
  27. evalscope/perf/benchmark.py +4 -9
  28. evalscope/perf/main.py +69 -17
  29. evalscope/perf/utils/benchmark_util.py +33 -15
  30. evalscope/perf/utils/db_util.py +32 -20
  31. evalscope/perf/utils/log_utils.py +1 -1
  32. evalscope/perf/utils/rich_display.py +186 -0
  33. evalscope/report/app.py +47 -34
  34. evalscope/report/utils.py +1 -1
  35. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  36. evalscope/utils/deprecation_utils.py +42 -0
  37. evalscope/version.py +2 -2
  38. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/METADATA +49 -25
  39. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/RECORD +48 -38
  40. tests/aigc/test_t2i.py +4 -4
  41. tests/cli/test_all.py +3 -0
  42. tests/cli/test_collection.py +2 -1
  43. tests/cli/test_run.py +37 -14
  44. tests/perf/test_perf.py +27 -2
  45. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
  46. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
  47. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
  48. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/rich_display.py ADDED
@@ -0,0 +1,186 @@
+ # the following code is largely adapted from https://github.com/lework/llm-benchmark
+
+ import numpy as np
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.style import Style
+ from rich.table import Table
+ from rich.text import Text
+
+ from evalscope.utils.logger import get_logger
+ from .benchmark_util import Metrics
+ from .db_util import PercentileMetrics
+
+ logger = get_logger()
+
+
+ def analyze_results(all_results):
+     """Analyze all test results and generate a summary report"""
+     summary = []
+     total_tokens = 0
+     total_time = 0
+
+     for result in all_results:
+         total_metrics = result[0]
+         percentile_metrics = result[1]
+         percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
+         try:
+             concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
+             rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
+             avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
+             p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
+             avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
+             avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
+             p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
+             success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+                             / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
+             avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
+             p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]
+
+             # Ensure all values are valid numbers
+             if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
+                 logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
+                 continue
+
+             summary.append([
+                 concurrency,
+                 f'{rps:.2f}' if rps is not None else 'N/A',
+                 f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
+                 f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
+                 f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
+                 f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
+                 f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
+                 f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
+                 f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
+                 f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
+             ])
+
+             total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
+                 Metrics.SUCCEED_REQUESTS, 0)
+             total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
+         except Exception as e:
+             logger.warning(
+                 f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}")
+             continue
+
+     if not summary:
+         logger.warning('Error: No valid test result data')
+         return [], 0, 0
+
+     return summary, total_tokens, total_time
+
+
+ def print_summary(all_results, model_name):
+     """Print test results summary"""
+     summary, total_tokens, total_time = analyze_results(all_results)
+
+     if not summary:
+         logger.warning('No available test result data to display')
+         return
+
+     console = Console(width=100)  # Set fixed width
+
+     # Create title panel
+     title = Text('Performance Test Summary Report', style='bold')
+     console.print(Panel(title, width=60))
+
+     # Print basic information
+     basic_info = Table(show_header=False, width=60)
+     basic_info.add_column('Name', style='cyan', width=25)
+     basic_info.add_column('Value', style='green', width=35)
+
+     basic_info.add_row('Model', model_name)
+     basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
+     basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
+     basic_info.add_row('Avg Output Rate', f'{total_tokens/total_time:.2f} tokens/sec')
+
+     console.print('\nBasic Information:')
+     console.print(basic_info)
+
+     # Create detailed performance metrics table
+     table = Table(
+         title='Detailed Performance Metrics',
+         show_header=True,
+         header_style='bold cyan',
+         border_style='blue',
+         width=100,  # Set total table width
+         pad_edge=False,  # Reduce edge padding
+         min_width=60,  # Minimum width
+     )
+
+     # Add columns (set fixed column widths)
+     table.add_column('Conc.', justify='right', style='cyan')
+     table.add_column('RPS', justify='right')
+     table.add_column('Avg Lat.(s)', justify='right')
+     table.add_column('P99 Lat.(s)', justify='right')
+     table.add_column('Gen. toks/s', justify='right')
+     table.add_column('Avg TTFT(s)', justify='right')
+     table.add_column('P99 TTFT(s)', justify='right')
+     table.add_column('Avg TPOT(s)', justify='right')
+     table.add_column('P99 TPOT(s)', justify='right')
+     table.add_column('Success Rate', justify='right', style='green')
+
+     # Add data rows
+     for row in summary:
+         try:
+             # Set row style based on success rate
+             success_rate = float(row[6].rstrip('%'))
+             row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
+
+             table.add_row(
+                 str(row[0]),  # Concurrency
+                 f'{float(row[1]):.2f}',  # RPS
+                 f'{float(row[2]):.3f}',  # Average Latency
+                 f'{float(row[3]):.3f}',  # P99 Latency
+                 f'{float(row[4]):.2f}',  # Average TPS
+                 f'{float(row[5]):.3f}',  # First Token Latency
+                 f'{float(row[7]):.3f}',  # P99 TTFT
+                 f'{float(row[8]):.3f}',  # Average TPOT
+                 f'{float(row[9]):.3f}',  # P99 TPOT
+                 row[6],  # Success Rate
+                 style=row_style)
+         except ValueError as e:
+             console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
+             continue
+
+     console.print('\n')
+     console.print(table)
+
+     # Calculate and display best performance configuration
+     try:
+         best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
+         best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])
+
+         perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
+         perf_info.add_column('Metric', style='cyan', width=20)
+         perf_info.add_column('Value', style='green', width=40)
+
+         perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
+         perf_info.add_row('Lowest Latency',
+                           f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
+
+         console.print('\n')
+         console.print(perf_info)
+
+         # Performance recommendations
+         recommendations = []
+         if best_rps_idx == len(summary) - 1:
+             recommendations.append(
+                 'The system seems not to have reached its performance bottleneck, try higher concurrency')
+         elif best_rps_idx == 0:
+             recommendations.append('Consider lowering concurrency, current load may be too high')
+         else:
+             recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')
+
+         success_rate = float(summary[-1][6][:-1])
+         if success_rate < 95:
+             recommendations.append(
+                 'Success rate is low at high concurrency, check system resources or reduce concurrency')
+
+         recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
+         console.print(recommend_text)
+         for rec in recommendations:
+             console.print(f'• {rec}', style='yellow')
+
+     except Exception as e:
+         console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
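
The new `rich_display` module is only exercised by the perf CLI, so here is a hedged usage sketch (not taken from the package or its tests). It assumes each entry of `all_results` is an indexable `(total_metrics, percentile_metrics)` pair keyed by the `Metrics` and `PercentileMetrics` constants referenced above, and that the modules are importable at the paths shown in the diff; every numeric value is invented.

```python
from evalscope.perf.utils.benchmark_util import Metrics
from evalscope.perf.utils.db_util import PercentileMetrics
from evalscope.perf.utils.rich_display import print_summary

# Synthetic aggregate metrics for a single concurrency level (all values invented)
total_metrics = {
    Metrics.NUMBER_OF_CONCURRENCY: 8,
    Metrics.REQUEST_THROUGHPUT: 12.5,             # requests/sec
    Metrics.AVERAGE_LATENCY: 0.84,                # seconds
    Metrics.OUTPUT_TOKEN_THROUGHPUT: 950.0,       # tokens/sec
    Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.12,
    Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: 0.011,
    Metrics.SUCCEED_REQUESTS: 100,
    Metrics.TOTAL_REQUESTS: 100,
    Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: 76,
    Metrics.TIME_TAKEN_FOR_TESTS: 8.0,
}

# Parallel percentile lists, indexed via PERCENTILES as the code above does
percentile_metrics = {
    PercentileMetrics.PERCENTILES: ['50%', '99%'],
    PercentileMetrics.LATENCY: [0.80, 1.45],
    PercentileMetrics.TTFT: [0.10, 0.30],
    PercentileMetrics.TPOT: [0.010, 0.020],
}

# One (total_metrics, percentile_metrics) pair per concurrency level tested
print_summary([(total_metrics, percentile_metrics)], model_name='demo-model')
```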
evalscope/report/app.py CHANGED
@@ -223,6 +223,33 @@ def plot_multi_report_radar(df: pd.DataFrame):
      return fig
 
 
+ def convert_markdown_image(text):
+     if not os.path.isfile(text):
+         return text
+     # Convert the image path to a markdown image tag
+     if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+         text = os.path.abspath(text)
+         image_tag = f'![image](gradio_api/file={text})'
+         logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+         return image_tag
+     return text
+
+
+ def convert_html_tags(text):
+     # match begin label
+     text = re.sub(r'<(\w+)>', r'[\1]', text)
+     # match end label
+     text = re.sub(r'</(\w+)>', r'[/\1]', text)
+     return text
+
+
+ def process_string(string: str, max_length: int = 2048) -> str:
+     string = convert_html_tags(string)  # for display labels e.g. `<think>`
+     if max_length and len(string) > max_length:
+         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+     return string
+
+
  def dict_to_markdown(data) -> str:
      markdown_lines = []
 
@@ -230,55 +257,41 @@ def dict_to_markdown(data) -> str:
          bold_key = f'**{key}**'
 
          if isinstance(value, list):
-             value_str = '\n' + '\n'.join([f' - {item}' for item in value])
+             value_str = '\n' + '\n'.join([f'- {process_model_prediction(item, max_length=None)}' for item in value])
          elif isinstance(value, dict):
              value_str = dict_to_markdown(value)
          else:
              value_str = str(value)
 
-         value_str = process_string(value_str)
-         markdown_line = f'{bold_key}: {value_str}'
+         value_str = process_string(value_str, max_length=None)  # Convert HTML tags but don't truncate
+         markdown_line = f'{bold_key}:\n{value_str}'
          markdown_lines.append(markdown_line)
 
      return '\n\n'.join(markdown_lines)
 
 
- def convert_html_tags(text):
-     # match begin label
-     text = re.sub(r'<(\w+)>', r'[\1]', text)
-     # match end label
-     text = re.sub(r'</(\w+)>', r'[/\1]', text)
-     return text
-
-
- def convert_markdown_image(text):
-     if not os.path.isfile(text):
-         return text
-     # Convert the image path to a markdown image tag
-     if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
-         text = os.path.abspath(text)
-         image_tag = f'![image](gradio_api/file={text})'
-         logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
-         return image_tag
-     return text
+ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
+     """
+     Process model prediction output into a formatted string.
 
+     Args:
+         item: The item to process. Can be a string, list, or dictionary.
+         max_length: The maximum length of the output string.
 
- def process_string(string: str, max_length: int = 2048) -> str:
-     string = convert_html_tags(string)  # for display labels e.g. `<think>`
-     if len(string) > max_length:
-         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
-     return string
-
-
- def process_model_prediction(item: Any):
+     Returns:
+         A formatted string representation of the input.
+     """
      if isinstance(item, dict):
-         res = dict_to_markdown(item)
-         return process_string(res)
+         result = dict_to_markdown(item)
      elif isinstance(item, list):
-         res = '\n'.join([process_model_prediction(item) for item in item])
-         return process_string(res)
+         result = '\n'.join([f'- {process_model_prediction(i, max_length=None)}' for i in item])
      else:
-         return process_string(str(item))
+         result = str(item)
+
+     # Apply HTML tag conversion and truncation only at the final output
+     if max_length is not None:
+         return process_string(result, max_length)
+     return result
 
 
  def normalize_score(score):
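
For context on the reworked report helpers, here is a hedged illustration of `process_model_prediction`, based only on the code in this hunk. The sample prediction is invented, and the import assumes `evalscope.report.app` (which needs the `app` extra) is importable.

```python
from evalscope.report.app import process_model_prediction  # requires the `app` extra (gradio, plotly)

# Invented model output mixing a tagged string and a list
prediction = {
    'answer': '<think>reasoning</think> 42',
    'choices': ['A', 'B'],
}

# Dicts are rendered via dict_to_markdown, lists become bullet items, and
# HTML-like tags are rewritten (<think> -> [think]); truncation to max_length
# now happens only once, at the final step.
print(process_model_prediction(prediction, max_length=2048))
# Roughly:
# **answer**:
# [think]reasoning[/think] 42
#
# **choices**:
#
# - A
# - B
```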
evalscope/report/utils.py CHANGED
@@ -96,7 +96,7 @@ class Report:
 
      @classmethod
      def from_json(cls, json_file: str):
-         with open(json_file, 'r') as f:
+         with open(json_file, 'r', encoding='utf-8') as f:
              data = json.load(f)
              return cls.from_dict(data)
 
evalscope/third_party/toolbench_static/toolbench_static.py CHANGED
@@ -6,11 +6,12 @@ from typing import Union
  from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
  from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
  from evalscope.utils import get_logger
+ from evalscope.utils.deprecation_utils import deprecated
  from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
  logger = get_logger()
 
-
+ @deprecated(since='0.15.1', remove_in='0.18.0', alternative='Native implementation of ToolBench')
  def run_task(task_cfg: Union[str, dict]):
 
      if isinstance(task_cfg, str):
evalscope/utils/deprecation_utils.py ADDED
@@ -0,0 +1,42 @@
+ import functools
+ import inspect
+ from typing import Callable, Optional
+
+ from .logger import get_logger
+
+ logger = get_logger()
+
+
+ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optional[str] = None) -> Callable:
+     """
+     Decorator to mark functions as deprecated.
+
+     :param since: String indicating the version since deprecation
+     :param remove_in: Optional string indicating the version when it will be removed
+     :param alternative: Optional string suggesting an alternative
+     :return: Decorated function
+     """
+
+     def decorator(func: Callable) -> Callable:
+
+         @functools.wraps(func)
+         def wrapper(*args, **kwargs):
+             # Get the file name where the function is defined
+             file_name = inspect.getfile(func)
+
+             # Construct the warning message
+             warning_parts = [
+                 f'{func.__name__} in {file_name} has been deprecated since version {since}',
+                 f'and will be removed in version {remove_in}' if remove_in else None,
+                 f'Use {alternative} instead' if alternative else None
+             ]
+             warning_message = '. '.join(filter(None, warning_parts))
+
+             # Log the warning
+             logger.warning(warning_message)
+
+             return func(*args, **kwargs)
+
+         return wrapper
+
+     return decorator
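
The decorator is applied to the ToolBench entry point above. Here is a minimal usage sketch; the decorated function is hypothetical and not part of evalscope.

```python
from evalscope.utils.deprecation_utils import deprecated


@deprecated(since='0.16.0', remove_in='0.18.0', alternative='new_scoring_api')
def old_scoring_api(x):
    # Hypothetical function used only to illustrate the decorator
    return x * 2


old_scoring_api(3)
# Logs a warning roughly of the form:
# "old_scoring_api in /path/to/example.py has been deprecated since version 0.16.0.
#  and will be removed in version 0.18.0. Use new_scoring_api instead"
```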
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- __version__ = '0.15.0'
- __release_datetime__ = '2025-04-29 00:00:00'
+ __version__ = '0.16.0'
+ __release_datetime__ = '2025-05-19 18:00:00'
{evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.15.0
+ Version: 0.16.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -26,12 +26,11 @@ Requires-Dist: latex2sympy2
  Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]
  Requires-Dist: nltk>=3.9
- Requires-Dist: omegaconf
  Requires-Dist: openai
  Requires-Dist: pandas
  Requires-Dist: pillow
  Requires-Dist: pyarrow
- Requires-Dist: pyyaml
+ Requires-Dist: pyyaml>=5.1
  Requires-Dist: requests
  Requires-Dist: rouge-chinese
  Requires-Dist: rouge-score>=0.1.0
@@ -48,6 +47,7 @@ Requires-Dist: word2number
  Provides-Extra: aigc
  Requires-Dist: diffusers; extra == "aigc"
  Requires-Dist: iopath; extra == "aigc"
+ Requires-Dist: omegaconf; extra == "aigc"
  Requires-Dist: open-clip-torch; extra == "aigc"
  Requires-Dist: opencv-python; extra == "aigc"
  Provides-Extra: all
@@ -61,12 +61,11 @@ Requires-Dist: latex2sympy2; extra == "all"
  Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
- Requires-Dist: omegaconf; extra == "all"
  Requires-Dist: openai; extra == "all"
  Requires-Dist: pandas; extra == "all"
  Requires-Dist: pillow; extra == "all"
  Requires-Dist: pyarrow; extra == "all"
- Requires-Dist: pyyaml; extra == "all"
+ Requires-Dist: pyyaml>=5.1; extra == "all"
  Requires-Dist: requests; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -92,13 +91,15 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
  Requires-Dist: aiohttp; extra == "all"
  Requires-Dist: fastapi; extra == "all"
  Requires-Dist: numpy; extra == "all"
+ Requires-Dist: rich; extra == "all"
  Requires-Dist: sse-starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
- Requires-Dist: unicorn; extra == "all"
+ Requires-Dist: uvicorn; extra == "all"
  Requires-Dist: gradio==5.4.0; extra == "all"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
  Requires-Dist: diffusers; extra == "all"
  Requires-Dist: iopath; extra == "all"
+ Requires-Dist: omegaconf; extra == "all"
  Requires-Dist: open-clip-torch; extra == "all"
  Requires-Dist: opencv-python; extra == "all"
  Provides-Extra: app
@@ -110,9 +111,10 @@ Provides-Extra: perf
  Requires-Dist: aiohttp; extra == "perf"
  Requires-Dist: fastapi; extra == "perf"
  Requires-Dist: numpy; extra == "perf"
+ Requires-Dist: rich; extra == "perf"
  Requires-Dist: sse-starlette; extra == "perf"
  Requires-Dist: transformers; extra == "perf"
- Requires-Dist: unicorn; extra == "perf"
+ Requires-Dist: uvicorn; extra == "perf"
  Provides-Extra: rag
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
  Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
@@ -177,9 +179,23 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 
  ## 📝 Introduction
 
- EvalScope is [ModelScope](https://modelscope.cn/)'s official framework for model evaluation and benchmarking, designed for diverse assessment needs. It supports various model types including large language models, multimodal, embedding, reranker, and CLIP models.
+ EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
+
+ - 🧠 Large Language Models
+ - 🎨 Multimodal Models
+ - 🔍 Embedding Models
+ - 🏆 Reranker Models
+ - 🖼️ CLIP Models
+ - 🎭 AIGC Models (Image-to-Text/Video)
+ - ...and more!
+
+ EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
+
+ - 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
+ - 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
+ - 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
 
- The framework accommodates multiple evaluation scenarios such as end-to-end RAG evaluation, arena mode, and inference performance testing. It features built-in benchmarks and metrics like MMLU, CMMLU, C-Eval, and GSM8K. Seamlessly integrated with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, EvalScope enables one-click evaluations, offering comprehensive support for model training and assessment 🚀
+ Below is the overall architecture diagram of EvalScope:
 
  <p align="center">
  <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -214,6 +230,8 @@ Please scan the QR code below to join our community groups:
 
  ## 🎉 News
 
+ - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
+ - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -479,26 +497,27 @@ For more customized evaluations, such as customizing model parameters or dataset
 
  ```shell
  evalscope eval \
- --model Qwen/Qwen2.5-0.5B-Instruct \
- --model-args revision=master,precision=torch.float16,device_map=auto \
- --generation-config do_sample=true,temperature=0.5 \
+ --model Qwen/Qwen3-0.6B \
+ --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
  --datasets gsm8k \
  --limit 10
  ```
 
- ### Parameter
- - `--model-args`: Model loading parameters, separated by commas in `key=value` format. Default parameters:
- - `revision`: Model version, default is `master`
- - `precision`: Model precision, default is `auto`
- - `device_map`: Model device allocation, default is `auto`
- - `--generation-config`: Generation parameters, separated by commas in `key=value` format. Default parameters:
- - `do_sample`: Whether to use sampling, default is `false`
- - `max_length`: Maximum length, default is 2048
- - `max_new_tokens`: Maximum length of generation, default is 512
- - `--dataset-args`: Configuration parameters for evaluation datasets, passed in `json` format. The key is the dataset name, and the value is the parameters. Note that it needs to correspond one-to-one with the values in the `--datasets` parameter:
+ ### Parameter Description
+ - `--model-args`: Model loading parameters, passed as a JSON string:
+ - `revision`: Model version
+ - `precision`: Model precision
+ - `device_map`: Device allocation for the model
+ - `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
+ - `do_sample`: Whether to use sampling
+ - `temperature`: Generation temperature
+ - `max_new_tokens`: Maximum length of generated tokens
+ - `chat_template_kwargs`: Model inference template parameters
+ - `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
  - `few_shot_num`: Number of few-shot examples
- - `few_shot_random`: Whether to randomly sample few-shot data, if not set, defaults to `true`
+ - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
 
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
 
@@ -517,6 +536,11 @@ A stress testing tool focused on large language models, which can be customized
 
  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
 
+ **Output example**
+
+ ![multi_perf](docs/en/user_guides/stress_test/images/multi_perf.png)
+
+
  **Supports wandb for recording results**
 
  ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
@@ -565,7 +589,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  </a>
 
  ## 🔜 Roadmap
- - [ ] Support for better evaluation report visualization
+ - [x] Support for better evaluation report visualization
  - [x] Support for mixed evaluations across multiple datasets
  - [x] RAG evaluation
  - [x] VLM evaluation
@@ -575,7 +599,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  - [x] Multi-modal evaluation
  - [ ] Benchmarks
  - [ ] GAIA
- - [ ] GPQA
+ - [x] GPQA
  - [x] MBPP
 