evalscope 0.13.0__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/collections/evaluator.py +1 -1
- evalscope/config.py +5 -2
- evalscope/constants.py +1 -0
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +11 -40
- evalscope/perf/benchmark.py +34 -28
- evalscope/perf/main.py +1 -1
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/utils/db_util.py +3 -0
- evalscope/run.py +14 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/METADATA +33 -30
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/RECORD +22 -21
- tests/cli/test_run.py +41 -11
- tests/perf/test_perf.py +23 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/LICENSE +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/WHEEL +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py
CHANGED
@@ -181,7 +181,7 @@ class EvaluatorCollection:
         answers_list = jsonl_to_list(pred_file_path)
         indices = set()
         for answer in answers_list:
-            index = answer
+            index = answer.get(AnswerKeys.INDEX)
             answer_dict[index] = answer
             indices.add(index)
         data = []
evalscope/config.py
CHANGED
@@ -81,7 +81,7 @@ class TaskConfig:
     def __post_init__(self):
         if (not self.model_id) and self.model:
             if isinstance(self.model, CustomModel):
-                self.model_id =
+                self.model_id = self.model.config.get('model_id', 'custom_model')
             else:
                 self.model_id = os.path.basename(self.model).rstrip(os.sep)
                 # fix path error, see http://github.com/modelscope/evalscope/issues/377
@@ -92,7 +92,10 @@ class TaskConfig:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1

     def to_dict(self):
-
+        result = self.__dict__.copy()
+        if isinstance(self.model, CustomModel):
+            result['model'] = self.model.__class__.__name__
+        return result

     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
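The new `to_dict()` exists because a live `CustomModel` instance attached to `TaskConfig.model` is not JSON-serializable, so dumping the raw `__dict__` either fails or leaks an object repr; the object is swapped for its class name first. A minimal standalone sketch of the same pattern, using made-up classes rather than evalscope's actual `TaskConfig`/`CustomModel`:

```python
import json


class MyCustomModel:
    """Stand-in for a custom model object attached to a task config."""


config = {'model': MyCustomModel(), 'datasets': ['gsm8k'], 'limit': 10}

# json.dumps(config) would raise: Object of type MyCustomModel is not JSON serializable
serializable = dict(config)
if not isinstance(serializable['model'], str):
    serializable['model'] = serializable['model'].__class__.__name__

print(json.dumps(serializable, indent=4))  # ... "model": "MyCustomModel" ...
```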
evalscope/constants.py
CHANGED
evalscope/evaluator/evaluator.py
CHANGED
@@ -81,7 +81,7 @@ class Evaluator(object):
         for subset_name, prompts_list in prompts.items():
             limit = self.task_cfg.limit or len(prompts_list)
             for index, prompt in enumerate(prompts_list[:limit]):
-                prompt[
+                prompt[AnswerKeys.INDEX] = index
                 limited_prompts[subset_name].append(prompt)

         return limited_prompts
@@ -97,7 +97,8 @@ class Evaluator(object):
         answer_d[AnswerKeys.ANSWER_ID] = answer_id
         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
         return answer_d

     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
@@ -117,7 +118,7 @@ class Evaluator(object):
         return answers_list, prompts_list

     def get_answered_indices(answers_list: List[Dict]) -> List[int]:
-        indices = [answer
+        indices = [answer.get(AnswerKeys.INDEX) for answer in answers_list]

         if all(index is None for index in indices):
             return list(range(len(answers_list)))
@@ -238,7 +239,7 @@ class Evaluator(object):
             pred = pred_content

         choice[ReviewKeys.REVIEW] = {
-            ReviewKeys.GOLD: gold_content,
+            ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
             ReviewKeys.PRED: pred,
             ReviewKeys.RESULT: review_result
         }
evalscope/models/custom_adapter.py
CHANGED
@@ -66,4 +66,4 @@ class CustomModelAdapter(BaseModelAdapter):
         else:
             raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')

-        return self.custom_model.predict(prompts=in_prompts, **kwargs)
+        return self.custom_model.predict(prompts=in_prompts, origin_inputs=inputs, **kwargs)
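Because the adapter now forwards the untouched samples as `origin_inputs`, a user-defined custom model's `predict` should accept that keyword (or a catch-all `**kwargs`) to stay compatible with 0.13.1. A hedged sketch of a tolerant user-side signature; the class below is illustrative only, not the `CustomModel` base class shipped in evalscope/models/custom/custom_model.py:

```python
from typing import Any, Dict, List, Optional


class MyModel:
    """Illustrative custom model; only the predict() calling convention matters here."""

    def predict(self,
                prompts: List[Any],
                origin_inputs: Optional[List[Dict[str, Any]]] = None,
                **kwargs) -> List[Dict[str, Any]]:
        # origin_inputs carries the raw eval samples (new in this release);
        # **kwargs keeps the method tolerant of further adapter-side keywords.
        results = []
        for i, prompt in enumerate(prompts):
            raw = origin_inputs[i] if origin_inputs else None
            results.append({'prompt': prompt, 'raw_input': raw, 'response': 'stub output'})
        return results
```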
evalscope/perf/arguments.py
CHANGED
@@ -24,6 +24,7 @@ class Arguments:
     connect_timeout: int = 600  # Connection timeout in seconds
     read_timeout: int = 600  # Read timeout in seconds
     api_key: Optional[str] = None
+    no_test_connection: bool = False  # Test the connection before starting the benchmark

     # Performance and parallelism
     number: Optional[int] = None  # Number of requests to be made
@@ -40,8 +41,9 @@ class Arguments:
     outputs_dir: str = DEFAULT_WORK_DIR

     # Prompt settings
-    max_prompt_length: int =
+    max_prompt_length: int = 131072  # Maximum length of the prompt
     min_prompt_length: int = 0  # Minimum length of the prompt
+    prefix_length: int = 0  # Length of the prefix, only for random dataset
     prompt: Optional[str] = None  # The prompt text
     query_template: Optional[str] = None  # Template for the query

@@ -65,44 +67,12 @@ class Arguments:

     @staticmethod
     def from_args(args):
-
-
-
-
-
-
-            connect_timeout=args.connect_timeout,
-            read_timeout=args.read_timeout,
-            number=args.number,
-            parallel=args.parallel,
-            rate=args.rate,
-            log_every_n_query=args.log_every_n_query,
-            headers=args.headers,
-            wandb_api_key=args.wandb_api_key,
-            name=args.name,
-            outputs_dir=args.outputs_dir,
-            debug=args.debug,
-            tokenizer_path=args.tokenizer_path,
-            api=args.api,
-            max_prompt_length=args.max_prompt_length,
-            min_prompt_length=args.min_prompt_length,
-            prompt=args.prompt,
-            query_template=args.query_template,
-            dataset=args.dataset,
-            dataset_path=args.dataset_path,
-            frequency_penalty=args.frequency_penalty,
-            logprobs=args.logprobs,
-            max_tokens=args.max_tokens,
-            min_tokens=args.min_tokens,
-            n_choices=args.n_choices,
-            seed=args.seed,
-            stop=args.stop,
-            stop_token_ids=args.stop_token_ids,
-            stream=args.stream,
-            temperature=args.temperature,
-            top_p=args.top_p,
-            top_k=args.top_k,
-        )
+        # Convert Namespace to a dictionary and filter out None values
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
+        return Arguments(**args_dict)

     def __post_init__(self):
         self.headers = self.headers or {}  # Default to empty dictionary
@@ -153,6 +123,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
     parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')
+    parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501

     # Performance and parallelism
     parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
@@ -168,6 +139,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # Prompt settings
     parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
     parser.add_argument('--min-prompt-length', type=int, default=0, help='Minimum input prompt length')
+    parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')

@@ -193,7 +165,6 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
-
     # yapf: enable

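The rewritten `from_args` relies on a common argparse-to-dataclass pattern: every CLI option defaults to `None`, `None` values are filtered out, and the dataclass defaults then apply. A standalone sketch of that pattern; the field names are a small illustrative subset, not the full `Arguments` class:

```python
import argparse
from dataclasses import dataclass


@dataclass
class PerfArgs:
    model: str = ''
    parallel: int = 1
    max_prompt_length: int = 131072  # dataclass default wins when the flag is omitted


parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
parser.add_argument('--parallel', type=int, default=None)
parser.add_argument('--max-prompt-length', type=int, default=None)

ns = parser.parse_args(['--model', 'qwen2.5', '--parallel', '4'])
args_dict = {k: v for k, v in vars(ns).items() if v is not None}
args_dict.pop('func', None)  # subcommand callback, if a CLI entrypoint added one

print(PerfArgs(**args_dict))
# PerfArgs(model='qwen2.5', parallel=4, max_prompt_length=131072)
```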
evalscope/perf/benchmark.py
CHANGED
@@ -150,39 +150,45 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
         name = args.name if args.name else f'{args.model_id}_{current_time}'
         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

-
-
-
-
-
-
-
-
-
-
-
-
+    collected_benchmark_data = []
+
+    with tqdm(desc='Processing') as pbar:
+        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+            try:
+                # Attempt to get benchmark data from the queue with a timeout
+                benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
+                benchmark_data_queue.task_done()
+            except asyncio.TimeoutError:
+                # If timeout, continue to the next iteration
+                continue
+
+            # Update metrics based on the benchmark data
+            metrics.update_metrics(benchmark_data, api_plugin)

-
-
+            # Collect benchmark data for later database insertion
+            collected_benchmark_data.append(benchmark_data)

-
-
-            con.commit()
+            # Create a message with the updated metrics
+            message = metrics.create_message()

-
-
+            # Log the message to wandb if the api key is provided
+            if args.wandb_api_key:
+                wandb.log(message)

-
-
-
+            # Log the message to the logger every n queries
+            if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                msg = json.dumps(message, ensure_ascii=False, indent=2)
+                logger.info(msg)

-
-            if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-                msg = json.dumps(message, ensure_ascii=False, indent=2)
-                logger.info(msg)
+            pbar.update(1)  # Update the progress bar

-
+    # Now perform database operations after all benchmark data has been processed
+    with sqlite3.connect(result_db_path) as con:
+        cursor = con.cursor()
+        create_result_table(cursor)
+        for benchmark_data in collected_benchmark_data:
+            insert_benchmark_data(cursor, benchmark_data)
+        con.commit()

     return metrics, result_db_path

@@ -199,7 +205,7 @@ async def start_server(args: Arguments) -> bool:
     else:
         args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

-    if not await test_connection(args):
+    if (not args.no_test_connection) and (not await test_connection(args)):
        raise TimeoutError('Test connection failed')

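The reworked worker above is a standard asyncio consumer: poll the queue with a short timeout so the loop can keep re-checking the completion event, buffer what was consumed, and write to the database once after the loop instead of per item. A self-contained sketch of the same pattern; the producer/consumer names are illustrative, not evalscope's API:

```python
import asyncio


async def produce(queue: asyncio.Queue, done: asyncio.Event, n: int) -> None:
    for i in range(n):
        await queue.put(i)
        await asyncio.sleep(0.005)  # simulate requests finishing over time
    done.set()  # no more items will arrive


async def consume(queue: asyncio.Queue, done: asyncio.Event) -> list:
    collected = []
    # Keep draining until the producer is done AND the queue is empty.
    while not (done.is_set() and queue.empty()):
        try:
            item = await asyncio.wait_for(queue.get(), timeout=0.01)
            queue.task_done()
        except asyncio.TimeoutError:
            continue  # timed out: loop around and re-check the exit condition
        collected.append(item)  # buffered; a single flush would happen after the loop
    return collected


async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    done = asyncio.Event()
    _, collected = await asyncio.gather(produce(queue, done, 10), consume(queue, done))
    print(f'collected {len(collected)} items')  # collected 10 items


if __name__ == '__main__':
    asyncio.run(main())
```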
evalscope/perf/main.py
CHANGED
@@ -32,7 +32,7 @@ def run_perf_benchmark(args):
     if platform.system() == 'Windows':
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

-    loop = asyncio.
+    loop = asyncio.new_event_loop()
     if platform.system() != 'Windows':
         add_signal_handlers(loop)

evalscope/perf/plugin/datasets/__init__.py
CHANGED
@@ -3,4 +3,5 @@ from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
 from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
 from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
 from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
 from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/openqa.py
CHANGED
@@ -1,5 +1,5 @@
 import json
-import
+import os
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
@@ -18,16 +18,11 @@ class OpenqaDatasetPlugin(DatasetPluginBase):

     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
-
-
-
-
-
-                'open_qa.jsonl',
-                '--local_dir',
-                './data',
-            ])
-            self.query_parameters.dataset_path = './data/open_qa.jsonl'
+            from modelscope import dataset_snapshot_download
+
+            file_name = 'open_qa.jsonl'
+            local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+            self.query_parameters.dataset_path = os.path.join(local_path, file_name)

         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
evalscope/perf/plugin/datasets/random_dataset.py
ADDED
@@ -0,0 +1,51 @@
+import numpy as np
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('random')
+class RandomDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.'  # noqa: E501
+
+        from modelscope import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
+        self.prefix_length = self.query_parameters.prefix_length
+        self.prefix_ids = self.get_random_inputs(self.prefix_length)
+        self.template_len = self.get_template_len()
+        self.number = self.query_parameters.number or 1
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+        max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+
+        assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
+        assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
+
+        # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1  # noqa: E501
+        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+        for i in range(self.number):
+            prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
+            prompt = self.tokenizer.decode(
+                self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
+            yield [{'role': 'user', 'content': prompt}]
+
+    def get_random_inputs(self, length: int) -> List[int]:
+        if length <= 0:
+            return []
+        input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+        return input_ids
+
+    def get_template_len(self):
+        empty_message = [{'role': 'user', 'content': ''}]
+        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+        return len(template)
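The new plugin hits an exact token budget by taking a wrap-around walk through the vocabulary from a random offset, optionally behind a shared random prefix, then decoding those ids with the served model's tokenizer (the vLLM benchmark trick referenced in the file). A numpy-only sketch of just the id-generation step; the vocabulary size and lengths below are made-up numbers and the decode step is omitted:

```python
import numpy as np

vocab_size = 32000            # stand-in for tokenizer.vocab_size
prefix_length = 32            # shared prefix reused by every request (see --prefix-length)
number = 4                    # number of prompts to generate
min_len, max_len = 100, 200   # per-prompt body length after subtracting the chat-template overhead

rng = np.random.default_rng(42)
prefix_ids = rng.integers(0, vocab_size, size=prefix_length).tolist()

input_lens = rng.integers(min_len, max_len + 1, size=number)
offsets = rng.integers(0, vocab_size, size=number)

for i in range(number):
    # A deterministic run of consecutive ids starting at a random offset: prompts
    # differ from each other while each one is exactly input_lens[i] tokens long.
    body_ids = (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
    prompt_ids = prefix_ids + body_ids.tolist()
    print(len(prompt_ids))  # prefix_length + input_lens[i]
```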
evalscope/perf/utils/db_util.py
CHANGED
@@ -2,6 +2,7 @@ import base64
 import json
 import os
 import pickle
+import re
 import sqlite3
 import sys
 from datetime import datetime
@@ -91,6 +92,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
     output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    # Filter illegal characters
+    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
     if not os.path.exists(output_path):
         os.makedirs(output_path, exist_ok=True)
     logger.info(f'Save the result to: {output_path}')
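The added `re.sub` keeps the derived output directory creatable on Windows by replacing characters that are illegal in paths there (and the shell-hostile `|`), for example:

```python
import re

raw = 'outputs/20250321_153000/Qwen/QwQ-32B:latest'
print(re.sub(r'[<>:"|?*]', '_', raw))  # outputs/20250321_153000/Qwen/QwQ-32B_latest
```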
evalscope/run.py
CHANGED
@@ -39,9 +39,11 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

     if task_cfg.eval_backend != EvalBackend.NATIVE:
-
+        result = run_non_native_backend(task_cfg, outputs)
     else:
-
+        result = evaluate_model(task_cfg, outputs)
+
+    return result


 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -117,6 +119,16 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         res_dict = evaluator.eval()
         eval_results[evaluator.dataset_name] = res_dict

+    # Clean up
+    if base_model is not None:
+        import gc
+        import torch
+
+        del base_model
+        del evaluators
+        torch.cuda.empty_cache()
+        gc.collect()
+
     return eval_results

evalscope/version.py
CHANGED
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.0
+Version: 0.13.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -239,7 +239,8 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

-- 🔥 **[2025.03.
+- 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+- 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
 - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
 - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -277,23 +278,24 @@
 We recommend using conda to manage your environment and installing dependencies with pip:

 1. Create a conda environment (optional)
-
-
-
-
-
-
+```shell
+# It is recommended to use Python 3.10
+conda create -n evalscope python=3.10
+# Activate the conda environment
+conda activate evalscope
+```

 2. Install dependencies using pip
-
-
-
-
-
-
-
-
-
+```shell
+pip install evalscope # Install Native backend (default)
+# Additional options
+pip install 'evalscope[opencompass]' # Install OpenCompass backend
+pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
+pip install 'evalscope[rag]' # Install RAGEval backend
+pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
+pip install 'evalscope[app]' # Install dependencies for visualization
+pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```

 > [!WARNING]
 > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -307,21 +309,22 @@ We recommend using conda to manage your environment and installing dependencies

 ### Method 2: Install from Source
 1. Download the source code
-
-
-
+```shell
+git clone https://github.com/modelscope/evalscope.git
+```

 2. Install dependencies
-
-
-
-
-
-
-
-
-
-
+```shell
+cd evalscope/
+pip install -e . # Install Native backend
+# Additional options
+pip install -e '.[opencompass]' # Install OpenCompass backend
+pip install -e '.[vlmeval]' # Install VLMEvalKit backend
+pip install -e '.[rag]' # Install RAGEval backend
+pip install -e '.[perf]' # Install Perf dependencies
+pip install -e '.[app]' # Install visualization dependencies
+pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```


 ## 🚀 Quick Start
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
 evalscope/arguments.py,sha256=VhZd7a8PoZK01qFCMEADLINqLYi6njRqRb50iR1l1lo,5241
-evalscope/config.py,sha256=
-evalscope/constants.py,sha256=
-evalscope/run.py,sha256=
+evalscope/config.py,sha256=wLrc8a7z28IFPRaeUzot5HGtSDY_13KR-3kRyFKEGx8,9476
+evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
+evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
 evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
-evalscope/version.py,sha256=
+evalscope/version.py,sha256=Y30-zF2dwch3upMc0t5yNNjIgvI-LQQWFhftRQgXvOk,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -180,11 +180,11 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
 evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
 evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
 evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
-evalscope/collections/evaluator.py,sha256=
+evalscope/collections/evaluator.py,sha256=YJy8Dj35XCdCwhNDwZecJkeW1_ZgIOsuRLFzfe3SyV8,12724
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=
+evalscope/evaluator/evaluator.py,sha256=szRQrXH5ILpUljb14lcunuOt185H8Um1paviTokraA4,19845
 evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
@@ -203,7 +203,7 @@ evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,
 evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
 evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
 evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
-evalscope/models/custom_adapter.py,sha256=
+evalscope/models/custom_adapter.py,sha256=AGztmZ0aT0g2flh4B4NaiZ8LCDg8tT0gVNxmrP5W1mA,2401
 evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
 evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
 evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
@@ -212,10 +212,10 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
 evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
 evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/perf/arguments.py,sha256=
-evalscope/perf/benchmark.py,sha256=
+evalscope/perf/arguments.py,sha256=hBR6TXCoLkHRLxrwXacmierfFZhyQaT5hnKAfp-vE6I,8990
+evalscope/perf/benchmark.py,sha256=VYcFhSoZXcLoNXpFYxOFxLbBLv_8Tn74Qklim7vELCM,9889
 evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
-evalscope/perf/main.py,sha256=
+evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -223,18 +223,19 @@ evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqY
 evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
 evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
 evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
-evalscope/perf/plugin/datasets/__init__.py,sha256=
+evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
 evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
 evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
 evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
 evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
 evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
-evalscope/perf/plugin/datasets/openqa.py,sha256=
+evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
+evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
 evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
 evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
-evalscope/perf/utils/db_util.py,sha256=
+evalscope/perf/utils/db_util.py,sha256=hRXixxpNBrACF43reOJV5SoO1vj34cqoNMaTKH_oLLE,9100
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
 evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -312,9 +313,9 @@ tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/cli/test_all.py,sha256=1wwXtdjBmWYLhs5TXOJhZBwPm2qd9FYFqQSemXWKNUs,3865
 tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
-tests/cli/test_run.py,sha256=
+tests/cli/test_run.py,sha256=Gk8uCT0IjDSf2sf-TXeQFV83ovNzRs4GcAkQ1DhRJEU,15929
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=
+tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
 tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
@@ -325,9 +326,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
-evalscope-0.13.0.dist-info/LICENSE,sha256=
-evalscope-0.13.0.dist-info/METADATA,sha256=
-evalscope-0.13.0.dist-info/WHEEL,sha256=
-evalscope-0.13.0.dist-info/entry_points.txt,sha256=
-evalscope-0.13.0.dist-info/top_level.txt,sha256=
-evalscope-0.13.0.dist-info/RECORD,,
+evalscope-0.13.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.13.1.dist-info/METADATA,sha256=luYebd_U93wnTkXcv_MYPfd9-JRz51DjWB6Bh6phspU,33546
+evalscope-0.13.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.13.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.13.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.13.1.dist-info/RECORD,,
tests/cli/test_run.py
CHANGED
@@ -203,7 +203,7 @@ class TestRun(unittest.TestCase):
         print(res)

     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def
+    def test_run_one_task(self):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
@@ -223,6 +223,33 @@ class TestRun(unittest.TestCase):

         run_task(task_cfg=task_cfg)

+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_task_loop(self):
+        os.environ['CUDA_VISIBLE_DEVICES'] = '2'
+        from evalscope.config import TaskConfig
+
+        task_cfg1 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model1',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg2 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model2',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg3 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model3',
+            datasets=['iquiz'],
+            limit=10
+        )
+
+        run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
+
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_server_model(self):
         from evalscope.config import TaskConfig
@@ -365,20 +392,20 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='
+            model='qwq-32b',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
                 # 'math_500',
-                'aime24',
+                # 'aime24',
                 # 'competition_math',
                 # 'arc',
                 # 'gsm8k'
                 # 'truthful_qa',
                 # 'simple_qa',
                 # # 'chinese_simpleqa',
-
+                'live_code_bench',
                 # 'humaneval'
                 # 'general_qa'
             ],
@@ -387,10 +414,9 @@ class TestRun(unittest.TestCase):
                 'subset_list': ['Level 4']
             },
             'live_code_bench': {
-                'subset_list': ['v4_v5'],
                 'extra_params': {
-                    'start_date': '2024-
-                    'end_date': '2025-
+                    'start_date': '2024-08-01',
+                    'end_date': '2025-02-28'
                 },
                 'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
             },
@@ -402,19 +428,23 @@ class TestRun(unittest.TestCase):
                     ]
                 },
             },
-            eval_batch_size=
-            limit=5,
+            eval_batch_size=10,
+            # limit=5,
             judge_strategy=JudgeStrategy.AUTO,
+            judge_worker_num=8,
            judge_model_args={
                 'model_id': 'qwen2.5-7b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
             },
             generation_config={
-                'max_new_tokens':
+                'max_new_tokens': 20000,
                 'temperature': 0.0,
                 'seed': 42,
-            }
+            },
+            timeout=60000,
+            stream=True,
+            # use_cache='outputs/20250320_143658'
         )

         run_task(task_cfg=task_cfg)
tests/perf/test_perf.py
CHANGED
@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+from dotenv import dotenv_values

+env = dotenv_values('.env')
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest

@@ -96,6 +98,27 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)

+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_local_random(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=20,
+            model='Qwen2.5-0.5B-Instruct',
+            url='http://127.0.0.1:8801/v1/chat/completions',
+            api='openai',
+            dataset='random',
+            min_tokens=1024,
+            max_tokens=1024,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            number=40,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            debug=True,
+        )
+        run_perf_benchmark(task_cfg)
+

 if __name__ == '__main__':
     unittest.main(buffer=False)
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/LICENSE
File without changes
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/WHEEL
File without changes
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt
File without changes
{evalscope-0.13.0.dist-info → evalscope-0.13.1.dist-info}/top_level.txt
File without changes