evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +35 -0
- evalscope/api/benchmark/meta.py +6 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/state.py +12 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +47 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +0 -1
- evalscope/api/model/generate_config.py +1 -3
- evalscope/api/model/model.py +4 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +2 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
- evalscope/benchmarks/bfcl/generation.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +72 -13
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +6 -4
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +20 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +7 -4
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/utils/benchmark_util.py +8 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/generator.py +8 -87
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +42 -1
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
- tests/benchmark/test_eval.py +30 -31
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
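
Among the notable additions in 1.0.1 are new vision-language benchmark adapters (mmmu, mmmu_pro, math_vista) and an image-edit benchmark (gedit). As a minimal sketch of how these can be driven, mirroring the test_vlm_benchmark test added in tests/cli/test_all.py below (the model name, endpoint, and environment variable are illustrative values taken from that test):

import os

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

# Evaluate the newly added 'mmmu' vision-language benchmark against an
# OpenAI-compatible endpoint; limit=1 keeps the smoke run cheap.
task_cfg = TaskConfig(
    model='qwen-vl-plus',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=os.environ.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['mmmu'],
    dataset_args={'mmmu': {'subset_list': ['Accounting']}},
    limit=1,
)
run_task(task_cfg=task_cfg)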

tests/cli/test_all.py
CHANGED

@@ -17,44 +17,44 @@ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 logger = get_logger()

 datasets=[
-    [38 removed dataset entries; original content not captured in the extraction]
+    'iquiz',
+    'ifeval',
+    'mmlu',
+    'mmlu_pro',
+    'musr',
+    'process_bench',
+    'race',
+    'trivia_qa',
+    'cmmlu',
+    'humaneval',
+    'gsm8k',
+    'bbh',
+    'competition_math',
+    'math_500',
+    'aime24',
+    'gpqa_diamond',
+    'arc',
+    'ceval',
+    'hellaswag',
+    'general_mcq',
+    'general_qa',
+    'super_gpqa',
+    # 'live_code_bench',
+    'mmlu_redux',
+    'simple_qa',
+    'chinese_simpleqa',
+    'alpaca_eval',
+    'arena_hard',
+    'maritime_bench',
+    'drop',
+    'winogrande',
+    'tool_bench',
+    'frames',
+    'docmath',
+    'needle_haystack',
+    'bfcl_v3',
+    'hle',
+    'tau_bench',
 ]

 # Reverse the datasets list to ensure the order is from most recent to oldest

@@ -150,7 +150,6 @@ dataset_args={
 }

 class TestRun(unittest.TestCase):
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_benchmarks(self):
         from evalscope.config import TaskConfig

@@ -180,19 +179,60 @@ class TestRun(unittest.TestCase):

         run_task(task_cfg=task_cfg)

+    def test_vlm_benchmark(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-vl-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'mmmu',
+                # 'math_vista',
+            ],
+            dataset_args={
+                'mmmu': {
+                    'subset_list': ['Accounting']
+                },
+                'math_vista': {
+                    'subset_list': ['default']
+                }
+            },
+            eval_batch_size=1,
+            limit=1,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+                'image_height': 512,
+                'image_width': 512,
+                'image_num': 2,
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.AUTO,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+            }
+        )
+
+        run_task(task_cfg=task_cfg)

-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_ci_lite(self):
         from evalscope.config import TaskConfig

+        api_key = env.get('DASHSCOPE_API_KEY')
+
         task_cfg = TaskConfig(
             model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=
-            eval_type=EvalType.SERVICE,
+            api_key=api_key,
+            eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
             datasets=[
                 'general_mcq',
-                'general_qa',
                 'iquiz',
             ],
             dataset_args={

tests/cli/test_collection.py
CHANGED

@@ -52,16 +52,19 @@ class TestCollection(unittest.TestCase):
             api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=['data_collection'],
-            dataset_args={
-                '
-
-
+            dataset_args={
+                'data_collection': {
+                    # 'local_path': 'outputs/test_mix.jsonl'
+                    'local_path': 'outputs/mixed_data_test.jsonl',
+                    'shuffle': True,
+                }
+            },
             eval_batch_size=5,
             generation_config = {
                 'max_tokens': 10000,
                 'temperature': 0.0,
             },
-            limit=
+            limit=10,
             # use_cache='outputs/20250822_161804'
         )
         run_task(task_cfg=task_cfg)

tests/cli/test_reasoning.py
ADDED

@@ -0,0 +1,81 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+from unittest import TestCase
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.run import run_task
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TestReasoning(TestCase):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'Qwen3-0.6B',
+            'api_url': 'http://0.0.0.0:8801/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True,
+                'extra_body': {'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+        """Helper method to run test for a specific dataset."""
+        config = self.base_config.copy()
+        config['datasets'] = [dataset_name]
+
+        if use_mock:
+            config['eval_type'] = EvalType.MOCK_LLM
+
+        # Apply configuration overrides
+        config.update(config_overrides)
+
+        if dataset_args:
+            config['dataset_args'] = {dataset_name: dataset_args}
+
+        task_cfg = TaskConfig(**config)
+        run_task(task_cfg=task_cfg)
+
+    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+        """Helper method to test dataset loading."""
+
+        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
+
+    # Math & Reasoning datasets
+    def test_gsm8k(self):
+        """Test GSM8K math reasoning dataset."""
+        self._run_dataset_test('gsm8k')
+
+
+if __name__ == '__main__':
+    # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
+    # Run all tests: python -m unittest test_eval.TestBenchmark
+    unittest.main()
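
The base config above turns off Qwen3's thinking mode through generation_config['extra_body']. As a rough sketch of what that setting amounts to when issued directly through an OpenAI-compatible client (assuming the openai Python package and a vLLM-style server that honors chat_template_kwargs):

from openai import OpenAI

client = OpenAI(base_url='http://0.0.0.0:8801/v1', api_key='EMPTY')
resp = client.chat.completions.create(
    model='Qwen3-0.6B',
    messages=[{'role': 'user', 'content': 'What is 12 * 7?'}],
    # Forwarded verbatim in the request body; disables the thinking
    # phase before the model answers.
    extra_body={'chat_template_kwargs': {'enable_thinking': False}},
)
print(resp.choices[0].message.content)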

tests/common.py
ADDED

@@ -0,0 +1,73 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+from unittest import TestCase
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy
+from evalscope.run import run_task
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TestBenchmark(TestCase):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'qwen-plus',
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+        """Helper method to run test for a specific dataset."""
+        config = self.base_config.copy()
+        config['datasets'] = [dataset_name]
+
+        if not env.get('DASHSCOPE_API_KEY'):
+            use_mock = True
+            logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
+
+        if use_mock:
+            config['eval_type'] = EvalType.MOCK_LLM
+
+        # Apply configuration overrides
+        config.update(config_overrides)
+
+        if dataset_args:
+            config['dataset_args'] = {dataset_name: dataset_args}
+
+        task_cfg = TaskConfig(**config)
+        run_task(task_cfg=task_cfg)
+
+    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+        """Helper method to test dataset loading."""
+
+        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
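
tests/common.py centralizes the credential check: _run_dataset_test downgrades eval_type to EvalType.MOCK_LLM and logs a warning whenever DASHSCOPE_API_KEY is absent from .env, so dataset loading can still be exercised offline. A hypothetical consumer of the shared helper (class name, dataset, and subset are illustrative) might look like:

import unittest

from tests.common import TestBenchmark


class TestCEval(TestBenchmark):

    def test_ceval(self):
        # Runs against the live service when DASHSCOPE_API_KEY is set,
        # otherwise falls back to EvalType.MOCK_LLM automatically.
        self._run_dataset_test('ceval', dataset_args={'subset_list': ['computer_network']})


if __name__ == '__main__':
    unittest.main()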

tests/perf/test_perf.py
CHANGED

@@ -1,9 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from dotenv import dotenv_values

 env = dotenv_values('.env')
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest

 from evalscope.perf.main import run_perf_benchmark

@@ -123,6 +121,10 @@ class TestPerf(unittest.TestCase):

     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_perf_multi_parallel(self):
+        if not env.get('DASHSCOPE_API_KEY'):
+            self.skipTest('DASHSCOPE_API_KEY is not set.')
+            return
+
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=[1, 2],

tests/rag/test_clip_benchmark.py
CHANGED
[hunk content (+0 -3) not captured in the extraction]

evalscope/api/mixin/dataset_mixin.py
DELETED

@@ -1,105 +0,0 @@
-from abc import ABC
-from collections import defaultdict
-from typing import Any, Callable, Dict
-
-from evalscope.api.dataset import Dataset, DatasetDict, RemoteDataLoader
-
-
-class DatasetLoaderMixin:
-    """
-    Mixin class providing dataset loading functionality for benchmarks.
-
-    This mixin provides common dataset loading methods that can be shared
-    across different data adapters, including support for:
-    - Loading multiple subsets
-    - Few-shot dataset loading
-    - Remote dataset loading with configuration
-    """
-
-    def load_subsets(self, load_func: Callable[[str], Dataset]) -> DatasetDict:
-        """
-        Load multiple subsets of the dataset using the provided loading function.
-
-        This method handles two loading strategies:
-        1. Reformat mode: Load only the default subset and reformat it
-        2. Multi-subset mode: Load all subsets specified in subset_list
-
-        Args:
-            load_func (Callable[[str], Dataset]): Function to load individual subsets
-
-        Returns:
-            DatasetDict: Dictionary containing all loaded subsets
-        """
-        if self.reformat_subset:
-            # Load only the default subset
-            subset_data = load_func(self.default_subset)
-            # Reformat the subset to create multiple subsets based on sample keys
-            # NOTE: subset_list and limit is applied here if specified
-            dataset_dict = DatasetDict.from_dataset(dataset=subset_data, subset_list=self.subset_list, limit=self.limit)
-        else:
-            # Load all specified subsets into separate entries
-            subset_dict = defaultdict()
-            for subset in self.subset_list:
-                subset_data = load_func(subset)
-                subset_dict[subset] = subset_data
-            dataset_dict = DatasetDict(subset_dict)
-        return dataset_dict
-
-    def load_subset(self, subset: str) -> Dataset:
-        """
-        Load a specific subset of the dataset for evaluation.
-
-        This method configures and executes the data loading for a single subset,
-        handling both split-as-subset and traditional subset configurations.
-
-        Args:
-            subset (str): The subset identifier to load
-
-        Returns:
-            Dataset: The loaded dataset subset with processed samples
-        """
-        # Determine the split and subset names based on configuration
-        split = subset if self.split_as_subset else self.eval_split
-        subset_name = self.default_subset if self.split_as_subset else subset
-
-        # Create and configure the remote data loader
-        loader = RemoteDataLoader(
-            data_id_or_path=self.dataset_id,
-            split=split,
-            subset=subset_name,
-            sample_fields=self.record_to_sample,  # Custom sample conversion function
-            limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
-            repeats=self._task_config.repeats,  # Number of repetitions for each sample
-            data_source=self._task_config.dataset_hub,  # Data source configuration
-        )
-        return loader.load()
-
-    def load_fewshot_subset(self, subset: str) -> Dataset:
-        """
-        Load a subset specifically for few-shot examples.
-
-        This method loads training data to be used as demonstrations in few-shot prompting.
-        It typically loads from the training split with limited samples and optional shuffling.
-
-        Args:
-            subset (str): The subset identifier to load few-shot examples from
-
-        Returns:
-            Dataset: The loaded few-shot dataset with demonstration examples
-        """
-        # Use training split for few-shot examples
-        split = subset if self.split_as_subset else self.train_split
-        subset_name = self.default_subset if self.split_as_subset else subset
-
-        # Create loader specifically configured for few-shot sampling
-        loader = RemoteDataLoader(
-            data_id_or_path=self.dataset_id,
-            split=split,
-            subset=subset_name,
-            sample_fields=self.record_to_sample,
-            limit=self.few_shot_num
-            if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
-            shuffle=self.few_shot_random,  # Randomize selection if enabled
-            data_source=self._task_config.dataset_hub,
-        )
-        return loader.load()

evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py
DELETED

@@ -1,44 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-from collections import defaultdict
-from typing import List, Optional, Union
-
-from evalscope.utils.io_utils import jsonl_to_list
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class GeneralI2IAdapter:
-
-    def __init__(self, **kwargs):
-
-        super().__init__(**kwargs)
-
-    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
-        dataset_name_or_path = dataset_name_or_path or self.dataset_id
-        subset_list = subset_list or self.subset_list
-
-        data_file_dict = defaultdict(str)
-        data_item_dict = defaultdict(list)
-
-        # get data file path and subset name
-        if os.path.isdir(dataset_name_or_path):
-            for subset_name in subset_list:
-                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-        elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-            data_file_dict[cur_subset_name] = dataset_name_or_path
-        else:
-            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-        # load data from local disk
-        try:
-            for subset_name, file_path in data_file_dict.items():
-                data_item_dict[subset_name] = jsonl_to_list(file_path)
-        except Exception as e:
-            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-        return data_dict

tests/aigc/__init__.py
DELETED

@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.