evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
tests/cli/test_custom.py
ADDED
@@ -0,0 +1,261 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+from tests.utils import test_level_list
+
+env = dotenv_values('.env')
+
+import os
+import subprocess
+import unittest
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
+from evalscope.run import run_task
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.logger import get_logger
+
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
+
+logger = get_logger()
+
+
+class TestRunCustom(unittest.TestCase):
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_custom_task(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='Qwen/Qwen3-0.6B',
+            datasets=[
+                'general_mcq',
+                'general_qa'
+            ],
+            dataset_args={
+                'general_mcq': {
+                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
+                    'subset_list': [
+                        'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
+                    ],
+                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
+                    'subset_list': [
+                        'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
+                    ]
+                }
+            },
+        )
+        res = run_task(task_cfg=task_cfg)
+        print(res)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_local_dataset(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key= env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                # 'mmlu',
+                # 'race',
+                'trivia_qa',
+                # 'cmmlu',
+                # 'humaneval',
+                # 'gsm8k',
+                # 'bbh',
+                # 'competition_math',
+                # 'arc',
+                # 'ceval',
+            ],
+            dataset_args={
+                'mmlu': {
+                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
+                    'few_shot_num': 0,
+                    'dataset_id': 'data/data/mmlu',
+                },
+                'ceval': {
+                    'subset_list': [
+                        'computer_network', 'operating_system', 'computer_architecture'
+                    ],
+                    'few_shot_num': 0,
+                    'dataset_id': 'data/data/ceval',
+                },
+                'cmmlu': {
+                    'subset_list': ['elementary_chinese'],
+                    'dataset_id': 'data/data/cmmlu',
+                    'few_shot_num': 0
+                },
+                'bbh': {
+                    'subset_list': ['word_sorting', 'movie_recommendation'],
+                },
+                'humaneval': {
+                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+                },
+                'trivia_qa': {
+                    'dataset_id': 'data/data/trivia_qa',
+                },
+            },
+            eval_batch_size=10,
+            limit=5,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_no_answer(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen2.5-72b-instruct',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key= env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'general_qa',
+            ],
+            dataset_args={
+                'general_qa': {
+                    'dataset_id': 'custom_eval/text/qa',
+                    'subset_list': [
+                        'arena',
+                        'example'
+                    ],
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                },
+                'score_type': 'numeric',
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.AUTO,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_with_answer(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key= env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'general_qa',
+            ],
+            dataset_args={
+                'general_qa': {
+                    'dataset_id': 'custom_eval/text/qa',
+                    'subset_list': [
+                        'example'
+                    ],
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                },
+                'score_type': 'pattern',
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.LLM,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_arena(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model_id='Arena',
+            datasets=[
+                'general_arena',
+            ],
+            dataset_args={
+                'general_arena': {
+                    'extra_params':{
+                        'models':[
+                            {
+                                'name': 'qwen2.5-0.5b',
+                                'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
+                            },
+                            {
+                                'name': 'qwen2.5-7b',
+                                'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
+                            },
+                            {
+                                'name': 'qwen2.5-72b',
+                                'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
+                            }
+                        ],
+                        'baseline': 'qwen2.5-7b'
+                    }
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen-plus',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 8000
+                },
+            },
+            judge_worker_num=5,
+            use_cache='outputs/20250702_165727'
+        )
+
+        run_task(task_cfg=task_cfg)

tests/cli/test_run.py
CHANGED
@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 
 import os
@@ -8,9 +10,9 @@ import subprocess
 import unittest
 
 from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
@@ -182,35 +184,6 @@ class TestRun(unittest.TestCase):
         run_task(task_cfg=task_cfg)
 
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_custom_task(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='Qwen/Qwen3-0.6B',
-            datasets=[
-                'general_mcq',
-                'general_qa'
-            ],
-            dataset_args={
-                'general_mcq': {
-                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
-                    'subset_list': [
-                        'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
-                    ],
-                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                    'subset_list': [
-                        'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
-                    ]
-                }
-            },
-        )
-        res = run_task(task_cfg=task_cfg)
-        print(res)
-
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_one_task(self):
         from evalscope.config import TaskConfig
@@ -293,7 +266,7 @@ class TestRun(unittest.TestCase):
             # 'musr',
             # 'process_bench',
             # 'race',
-
+            'trivia_qa',
             # 'cmmlu',
             # 'humaneval',
             # 'gsm8k',
@@ -306,7 +279,7 @@ class TestRun(unittest.TestCase):
             # 'ceval',
             # 'hellaswag',
             # 'general_mcq',
-            'general_qa',
+            # 'general_qa',
             # 'super_gpqa',
             # 'mmlu_redux',
             # 'maritime_bench',
@@ -315,6 +288,9 @@ class TestRun(unittest.TestCase):
             # 'tool_bench',
             # 'frames',
             # 'bfcl_v3',
+            # 'truthful_qa',
+            # 'tau_bench',
+            # 'hle'
         ],
         dataset_args={
             'mmlu': {
@@ -323,7 +299,7 @@ class TestRun(unittest.TestCase):
             },
             'mmlu_pro': {
                 'subset_list': ['math', 'health'],
-                'few_shot_num':
+                'few_shot_num': 0
             },
             'ceval': {
                 'subset_list': [
@@ -354,7 +330,6 @@ class TestRun(unittest.TestCase):
             },
             'musr': {
                 'subset_list': ['murder_mysteries'],
-                'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
             },
             'general_mcq': {
                 'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
@@ -378,59 +353,42 @@ class TestRun(unittest.TestCase):
             'mmlu_redux':{
                 'subset_list': ['abstract_algebra']
             },
+            'frames':{
+                'local_path': 'data/iic/frames',
+            },
             'bfcl_v3': {
                 'subset_list': ['parallel'],
                 'extra_params': {
                     # 'is_fc_model': False,
                 }
             },
+            'tau_bench': {
+                'extra_params': {
+                    'user_model': 'qwen-plus',
+                    'api_key': env.get('DASHSCOPE_API_KEY'),
+                    'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                }
+            },
+            'hle': {
+                'subset_list': ['Math', 'Other'],
+            },
         },
         eval_batch_size=10,
-        limit=
-        debug=True,
+        limit=10,
+        # debug=True,
         stream=True,
         generation_config={
-            'temperature': 0,
+            'temperature': 0.6,
             'n': 1,
             'max_tokens': 4096,
             # 'extra_headers':{'key': 'value'},
         },
         ignore_errors=False,
-        use_cache='outputs/test_2'
     )

     run_task(task_cfg=task_cfg)


-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_batch_eval(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='LLM-Research/Llama-3.2-1B-Instruct',
-            datasets=[
-                # 'math_500',
-                # 'aime24',
-                # 'competition_math'
-                # 'arc',
-                'gsm8k'
-                # 'truthful_qa'
-            ],
-            dataset_args={
-                'competition_math': {
-                    'subset_list': ['Level 4', 'Level 5']
-                }
-            },
-            eval_batch_size=2,
-            limit=5,
-            generation_config={
-                'max_new_tokens': 2048,
-                'temperature': 0.7,
-                'num_return_sequences': 2,
-            }
-        )
-
-        run_task(task_cfg=task_cfg)

     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_judge_model(self):
@@ -442,7 +400,7 @@ class TestRun(unittest.TestCase):
         api_key= env.get('DASHSCOPE_API_KEY'),
         eval_type=EvalType.SERVICE,
         datasets=[
-            'math_500',
+            # 'math_500',
             # 'aime24',
             # 'competition_math',
             # 'arc',
@@ -459,6 +417,7 @@ class TestRun(unittest.TestCase):
             # 'docmath',
             # 'needle_haystack',
             # 'ifeval',
+            'hle'
         ],
         dataset_args={
             'needle_haystack': {
@@ -491,7 +450,10 @@ class TestRun(unittest.TestCase):
             },
             'frames': {
                 'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
-            }
+            },
+            'hle': {
+                'subset_list': ['Math', 'Other'],
+            },
         },
         eval_batch_size=10,
         limit=3,
@@ -514,6 +476,7 @@ class TestRun(unittest.TestCase):
         },
         timeout=60000,
         stream=True,
+        use_cache='outputs/20250714_150626'
         # analysis_report=True,
         # debug=True,
         # use_cache='outputs/20250616_161931'
@@ -521,5 +484,6 @@ class TestRun(unittest.TestCase):
 
     run_task(task_cfg=task_cfg)
 
+
 if __name__ == '__main__':
     unittest.main()

tests/perf/test_perf.py
CHANGED
@@ -7,7 +7,7 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest
 
 from evalscope.perf.main import run_perf_benchmark
-from
+from tests.utils import test_level_list
 
 
 class TestPerf(unittest.TestCase):
@@ -35,9 +35,9 @@
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_perf_stream(self):
         task_cfg = {
-            'url': 'http://127.0.0.1:
+            'url': 'http://127.0.0.1:8801/v1/chat/completions',
             'parallel': 1,
-            'model': '
+            'model': 'Qwen2.5-0.5B-Instruct',
             'number': 15,
             'api': 'openai',
             'dataset': 'openqa',
@@ -126,7 +126,7 @@
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=[1, 2],
-            number=[2,
+            number=[2, 4],
             model='qwen2.5-7b-instruct',
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
@@ -145,5 +145,32 @@
         print(metrics_result)
         print(percentile_result)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_random_vl(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=[1, 2],
+            number=[2, 4],
+            model='qwen-vl-max',
+            url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            api='openai',
+            dataset='kontext_bench',
+            min_tokens=100,
+            max_tokens=100,
+            prefix_length=0,
+            min_prompt_length=100,
+            max_prompt_length=100,
+            image_height=512,
+            image_width=512,
+            image_num=2,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            extra_args={'ignore_eos': True}
+        )
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)
+
 if __name__ == '__main__':
     unittest.main(buffer=False)

tests/rag/test_clip_benchmark.py
CHANGED
@@ -6,8 +6,9 @@ import subprocess
 import unittest
 
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger()
 

tests/rag/test_mteb.py
CHANGED
@@ -3,9 +3,11 @@
 import unittest
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

tests/rag/test_ragas.py
CHANGED
@@ -2,11 +2,13 @@
 import os
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 import unittest
 
 from evalscope import TaskConfig, run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

tests/swift/test_run_swift_eval.py
CHANGED
@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.opencompass import OpenCompassBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/swift/test_run_swift_vlm_eval.py
CHANGED
@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/swift/test_run_swift_vlm_jugde_eval.py
CHANGED
@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/utils.py
ADDED
@@ -0,0 +1,13 @@
+import os
+
+TEST_LEVEL_LIST = [0, 1]
+# Example: export TEST_LEVEL_LIST=0,1
+TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
+
+
+def test_level_list():
+    global TEST_LEVEL_LIST
+    if TEST_LEVEL_LIST_STR in os.environ:
+        TEST_LEVEL_LIST = [int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')]
+
+    return TEST_LEVEL_LIST
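For context, a minimal usage sketch of the new tests/utils.py helper (illustration only, not part of the diff; the test class and method names below are invented, while test_level_list, the TEST_LEVEL_LIST environment variable, and the skipUnless pattern are taken from the files shown above):

import unittest

from tests.utils import test_level_list  # helper added in this release


class TestExample(unittest.TestCase):  # hypothetical test case for illustration

    # Runs only when level 0 is enabled, e.g. `export TEST_LEVEL_LIST=0,1`.
    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_something(self):
        self.assertTrue(True)
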
tests/vlm/test_vlmeval.py
CHANGED
@@ -1,12 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 import unittest
 
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -62,7 +64,11 @@ class TestVLMEval(unittest.TestCase):
         task_cfg = {
             'eval_backend': 'VLMEvalKit',
             'eval_config': {
-                'data': [
+                'data': [
+                    # 'SEEDBench_IMG',
+                    # 'ChartQA_TEST',
+                    'MMDU'
+                ],
                 'limit': 5,
                 'mode': 'all',
                 'model': [
|