evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +20 -5
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +1 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/evaluator/evaluator.py +15 -12
- evalscope/metrics/__init__.py +6 -0
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
- evalscope/metrics/llm_judge.py +105 -20
- evalscope/metrics/metrics.py +1 -1
- evalscope/models/adapters/base_adapter.py +0 -2
- evalscope/models/adapters/server_adapter.py +2 -2
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/perf/arguments.py +2 -16
- evalscope/perf/main.py +1 -1
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +45 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +50 -2
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +13 -37
- tests/perf/test_perf.py +2 -2
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
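The change that recurs throughout the test diffs below is a pair of import moves: `is_module_installed` now lives in `evalscope.utils.import_utils`, and the per-level test gating helper is taken from a new `tests/utils.py` module. A minimal before/after sketch, with paths taken from the diffs that follow:

# EvalScope 0.16.2
# from evalscope.utils import is_module_installed

# EvalScope 0.17.0
from evalscope.utils.import_utils import is_module_installed
from tests.utils import test_level_list  # new helper added in this release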
tests/cli/test_custom.py
ADDED

@@ -0,0 +1,261 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+from tests.utils import test_level_list
+
+env = dotenv_values('.env')
+
+import os
+import subprocess
+import unittest
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
+from evalscope.run import run_task
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.logger import get_logger
+
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
+
+logger = get_logger()
+
+
+class TestRunCustom(unittest.TestCase):
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_custom_task(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='Qwen/Qwen3-0.6B',
+            datasets=[
+                'general_mcq',
+                'general_qa'
+            ],
+            dataset_args={
+                'general_mcq': {
+                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
+                    'subset_list': [
+                        'example'  # evaluation subset name: the * in the *_dev.csv files above
+                    ],
+                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # custom dataset path
+                    'subset_list': [
+                        'example'  # evaluation subset name: the * in the *_dev.csv files above
+                    ]
+                }
+            },
+        )
+        res = run_task(task_cfg=task_cfg)
+        print(res)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_local_dataset(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                # 'mmlu',
+                # 'race',
+                'trivia_qa',
+                # 'cmmlu',
+                # 'humaneval',
+                # 'gsm8k',
+                # 'bbh',
+                # 'competition_math',
+                # 'arc',
+                # 'ceval',
+            ],
+            dataset_args={
+                'mmlu': {
+                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
+                    'few_shot_num': 0,
+                    'dataset_id': 'data/data/mmlu',
+                },
+                'ceval': {
+                    'subset_list': [
+                        'computer_network', 'operating_system', 'computer_architecture'
+                    ],
+                    'few_shot_num': 0,
+                    'dataset_id': 'data/data/ceval',
+                },
+                'cmmlu': {
+                    'subset_list': ['elementary_chinese'],
+                    'dataset_id': 'data/data/cmmlu',
+                    'few_shot_num': 0
+                },
+                'bbh': {
+                    'subset_list': ['word_sorting', 'movie_recommendation'],
+                },
+                'humaneval': {
+                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+                },
+                'trivia_qa': {
+                    'dataset_id': 'data/data/trivia_qa',
+                },
+            },
+            eval_batch_size=10,
+            limit=5,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_no_answer(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen2.5-72b-instruct',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'general_qa',
+            ],
+            dataset_args={
+                'general_qa': {
+                    'dataset_id': 'custom_eval/text/qa',
+                    'subset_list': [
+                        'arena',
+                        'example'
+                    ],
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                },
+                'score_type': 'numeric',
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.AUTO,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_with_answer(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'general_qa',
+            ],
+            dataset_args={
+                'general_qa': {
+                    'dataset_id': 'custom_eval/text/qa',
+                    'subset_list': [
+                        'example'
+                    ],
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+            },
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                },
+                'score_type': 'pattern',
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.LLM,
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_general_arena(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model_id='Arena',
+            datasets=[
+                'general_arena',
+            ],
+            dataset_args={
+                'general_arena': {
+                    'extra_params': {
+                        'models': [
+                            {
+                                'name': 'qwen2.5-0.5b',
+                                'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
+                            },
+                            {
+                                'name': 'qwen2.5-7b',
+                                'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
+                            },
+                            {
+                                'name': 'qwen2.5-72b',
+                                'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
+                            }
+                        ],
+                        'baseline': 'qwen2.5-7b'
+                    }
+                }
+            },
+            eval_batch_size=10,
+            limit=10,
+            debug=True,
+            stream=True,
+            ignore_errors=False,
+            judge_model_args={
+                'model_id': 'qwen-plus',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 8000
+                },
+            },
+            judge_worker_num=5,
+            use_cache='outputs/20250702_165727'
+        )
+
+        run_task(task_cfg=task_cfg)
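A minimal sketch of running one of the new custom-dataset tests on its own; the runner invocation is standard unittest rather than part of the diff, and it assumes the `custom_eval/text/mcq` data referenced above exists locally. The environment variable must be set before the module is imported, because the skip decorators are evaluated at import time:

import os
import unittest

os.environ['TEST_LEVEL_LIST'] = '0'  # enable level-0 tests before importing the module

from tests.cli.test_custom import TestRunCustom

suite = unittest.TestSuite()
suite.addTest(TestRunCustom('test_run_custom_task'))
unittest.TextTestRunner(verbosity=2).run(suite)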
tests/cli/test_run.py
CHANGED

@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 
 import os
@@ -8,9 +10,9 @@ import subprocess
 import unittest
 
 from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
@@ -182,35 +184,6 @@ class TestRun(unittest.TestCase):
         run_task(task_cfg=task_cfg)
 
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_custom_task(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='Qwen/Qwen3-0.6B',
-            datasets=[
-                'general_mcq',
-                'general_qa'
-            ],
-            dataset_args={
-                'general_mcq': {
-                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
-                    'subset_list': [
-                        'example'  # evaluation subset name: the * in the *_dev.csv files above
-                    ],
-                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',  # custom dataset path
-                    'subset_list': [
-                        'example'  # evaluation subset name: the * in the *_dev.csv files above
-                    ]
-                }
-            },
-        )
-        res = run_task(task_cfg=task_cfg)
-        print(res)
-
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_one_task(self):
         from evalscope.config import TaskConfig
@@ -286,7 +259,7 @@ class TestRun(unittest.TestCase):
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
-
+                'iquiz',
                 # 'ifeval',
                 # 'mmlu',
                 # 'mmlu_pro',
@@ -305,7 +278,7 @@ class TestRun(unittest.TestCase):
                 # 'arc',
                 # 'ceval',
                 # 'hellaswag',
-                'general_mcq',
+                # 'general_mcq',
                 # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
@@ -315,6 +288,7 @@ class TestRun(unittest.TestCase):
                 # 'tool_bench',
                 # 'frames',
                 # 'bfcl_v3',
+                # 'truthful_qa',
             ],
             dataset_args={
                 'mmlu': {
@@ -354,7 +328,6 @@ class TestRun(unittest.TestCase):
                 },
                 'musr': {
                     'subset_list': ['murder_mysteries'],
-                    'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
                 },
                 'general_mcq': {
                     'local_path': 'custom_eval/text/mcq',  # custom dataset path
@@ -378,6 +351,9 @@ class TestRun(unittest.TestCase):
                 'mmlu_redux':{
                     'subset_list': ['abstract_algebra']
                 },
+                'frames':{
+                    'local_path': 'data/iic/frames',
+                },
                 'bfcl_v3': {
                     'subset_list': ['parallel'],
                     'extra_params': {
@@ -385,9 +361,9 @@ class TestRun(unittest.TestCase):
                     }
                 },
             },
-            eval_batch_size=
+            eval_batch_size=1,
             limit=5,
-            debug=True,
+            # debug=True,
             stream=True,
            generation_config={
                'temperature': 0,
@@ -396,7 +372,6 @@ class TestRun(unittest.TestCase):
                # 'extra_headers':{'key': 'value'},
            },
            ignore_errors=False,
-            # use_cache='outputs/20250616_153756'
        )
 
        run_task(task_cfg=task_cfg)
@@ -521,5 +496,6 @@ class TestRun(unittest.TestCase):
 
        run_task(task_cfg=task_cfg)
 
+
 if __name__ == '__main__':
    unittest.main()
tests/perf/test_perf.py
CHANGED

@@ -7,7 +7,7 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest
 
 from evalscope.perf.main import run_perf_benchmark
-from
+from tests.utils import test_level_list
 
 
 class TestPerf(unittest.TestCase):
@@ -126,7 +126,7 @@ class TestPerf(unittest.TestCase):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=[1, 2],
-            number=[2,
+            number=[2, 4],
             model='qwen2.5-7b-instruct',
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
tests/rag/test_clip_benchmark.py
CHANGED

@@ -6,8 +6,9 @@ import subprocess
 import unittest
 
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger()
 
tests/rag/test_mteb.py
CHANGED

@@ -3,9 +3,11 @@
 import unittest
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
tests/rag/test_ragas.py
CHANGED

@@ -2,11 +2,13 @@
 import os
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 import unittest
 
 from evalscope import TaskConfig, run_task
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
tests/swift/test_run_swift_eval.py
CHANGED

@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.opencompass import OpenCompassBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/swift/test_run_swift_vlm_eval.py
CHANGED

@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 

tests/swift/test_run_swift_vlm_jugde_eval.py
CHANGED

@@ -10,8 +10,9 @@ import unittest
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
+from tests.utils import test_level_list
 
 logger = get_logger(__name__)
 
tests/utils.py
ADDED

@@ -0,0 +1,13 @@
+import os
+
+TEST_LEVEL_LIST = [0, 1]
+# Example: export TEST_LEVEL_LIST=0,1
+TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
+
+
+def test_level_list():
+    global TEST_LEVEL_LIST
+    if TEST_LEVEL_LIST_STR in os.environ:
+        TEST_LEVEL_LIST = [int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')]
+
+    return TEST_LEVEL_LIST
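The helper reads `TEST_LEVEL_LIST` from the environment on every call, so the level selection can be changed per process without touching the code. A small usage sketch:

import os

os.environ['TEST_LEVEL_LIST'] = '0'  # run only level-0 tests in this process

from tests.utils import test_level_list

print(test_level_list())  # -> [0]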
tests/vlm/test_vlmeval.py
CHANGED

@@ -1,12 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dotenv import dotenv_values
 
+from tests.utils import test_level_list
+
 env = dotenv_values('.env')
 import unittest
 
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
-from evalscope.utils import is_module_installed
+from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -62,7 +64,11 @@ class TestVLMEval(unittest.TestCase):
         task_cfg = {
             'eval_backend': 'VLMEvalKit',
             'eval_config': {
-                'data': [
+                'data': [
+                    # 'SEEDBench_IMG',
+                    # 'ChartQA_TEST',
+                    'MMDU'
+                ],
                 'limit': 5,
                 'mode': 'all',
                 'model': [
evalscope/evaluator/rating_eval.py
REMOVED

@@ -1,157 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import pandas as pd
-import pyarrow as pa
-from typing import List, Union
-
-from evalscope.constants import MetricMembers
-from evalscope.utils.arena_utils import compute_elo
-from evalscope.utils.io_utils import jsonl_to_list
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-DEFAULT_COLUMNS_MAPPING = {'model_a': 'model_a', 'model_b': 'model_b', 'win': 'win', 'tstamp': 'ts', 'language': 'lang'}
-
-
-class RatingEvaluate(object):
-
-    def __init__(self, metrics: list, baseline_model: str = None, **kwargs):
-        self.metrics = metrics
-        self.baseline_model = baseline_model
-        self.kwargs = kwargs
-
-    def preprocess(self, raw_data_df: pd.DataFrame, **kwargs):
-
-        # Get battles data
-        raw_data_df = raw_data_df.sort_values(ascending=True, by=['tstamp'])
-        battles = raw_data_df[raw_data_df['anony']].reset_index(drop=True)
-
-        return battles
-
-    def compute_elo_rating(self, raw_data):
-        battles = self.preprocess(raw_data_df=raw_data)
-        elo_ratings = compute_elo(battles)
-        col_model = 'Model'
-        col_elo_rating = 'Elo_Rating'
-        elo_ratings_res = pd.DataFrame([[n, elo_ratings[n]] for n in elo_ratings.keys()],
-                                       columns=[col_model, col_elo_rating]).sort_values(
-                                           col_elo_rating, ascending=False).reset_index(drop=True)
-        elo_ratings_res = elo_ratings_res.round({col_elo_rating: 1})
-        return elo_ratings_res
-
-    def get_single_pairwise_rating(self, row: pd.Series):
-        tie = False
-        if 'win' in row:
-            win = row['win']
-            if win == 'tie':
-                tie = True
-            else:
-                if win == 'model_a':
-                    winner = row['model_a']
-                    loser = row['model_b']
-                else:
-                    winner = row['model_b']
-                    loser = row['model_a']
-        elif 'win_1' in row:
-            win_1 = row['win_1']
-            win_2 = row['win_2']
-            if win_1 == 'tie' or win_1 != win_2:
-                tie = True
-            else:
-                if win_1 == 'model_a':
-                    winner = row['model_a']
-                    loser = row['model_b']
-                else:
-                    winner = row['model_b']
-                    loser = row['model_a']
-        else:
-            raise ValueError('Unsupported data format')
-
-        if tie:
-            return [{
-                'model': row['model_a'],
-                'win': 0,
-                'loss': 0,
-                'tie': 1
-            }, {
-                'model': row['model_b'],
-                'win': 0,
-                'loss': 0,
-                'tie': 1
-            }]
-        else:
-            return [{'model': winner, 'win': 1, 'loss': 0, 'tie': 0}, {'model': loser, 'win': 0, 'loss': 1, 'tie': 0}]
-
-    def compute_pairwise_rating(self, raw_data):
-        df_all = self.preprocess(raw_data_df=raw_data)
-        model_list = (df_all['model_a'].unique().tolist() + df_all['model_b'].unique().tolist())
-        model_list = list(set(model_list))
-
-        list_res = []
-        # traverse df row by row
-        for index, row in df_all.iterrows():
-            if self.baseline_model is not None:
-                if self.baseline_model not in [row['model_a'], row['model_b']]:
-                    logger.warning(
-                        f'One of the models in the battle should be the baseline model: {self.baseline_model}')
-                    continue
-            rating = self.get_single_pairwise_rating(row)
-            list_res = list_res + rating
-
-        df = pd.DataFrame(list_res)
-        df = df.groupby(['model']).sum()
-
-        # remove baseline model
-        if self.baseline_model is not None:
-            df = df[df.index != self.baseline_model]
-        # add win rate
-        df['win_rate'] = df['win'] / (df['win'] + df['loss'] + df['tie'])
-        df['loss_rate'] = df['loss'] / (df['win'] + df['loss'] + df['tie'])
-        df['tie_rate'] = df['tie'] / (df['win'] + df['loss'] + df['tie'])
-        return df.sort_values(by='win_rate', ascending=False)
-
-    def compute_score_rating(self, raw_data):
-        df_all = self.preprocess(raw_data_df=raw_data)
-        df = df_all[['model', 'score']]
-
-        df_score = df.groupby(['model']).mean()
-        return df_score.sort_values(by='score', ascending=False)
-
-    def eval_samples(self, data_list: list):
-        res_all = []
-
-        raw_data: pd.DataFrame = None
-
-        if len(data_list) > 0:
-            raw_data = data_list[0]
-
-        for metric in self.metrics:
-
-            if metric == MetricMembers.ELO:
-                res = self.compute_elo_rating(raw_data)
-                res_all.append(res)
-
-            elif metric == MetricMembers.PAIRWISE:
-                res = self.compute_pairwise_rating(raw_data)
-                res_all.append(res)
-
-            elif metric == MetricMembers.SCORE:
-                res = self.compute_score_rating(raw_data)
-                res_all.append(res)
-
-            else:
-                raise ValueError(f'Unsupported metric: {metric}')
-
-        return res_all
-
-    def run(self, prompts: Union[str, list], **kwargs) -> List[pd.DataFrame]:
-        """
-        Load the predicted samples and evaluate them in arena mode.
-        """
-        # raw_data = pd.read_json(prompts)
-        data_list = jsonl_to_list(prompts)
-        data_df = pa.Table.from_pylist(data_list).to_pandas()
-        res_list = self.eval_samples([data_df])
-
-        return res_list
evalscope/evaluator/reviewer/__init__.py
REMOVED

@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.