evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +15 -18
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +12 -11
- evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +59 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
- evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +85 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +14 -5
- evalscope/config.py +15 -2
- evalscope/constants.py +14 -0
- evalscope/evaluator/evaluator.py +51 -13
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/benchmark.py +5 -0
- evalscope/perf/http_client.py +15 -5
- evalscope/perf/main.py +1 -0
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +28 -2
- tests/cli/test_run.py +201 -32
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
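Most of the new surface in 0.13.0 is exercised by the reworked tests/cli/test_run.py below: new benchmark adapters (chinese_simple_qa, simple_qa, super_gpqa, live_code_bench), an LLM judge (evalscope/metrics/llm_judge.py, with JudgeStrategy exported from evalscope/constants.py), and per-dataset response filters (evalscope/utils/filters.py). As orientation before the diff, here is a minimal sketch of the new filters dataset argument, mirroring the ifeval entry used in the updated test; the model name and the small limit are copied from that test, and combining them this way is an illustration rather than a configuration shipped in the release.

from evalscope.config import TaskConfig
from evalscope.run import run_task

# Sketch (not taken verbatim from the release): apply a response filter before scoring.
# 'remove_until': '</think>' is the filter wired into the ifeval dataset_args in the
# test below; the name suggests it strips model output up to that marker before metrics run.
task_cfg = TaskConfig(
    model='qwen/Qwen2.5-0.5B-Instruct',
    datasets=['ifeval'],
    dataset_args={
        'ifeval': {
            'filters': {
                'remove_until': '</think>'
            }
        }
    },
    limit=2,  # smoke-test sized run, as in the test suite
)
run_task(task_cfg=task_cfg)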
tests/cli/test_run.py
CHANGED

@@ -1,10 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
 import os
 import subprocess
-import torch
 import unittest
 
-from evalscope.
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger

@@ -71,21 +75,104 @@ class TestRun(unittest.TestCase):
 
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_task(self):
-        task_cfg =
-
-
-
-
-
-
-
-
-
-
-
-
-
+        task_cfg = TaskConfig(
+            model='qwen/Qwen2.5-0.5B-Instruct',
+            datasets=[
+                'iquiz',
+                # 'ifeval',
+                # 'mmlu',
+                # 'mmlu_pro',
+                # 'musr',
+                # 'process_bench',
+                # 'race',
+                # 'trivia_qa',
+                # 'cmmlu',
+                # 'humaneval',
+                # 'super_gpqa',
+                # 'gsm8k',
+                # 'bbh',
+                # 'competition_math',
+                # 'math_500',
+                'aime24',
+                'gpqa',
+                # 'arc',
+                # 'ceval',
+                # 'hellaswag',
+                # 'general_mcq',
+                # 'general_qa'
+            ],
+            dataset_args={
+                'mmlu': {
+                    'subset_list': ['elementary_mathematics'],
+                    'few_shot_num': 0
+                },
+                'mmlu_pro': {
+                    'subset_list': ['math', 'health'],
+                    'few_shot_num': 4
+                },
+                'ceval': {
+                    'subset_list': [
+                        'computer_network', 'operating_system', 'computer_architecture'
+                    ],
+                    'few_shot_num': 0
+                },
+                'cmmlu': {
+                    'subset_list': ['elementary_chinese'],
+                    'few_shot_num': 0
+                },
+                'bbh': {
+                    'subset_list': ['word_sorting', 'movie_recommendation'],
+                },
+                'gpqa': {
+                    'subset_list': ['gpqa_diamond'],
+                    'few_shot_num': 0
+                },
+                'humaneval': {
+                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+                },
+                'competition_math': {
+                    'subset_list': ['Level 1']
+                },
+                'process_bench': {
+                    'subset_list': ['gsm8k'],
+                },
+                'musr': {
+                    'subset_list': ['murder_mysteries']
+                },
+                'general_mcq': {
+                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
+                    'subset_list': [
+                        'example'  # evaluation subset name, the * in the *_dev.csv files above
+                    ],
+                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # custom dataset path
+                    'subset_list': [
+                        'example',  # evaluation subset name, the * in the *_dev.csv files above
+                        # 'test'
+                    ],
+                    'metric_list': ['AverageBLEU']
+                },
+                'super_gpqa': {
+                    'subset_list': ['Philosophy', 'Education'],
+                    'few_shot_num': 0
+                },
+                'ifeval': {
+                    'filters': {
+                        'remove_until': '</think>'
+                    }
+                }
+            },
+            limit=2,
+            eval_batch_size=2,
+            generation_config={
+                'max_new_tokens': 2048,
+                'temperature': 0.7,
+                'num_return_sequences': 1,
+            },
+            # debug=True
+        )
         run_task(task_cfg=task_cfg)
 
 

@@ -141,12 +228,12 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
-            api_url='
-            api_key='
+            model='qwen2.5-7b-instruct',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
-                'iquiz',
+                # 'iquiz',
                 # 'ifeval',
                 # 'mmlu',
                 # 'mmlu_pro',

@@ -164,11 +251,14 @@ class TestRun(unittest.TestCase):
                 # 'gpqa',
                 # 'arc',
                 # 'ceval',
-
+                'hellaswag',
+                # 'general_mcq',
+                # 'general_qa'
+                # 'super_gpqa',
             ],
             dataset_args={
                 'mmlu': {
-                    'subset_list': ['elementary_mathematics'],
+                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
                     'few_shot_num': 0
                 },
                 'mmlu_pro': {

@@ -189,8 +279,9 @@ class TestRun(unittest.TestCase):
                     'subset_list': ['word_sorting', 'movie_recommendation'],
                 },
                 'gpqa': {
-                    'subset_list': ['gpqa_diamond'],
-                    'few_shot_num': 0
+                    # 'subset_list': ['gpqa_diamond'],
+                    'few_shot_num': 0,
+                    'local_path': './data/data/gpqa',
                 },
                 'humaneval': {
                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],

@@ -204,17 +295,36 @@ class TestRun(unittest.TestCase):
                 'musr': {
                     'subset_list': ['murder_mysteries']
                 },
+                'general_mcq': {
+                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
+                    'subset_list': [
+                        'example'  # evaluation subset name, the * in the *_dev.csv files above
+                    ],
+                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # custom dataset path
+                    'subset_list': [
+                        'example',  # evaluation subset name, the * in the *_dev.csv files above
+                        # 'test'
+                    ],
+                    'metric_list': ['AverageBLEU']
+                },
+                'super_gpqa': {
+                    # 'subset_list': ['Philosophy', 'Education'],
+                    'few_shot_num': 0
+                }
             },
-            eval_batch_size=
-            limit=
-            debug=True,
-            stream=
+            eval_batch_size=32,
+            limit=15,
+            # debug=True,
+            stream=False,
             generation_config={
-                'temperature': 0
+                'temperature': 0,
                 'n': 1,
-                'max_tokens':
+                'max_tokens': 4096,
             },
-            # use_cache='
+            # use_cache='./outputs/20250212_150525',
         )
 
         run_task(task_cfg=task_cfg)

@@ -250,5 +360,64 @@ class TestRun(unittest.TestCase):
 
         run_task(task_cfg=task_cfg)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_judge_model(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen2.5-7b-instruct',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                # 'math_500',
+                'aime24',
+                # 'competition_math',
+                # 'arc',
+                # 'gsm8k'
+                # 'truthful_qa',
+                # 'simple_qa',
+                # # 'chinese_simpleqa',
+                # 'live_code_bench',
+                # 'humaneval'
+                # 'general_qa'
+            ],
+            dataset_args={
+                'competition_math': {
+                    'subset_list': ['Level 4']
+                },
+                'live_code_bench': {
+                    'subset_list': ['v4_v5'],
+                    'extra_params': {
+                        'start_date': '2024-12-01',
+                        'end_date': '2025-01-01'
+                    },
+                    'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
+                },
+                'general_qa': {
+                    'local_path': 'custom_eval/text/qa',  # custom dataset path
+                    'subset_list': [
+                        'example',  # evaluation subset name, the * in the *_dev.csv files above
+                        # 'test'
+                    ]
+                },
+            },
+            eval_batch_size=5,
+            limit=5,
+            judge_strategy=JudgeStrategy.AUTO,
+            judge_model_args={
+                'model_id': 'qwen2.5-7b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+            },
+            generation_config={
+                'max_new_tokens': 2048,
+                'temperature': 0.0,
+                'seed': 42,
+            }
+        )
+
+        run_task(task_cfg=task_cfg)
+
 if __name__ == '__main__':
     unittest.main()
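The new live_code_bench adapter accepts an extra_params date window, and the judge-model settings (judge_strategy, judge_model_args) appear in test_run_judge_model above. Pulled out of the unittest scaffolding, a standalone sketch might look like the following. The endpoint, model id, key handling, limits, and all dictionary values are copied from that test; running live_code_bench uncommented is an assumption (the shipped test keeps it commented out), and the environment-specific local_path from the test is omitted.

from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.run import run_task

# Expects DASHSCOPE_API_KEY in a local .env file, as in the updated tests.
env = dotenv_values('.env')

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=env.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['live_code_bench'],  # assumption: enabled here, commented out in the shipped test
    dataset_args={
        'live_code_bench': {
            'subset_list': ['v4_v5'],
            'extra_params': {
                'start_date': '2024-12-01',  # date window passed through to the adapter
                'end_date': '2025-01-01'
            },
        }
    },
    judge_strategy=JudgeStrategy.AUTO,  # judge settings taken verbatim from test_run_judge_model
    judge_model_args={
        'model_id': 'qwen2.5-7b-instruct',
        'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'api_key': env.get('DASHSCOPE_API_KEY'),
    },
    eval_batch_size=5,
    limit=5,
)
run_task(task_cfg=task_cfg)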