evalscope 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +4 -5
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +26 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +1 -1
- evalscope/config.py +6 -3
- evalscope/constants.py +1 -0
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +19 -46
- evalscope/perf/benchmark.py +64 -90
- evalscope/perf/main.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/db_util.py +5 -2
- evalscope/run.py +14 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/METADATA +42 -78
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/RECORD +45 -37
- tests/cli/test_all.py +33 -24
- tests/cli/test_run.py +69 -22
- tests/perf/test_perf.py +23 -0
- tests/rag/test_ragas.py +4 -1
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
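
The headline additions in the manifest above are three new benchmark adapters (alpaca_eval, arena_hard, mmlu_redux) and a random prompt dataset for the perf tool. As a quick orientation, here is a minimal sketch of driving one of the new adapters through the public run_task API; the dataset name and subset_list value are taken from the test changes shown below, while the model choice and limit are illustrative placeholders:

# Minimal sketch: smoke-test the new mmlu_redux adapter.
# 'mmlu_redux' and 'abstract_algebra' come from tests/cli/test_run.py below;
# the model and limit are placeholders, not part of this release's diff.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    datasets=['mmlu_redux'],
    dataset_args={'mmlu_redux': {'subset_list': ['abstract_algebra']}},
    limit=10,  # evaluate a small sample only
)
run_task(task_cfg=task_cfg)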
tests/cli/test_run.py
CHANGED
(Old-side text that the diff viewer truncated is marked with "…".)

@@ -203,15 +203,16 @@ class TestRun(unittest.TestCase):
         print(res)

     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def …
+    def test_run_one_task(self):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='…
+            model='Qwen/Qwen2.5-0.5B-Instruct',
             datasets=[
+                'iquiz',
                 # 'math_500',
                 # 'aime24',
-                'competition_math'
+                # 'competition_math'
             ],
             dataset_args={
                 'competition_math': {
@@ -223,12 +224,39 @@ class TestRun(unittest.TestCase):

         run_task(task_cfg=task_cfg)

+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_task_loop(self):
+        os.environ['CUDA_VISIBLE_DEVICES'] = '2'
+        from evalscope.config import TaskConfig
+
+        task_cfg1 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model1',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg2 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model2',
+            datasets=['iquiz'],
+            limit=10
+        )
+        task_cfg3 = TaskConfig(
+            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model_id='model3',
+            datasets=['iquiz'],
+            limit=10
+        )
+
+        run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
+
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_server_model(self):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='…
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
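
The new test_run_task_loop above pairs with the change to evalscope/run.py (+14 -2) in this release: run_task evidently now accepts a list of TaskConfig objects, with model_id keeping the runs' outputs apart. A condensed sketch of the same pattern, assuming list input works as the test implies:

# Condensed form of the multi-config pattern from test_run_task_loop.
from evalscope.config import TaskConfig
from evalscope.run import run_task

configs = [
    TaskConfig(
        model='Qwen/Qwen2.5-0.5B-Instruct',
        model_id=f'model{i}',  # distinguishes the three runs' outputs
        datasets=['iquiz'],
        limit=10,
    )
    for i in range(1, 4)
]
run_task(task_cfg=configs)  # each config is evaluated in turn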
@@ -251,10 +279,11 @@ class TestRun(unittest.TestCase):
                 # 'gpqa',
                 # 'arc',
                 # 'ceval',
-                'hellaswag',
+                # 'hellaswag',
                 # 'general_mcq',
-
+                'general_qa'
                 # 'super_gpqa',
+                # 'mmlu_redux'
             ],
             dataset_args={
                 'mmlu': {
@@ -308,23 +337,26 @@ class TestRun(unittest.TestCase):
                         'example',  # evaluation dataset name: the * in the *_dev.csv files above
                         # 'test'
                     ],
-                    'metric_list': ['…
+                    'metric_list': ['AverageRouge']
                 },
                 'super_gpqa': {
                     # 'subset_list': ['Philosophy', 'Education'],
                     'few_shot_num': 0
-                }
+                },
+                'mmlu_redux':{
+                    'subset_list': ['abstract_algebra']
+                },
             },
             eval_batch_size=32,
             limit=15,
-
+            debug=True,
             stream=False,
             generation_config={
                 'temperature': 0,
-                'n': …
+                'n': 2,
                 'max_tokens': 4096,
             },
-
+            use_cache='outputs/20250326_202848',
         )

         run_task(task_cfg=task_cfg)
@@ -365,32 +397,33 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='qwen2.5-…
+            model='qwen2.5-0.5b-instruct',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
                 # 'math_500',
-                'aime24',
+                # 'aime24',
                 # 'competition_math',
                 # 'arc',
                 # 'gsm8k'
                 # 'truthful_qa',
                 # 'simple_qa',
-                # …
+                # 'chinese_simpleqa',
                 # 'live_code_bench',
-                # 'humaneval'
-                # 'general_qa'
+                # 'humaneval',
+                # 'general_qa',
+                # 'alpaca_eval',
+                'arena_hard'
             ],
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4']
                 },
                 'live_code_bench': {
-                    'subset_list': ['v4_v5'],
                     'extra_params': {
-                        'start_date': '2024-…
-                        'end_date': '2025-…
+                        'start_date': '2024-08-01',
+                        'end_date': '2025-02-28'
                     },
                     'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
                 },
@@ -401,20 +434,34 @@ class TestRun(unittest.TestCase):
                         # 'test'
                     ]
                 },
+                'chinese_simpleqa': {
+                    'subset_list': [
+                        '中华文化'
+                    ]
+                },
             },
             eval_batch_size=5,
-            limit=…
+            limit=10,
             judge_strategy=JudgeStrategy.AUTO,
+            judge_worker_num=5,
             judge_model_args={
                 'model_id': 'qwen2.5-7b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096
+                }
             },
             generation_config={
-                'max_new_tokens': …
+                'max_new_tokens': 20000,
                 'temperature': 0.0,
                 'seed': 42,
-                …
+                'n': 1
+            },
+            timeout=60000,
+            stream=True,
+            # use_cache='outputs/20250320_143658'
         )

         run_task(task_cfg=task_cfg)
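
The final hunk switches the test to arena_hard, one of the new judge-based benchmarks, and wires up an LLM judge. Below is a self-contained sketch of just that judge setup, with values copied from the test above; the constants import path and the .env file holding DASHSCOPE_API_KEY are assumptions inferred from the surrounding test code:

# Hedged sketch: LLM-as-judge configuration for the new arena_hard benchmark.
from dotenv import dotenv_values
from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy  # assumed import path
from evalscope.run import run_task

env = dotenv_values('.env')  # expects a line like DASHSCOPE_API_KEY=sk-...

task_cfg = TaskConfig(
    model='qwen2.5-0.5b-instruct',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=env.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['arena_hard'],
    limit=10,
    judge_strategy=JudgeStrategy.AUTO,  # invoke the judge only where needed
    judge_worker_num=5,                 # parallel judge calls
    judge_model_args={
        'model_id': 'qwen2.5-7b-instruct',
        'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'api_key': env.get('DASHSCOPE_API_KEY'),
        'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
    },
)
run_task(task_cfg=task_cfg)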
tests/perf/test_perf.py
CHANGED
@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+from dotenv import dotenv_values

+env = dotenv_values('.env')
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest

@@ -96,6 +98,27 @@ class TestPerf(unittest.TestCase):
         }
         run_perf_benchmark(task_cfg)

+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_perf_local_random(self):
+        from evalscope.perf.arguments import Arguments
+        task_cfg = Arguments(
+            parallel=20,
+            model='Qwen2.5-0.5B-Instruct',
+            url='http://127.0.0.1:8801/v1/chat/completions',
+            api='openai',
+            dataset='random',
+            min_tokens=1024,
+            max_tokens=1024,
+            prefix_length=0,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            number=40,
+            tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+            seed=None,
+            debug= True,
+        )
+        run_perf_benchmark(task_cfg)
+

 if __name__ == '__main__':
     unittest.main(buffer=False)
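
The new test drives the random_dataset.py plugin added in this release (+51 lines), which builds prompts of a controlled token length; that is why tokenizer_path is required. An illustrative sketch of the underlying idea, not the plugin's actual code: sample random token ids and decode them back into text.

# Illustrative only: the rough idea behind a random-token prompt generator.
# This is NOT evalscope's implementation; names and structure are invented.
import random
from transformers import AutoTokenizer

def random_prompt(tokenizer, prompt_len: int) -> str:
    # Decoding random ids yields text of roughly prompt_len tokens;
    # re-tokenizing may not round-trip to exactly the same count.
    ids = [random.randrange(tokenizer.vocab_size) for _ in range(prompt_len)]
    return tokenizer.decode(ids)

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
print(random_prompt(tokenizer, 1024)[:200])

Note seed=None in the test: without a fixed seed, each run benchmarks a fresh set of random prompts.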
tests/rag/test_ragas.py
CHANGED
@@ -1,5 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
 import unittest

 from evalscope.run import run_task

@@ -63,7 +66,7 @@ class TestRAGAS(unittest.TestCase):
             'eval': {
                 'testset_file': 'outputs/testset_chinese_with_answer.json',
                 'critic_llm': {
-                    'model_name_or_path': '…
+                    'model_name_or_path': 'Qwen/Qwen2.5-7B-Instruct',
                 },
                 'embeddings': {
                     'model_name_or_path': 'AI-ModelScope/m3e-base',
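
Both this file and test_perf.py now read credentials through python-dotenv rather than relying on pre-exported environment variables. A minimal sketch of the expected setup; the .env contents shown are placeholders:

# Hedged sketch of the dotenv pattern the updated tests use.
# .env in the working directory is assumed to contain, e.g.:
#   DASHSCOPE_API_KEY=sk-your-key-here
from dotenv import dotenv_values

env = dotenv_values('.env')             # parses .env into a dict; os.environ is untouched
api_key = env.get('DASHSCOPE_API_KEY')  # None if the key is missing
assert api_key, 'DASHSCOPE_API_KEY missing from .env'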
The four remaining files (LICENSE, WHEEL, entry_points.txt, and top_level.txt under the dist-info directory) are unchanged between 0.13.0 and 0.13.2.