evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (82)
  1. evalscope/app/app.py +20 -5
  2. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  3. evalscope/backend/rag_eval/utils/embedding.py +2 -4
  4. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  5. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  6. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  7. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  8. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  9. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  10. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  11. evalscope/benchmarks/benchmark.py +1 -0
  12. evalscope/benchmarks/bfcl/__init__.py +0 -0
  13. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0 (new BFCL benchmark adapter; see the sketch after this list)
  14. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  15. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  16. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  17. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  18. evalscope/benchmarks/data_adapter.py +2 -0
  19. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  20. evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
  21. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +1 -0
  23. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  26. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  27. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  29. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  30. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  32. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  34. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  35. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  36. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  37. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  38. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
  39. evalscope/benchmarks/needle_haystack/utils.py +2 -2
  40. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  41. evalscope/benchmarks/race/race_adapter.py +3 -0
  42. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  43. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  44. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  45. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  46. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  48. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  49. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  50. evalscope/collections/evaluator.py +50 -28
  51. evalscope/constants.py +1 -1
  52. evalscope/evaluator/evaluator.py +6 -5
  53. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  54. evalscope/models/adapters/__init__.py +2 -0
  55. evalscope/models/adapters/base_adapter.py +31 -27
  56. evalscope/models/adapters/bfcl_adapter.py +244 -0
  57. evalscope/models/adapters/server_adapter.py +78 -17
  58. evalscope/models/custom/custom_model.py +0 -3
  59. evalscope/models/custom/dummy_model.py +77 -39
  60. evalscope/models/local_model.py +1 -1
  61. evalscope/models/register.py +2 -1
  62. evalscope/perf/arguments.py +2 -0
  63. evalscope/perf/benchmark.py +16 -3
  64. evalscope/perf/plugin/api/openai_api.py +2 -0
  65. evalscope/report/combinator.py +38 -12
  66. evalscope/report/utils.py +24 -1
  67. evalscope/run.py +1 -1
  68. evalscope/summarizer.py +1 -1
  69. evalscope/utils/io_utils.py +59 -2
  70. evalscope/version.py +2 -2
  71. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
  72. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
  73. tests/aigc/test_t2i.py +8 -8
  74. tests/cli/test_all.py +40 -33
  75. tests/cli/test_collection.py +4 -3
  76. tests/cli/test_run.py +36 -21
  77. tests/rag/test_clip_benchmark.py +5 -1
  78. tests/rag/test_mteb.py +46 -2
  79. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  80. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  81. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  82. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
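The largest functional addition in this release is the BFCL (Berkeley Function Calling Leaderboard) v3 benchmark, split across a new benchmark adapter (evalscope/benchmarks/bfcl/bfcl_adapter.py) and a matching model adapter (evalscope/models/adapters/bfcl_adapter.py). Below is a minimal sketch of enabling it, mirroring the configuration exercised in the updated tests/cli/test_run.py further down this diff; the model name is a placeholder and the run_task import path is assumed from evalscope's usual layout, so treat it as illustrative rather than authoritative.

from evalscope.config import TaskConfig  # import path shown in the test diff below
from evalscope.run import run_task       # assumed import path for run_task

# Minimal sketch: run the new bfcl_v3 benchmark on a local model.
task_cfg = TaskConfig(
    model='Qwen/Qwen3-1.7B',              # placeholder model, as used in the updated tests
    datasets=['bfcl_v3'],
    dataset_args={
        'bfcl_v3': {
            'subset_list': ['parallel'],  # subset selected in the updated test
            'extra_params': {
                # 'is_fc_model': False,   # toggle shown (commented out) in the test
            },
        },
    },
    eval_batch_size=10,
    limit=5,
)

run_task(task_cfg=task_cfg)
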
tests/cli/test_run.py CHANGED
@@ -63,7 +63,7 @@ class TestRun(unittest.TestCase):
             f'--model {model} ' \
             f'--datasets {datasets} ' \
             f'--limit {limit} ' \
-            f'--generation-config do_sample=false,temperature=0.0 ' \
+            f'--generation-config do_sample=true,temperature=0.6,max_length=65535,max_new_tokens=65535,max_tokens=65535,n=1,top_p=0.95,top_k=20 ' \
             f"""--dataset-args \'{dataset_args}\' """

         logger.info(f'Start to run command: {cmd_with_args}')
@@ -187,8 +187,11 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='qwen/Qwen2-0.5B-Instruct',
-            datasets=['general_mcq', 'general_qa'],  # data format; the MCQ format is fixed to 'ceval'
+            model='Qwen/Qwen3-0.6B',
+            datasets=[
+                'general_mcq',
+                'general_qa'
+            ],
             dataset_args={
                 'general_mcq': {
                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
@@ -215,16 +218,14 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen3-1.7B',
             datasets=[
-                'iquiz',
+                # 'iquiz',
                 # 'math_500',
                 # 'aime24',
                 # 'competition_math',
                 # 'mmlu',
                 # 'simple_qa',
+                'truthful_qa',
             ],
-            model_args={
-                'device_map': 'auto',
-            },
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
@@ -304,7 +305,7 @@ class TestRun(unittest.TestCase):
                 # 'arc',
                 # 'ceval',
                 # 'hellaswag',
-                # 'general_mcq',
+                'general_mcq',
                 # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
@@ -312,7 +313,8 @@ class TestRun(unittest.TestCase):
                 # 'drop',
                 # 'winogrande',
                 # 'tool_bench',
-                'frames',
+                # 'frames',
+                # 'bfcl_v3',
             ],
             dataset_args={
                 'mmlu': {
@@ -370,25 +372,31 @@ class TestRun(unittest.TestCase):
                     'metric_list': ['AverageRouge']
                 },
                 'super_gpqa': {
-                    # 'subset_list': ['Philosophy', 'Education'],
+                    'subset_list': ['Philosophy', 'Education'],
                     'few_shot_num': 0
                 },
                 'mmlu_redux':{
                     'subset_list': ['abstract_algebra']
                 },
+                'bfcl_v3': {
+                    'subset_list': ['parallel'],
+                    'extra_params': {
+                        # 'is_fc_model': False,
+                    }
+                },
             },
-            eval_batch_size=32,
-            limit=10,
+            eval_batch_size=10,
+            limit=5,
             debug=True,
-            stream=False,
+            stream=True,
             generation_config={
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
                 # 'extra_headers':{'key': 'value'},
            },
-            # ignore_errors=True,
-            # use_cache='outputs/20250519_142106'
+            ignore_errors=False,
+            # use_cache='outputs/20250616_153756'
        )

        run_task(task_cfg=task_cfg)
@@ -434,8 +442,8 @@ class TestRun(unittest.TestCase):
            api_key= env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=[
-                # 'math_500',
-                'aime24',
+                'math_500',
+                # 'aime24',
                # 'competition_math',
                # 'arc',
                # 'gsm8k',
@@ -450,8 +458,15 @@ class TestRun(unittest.TestCase):
                # 'frames',
                # 'docmath',
                # 'needle_haystack',
+                # 'ifeval',
            ],
            dataset_args={
+                'needle_haystack': {
+                    'subset_list': ['english'],
+                    'extra_params': {
+                        'show_score': True,
+                    }
+                },
                'competition_math': {
                    'subset_list': ['Level 4']
                },
@@ -479,8 +494,8 @@ class TestRun(unittest.TestCase):
                }
            },
            eval_batch_size=10,
-            limit=1,
-            judge_strategy=JudgeStrategy.AUTO,
+            limit=3,
+            judge_strategy=JudgeStrategy.LLM,
            judge_worker_num=5,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
@@ -499,9 +514,9 @@ class TestRun(unittest.TestCase):
            },
            timeout=60000,
            stream=True,
-            analysis_report=True,
+            # analysis_report=True,
            # debug=True,
-            # use_cache='outputs/20250602_135859'
+            # use_cache='outputs/20250616_161931'
        )

        run_task(task_cfg=task_cfg)
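Two other changes in this file are worth noting: the CLI test now passes a sampling-enabled --generation-config, and the service-mode test adds a 'needle_haystack' entry with the new 'show_score' extra parameter while switching the judge to JudgeStrategy.LLM. A minimal service-mode sketch built from keys visible in the hunks above follows; the served model name, the API endpoint, the constants import path, and the os.environ lookup are assumptions, and judge_model_args is omitted for brevity.

import os

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy  # assumed import path
from evalscope.run import run_task                       # assumed import path

# Sketch: service-mode evaluation of needle_haystack with per-needle score output.
task_cfg = TaskConfig(
    model='qwen-plus',                                   # placeholder served model, not from this diff
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',  # assumed endpoint
    api_key=os.environ.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['needle_haystack'],
    dataset_args={
        'needle_haystack': {
            'subset_list': ['english'],
            'extra_params': {
                'show_score': True,                      # new toggle exercised in the test above
            },
        },
    },
    eval_batch_size=10,
    limit=3,
    judge_strategy=JudgeStrategy.LLM,
    judge_worker_num=5,
    stream=True,
    timeout=60000,
)

run_task(task_cfg=task_cfg)
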
tests/rag/test_clip_benchmark.py CHANGED

@@ -39,7 +39,11 @@ class TestCLIPBenchmark(unittest.TestCase):
                    'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
                }
            ],
-            'dataset_name': ['muge', 'mnist'],
+            'dataset_name': [
+                'muge',
+                'mnist',
+                'flickr8k'
+            ],
            'split': 'test',
            'batch_size': 128,
            'num_workers': 1,
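For context, the 'dataset_name' list above lives inside the RAGEval CLIP-benchmark task config. A hedged reconstruction of the surrounding config follows; only the keys visible in the hunk come from this diff, while the 'eval_backend', 'tool', 'eval', and 'models' wrapper keys and the run_task import path are assumptions based on evalscope's documented clip_benchmark layout.

from evalscope.run import run_task  # assumed import path

# Sketch: CLIP benchmark run including the newly added 'flickr8k' dataset.
task_cfg = {
    'eval_backend': 'RAGEval',            # assumed wrapper, not visible in the hunk
    'eval_config': {
        'tool': 'clip_benchmark',         # assumed wrapper, not visible in the hunk
        'eval': {
            'models': [                   # assumed key wrapping the model entry shown above
                {
                    'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
                },
            ],
            'dataset_name': ['muge', 'mnist', 'flickr8k'],  # 'flickr8k' is the addition in 0.16.2
            'split': 'test',
            'batch_size': 128,
            'num_workers': 1,
        },
    },
}

run_task(task_cfg)
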
tests/rag/test_mteb.py CHANGED
@@ -121,10 +121,54 @@ class TestMTEB(unittest.TestCase):
                    },
                ],
                'eval': {
-                    'tasks': ['MedicalRetrieval', 'T2Retrieval'],
+                    'tasks': [
+                        'MedicalRetrieval',
+                        'T2Retrieval'
+                    ],
                    'verbosity': 2,
                    'overwrite_results': True,
-                    # 'limits': 10,
+                    'limits': 10,
+                    'top_k': 10,
+                },
+            },
+        }
+
+        run_task(task_cfg)
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_two_stage_api(self):
+        task_cfg = {
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'MTEB',
+                'model': [
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                ],
+                'eval': {
+                    'tasks': [
+                        'MedicalRetrieval',
+                        # 'T2Retrieval'
+                    ],
+                    'verbosity': 2,
+                    'overwrite_results': True,
+                    'limits': 10,
                    'top_k': 10,
                },
            },
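The new test_run_two_stage_api case drives MTEB retrieval entirely through an OpenAI-compatible embedding endpoint, listing two model entries, presumably one per stage. A single-model variant of that config is sketched below; every key comes from the hunk above, with only the os.environ lookup (in place of the test's env.get helper) and the run_task import path assumed.

import os

from evalscope.run import run_task  # assumed import path

# Sketch: API-based MTEB retrieval evaluation with a single embedding model.
task_cfg = {
    'eval_backend': 'RAGEval',
    'eval_config': {
        'tool': 'MTEB',
        'model': [
            {
                'model_name': 'text-embedding-v3',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': os.environ.get('DASHSCOPE_API_KEY', 'EMPTY'),
                'dimensions': 1024,
                'encode_kwargs': {'batch_size': 10},
            },
        ],
        'eval': {
            'tasks': ['MedicalRetrieval'],
            'verbosity': 2,
            'overwrite_results': True,
            'limits': 10,
            'top_k': 10,
        },
    },
}

run_task(task_cfg)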