PyPI - evalscope - Versions diffs - 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic. Click here for more details.

Files changed (155) hide show

evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
evalscope/api/benchmark/benchmark.py +27 -2
evalscope/api/benchmark/meta.py +3 -0
evalscope/api/evaluator/evaluator.py +5 -0
evalscope/api/evaluator/state.py +5 -0
evalscope/api/messages/chat_message.py +6 -1
evalscope/api/mixin/__init__.py +1 -0
evalscope/api/mixin/llm_judge_mixin.py +2 -0
evalscope/api/mixin/sandbox_mixin.py +204 -0
evalscope/api/model/generate_config.py +0 -3
evalscope/api/model/model.py +1 -1
evalscope/api/tool/tool_info.py +1 -1
evalscope/app/ui/multi_model.py +6 -1
evalscope/app/ui/single_model.py +8 -2
evalscope/app/utils/data_utils.py +3 -2
evalscope/app/utils/visualization.py +2 -2
evalscope/arguments.py +6 -0
evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
evalscope/benchmarks/amc/__init__.py +0 -0
evalscope/benchmarks/amc/amc_adapter.py +46 -0
evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
evalscope/benchmarks/bfcl/generation.py +7 -7
evalscope/benchmarks/blink/__init__.py +0 -0
evalscope/benchmarks/blink/blink_adapter.py +61 -0
evalscope/benchmarks/chartqa/__init__.py +0 -0
evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
evalscope/benchmarks/chartqa/utils.py +38 -0
evalscope/benchmarks/docvqa/__init__.py +0 -0
evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
evalscope/benchmarks/drop/drop_adapter.py +1 -1
evalscope/benchmarks/general_arena/utils.py +2 -1
evalscope/benchmarks/healthbench/__init__.py +0 -0
evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
evalscope/benchmarks/healthbench/utils.py +102 -0
evalscope/benchmarks/hle/hle_adapter.py +3 -2
evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
evalscope/benchmarks/humaneval/utils.py +235 -0
evalscope/benchmarks/infovqa/__init__.py +0 -0
evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
evalscope/benchmarks/minerva_math/__init__.py +0 -0
evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
evalscope/benchmarks/mm_bench/__init__.py +0 -0
evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
evalscope/benchmarks/mm_star/__init__.py +0 -0
evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
evalscope/benchmarks/multi_if/__init__.py +0 -0
evalscope/benchmarks/multi_if/ifeval.py +3354 -0
evalscope/benchmarks/multi_if/metrics.py +120 -0
evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
evalscope/benchmarks/ocr_bench/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
evalscope/benchmarks/olympiad_bench/utils.py +565 -0
evalscope/benchmarks/omni_bench/__init__.py +0 -0
evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
evalscope/benchmarks/real_world_qa/__init__.py +0 -0
evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
evalscope/config.py +24 -1
evalscope/constants.py +3 -0
evalscope/evaluator/evaluator.py +25 -7
evalscope/metrics/metric.py +78 -2
evalscope/metrics/metrics.py +16 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
evalscope/models/model_apis.py +10 -8
evalscope/models/utils/openai.py +1 -2
evalscope/perf/arguments.py +2 -0
evalscope/perf/plugin/api/base.py +2 -2
evalscope/perf/plugin/api/default_api.py +7 -7
evalscope/perf/plugin/api/openai_api.py +83 -19
evalscope/perf/plugin/datasets/flickr8k.py +2 -2
evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
evalscope/perf/utils/benchmark_util.py +1 -2
evalscope/report/__init__.py +9 -1
evalscope/report/combinator.py +45 -20
evalscope/report/report.py +8 -4
evalscope/run.py +1 -1
evalscope/utils/function_utils.py +41 -0
evalscope/utils/import_utils.py +63 -13
evalscope/utils/io_utils.py +19 -11
evalscope/utils/json_schema.py +25 -2
evalscope/utils/logger.py +19 -0
evalscope/utils/model_utils.py +1 -1
evalscope/utils/multi_choices.py +16 -1
evalscope/version.py +2 -2
{evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
{evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
{evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
tests/__init__.py +0 -1
tests/benchmark/__init__.py +0 -1
tests/benchmark/test_eval.py +0 -385
tests/benchmark/test_image_edit.py +0 -65
tests/benchmark/test_t2i.py +0 -142
tests/benchmark/test_vlm.py +0 -80
tests/cli/__init__.py +0 -1
tests/cli/test_all.py +0 -269
tests/cli/test_collection.py +0 -99
tests/cli/test_custom.py +0 -268
tests/cli/test_reasoning.py +0 -81
tests/common.py +0 -73
tests/perf/__init__.py +0 -1
tests/perf/test_perf.py +0 -178
tests/rag/test_clip_benchmark.py +0 -87
tests/rag/test_mteb.py +0 -213
tests/rag/test_ragas.py +0 -128
tests/swift/__init__.py +0 -1
tests/swift/test_run_swift_eval.py +0 -146
tests/swift/test_run_swift_vlm_eval.py +0 -128
tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
tests/test_run_all.py +0 -12
tests/utils.py +0 -13
tests/vlm/__init__.py +0 -1
tests/vlm/test_vlmeval.py +0 -102
{tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
{evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
{evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
{evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0

tests/cli/test_all.py DELETED Viewed

@@ -1,269 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-env = dotenv_values('.env')
-import os
-import unittest
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, OutputType
-from evalscope.run import run_task
-from evalscope.utils.logger import get_logger
-from tests.utils import test_level_list
-os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-logger = get_logger()
-datasets=[
-    'iquiz',
-    'ifeval',
-    'mmlu',
-    'mmlu_pro',
-    'musr',
-    'process_bench',
-    'race',
-    'trivia_qa',
-    'cmmlu',
-    'humaneval',
-    'gsm8k',
-    'bbh',
-    'competition_math',
-    'math_500',
-    'aime24',
-    'gpqa_diamond',
-    'arc',
-    'ceval',
-    'hellaswag',
-    'general_mcq',
-    'general_qa',
-    'super_gpqa',
-    # 'live_code_bench',
-    'mmlu_redux',
-    'simple_qa',
-    'chinese_simpleqa',
-    'alpaca_eval',
-    'arena_hard',
-    'maritime_bench',
-    'drop',
-    'winogrande',
-    'tool_bench',
-    'frames',
-    'docmath',
-    'needle_haystack',
-    'bfcl_v3',
-    'hle',
-    'tau_bench',
-]
-# Reverse the datasets list to ensure the order is from most recent to oldest
-datasets.reverse()
-dataset_args={
-    'mmlu': {
-        'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-        'few_shot_num': 0
-    },
-    'mmlu_pro': {
-        'subset_list': ['math', 'health'],
-        'few_shot_num': 4
-    },
-    'ceval': {
-        'subset_list': [
-            'computer_network', 'operating_system', 'computer_architecture'
-        ],
-        'few_shot_num': 0
-    },
-    'cmmlu': {
-        'subset_list': ['elementary_chinese'],
-        'few_shot_num': 0
-    },
-    'bbh': {
-        'subset_list': ['word_sorting', 'movie_recommendation'],
-    },
-    'gpqa_diamond': {
-        'few_shot_num': 0,
-    },
-    'humaneval': {
-        'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-    },
-    'competition_math': {
-        'subset_list': ['Level 1']
-    },
-    'math_500': {
-        'subset_list': ['Level 1']
-    },
-    'process_bench': {
-        'subset_list': ['gsm8k'],
-    },
-    'musr': {
-        'subset_list': ['murder_mysteries']
-    },
-    'general_mcq': {
-        'local_path': 'custom_eval/text/mcq',  # 自定义数据集路径
-        'subset_list': [
-            'example'  # 评测数据集名称，上述 *_dev.csv 中的 *
-        ],
-    },
-    'general_qa': {
-        'local_path': 'custom_eval/text/qa',  # 自定义数据集路径
-        'subset_list': [
-            'example',  # 评测数据集名称，上述 *_dev.csv 中的 *
-            # 'test'
-        ]
-    },
-    'super_gpqa': {
-        'subset_list': ['Philosophy', 'Education'],
-        'few_shot_num': 0
-    },
-    'live_code_bench': {
-        'subset_list': ['v4_v5'],
-        'extra_params': {
-            'start_date': '2024-12-01',
-            'end_date': '2025-01-01'
-        },
-    },
-    'chinese_simpleqa': {
-        'subset_list': ['中华文化']
-    },
-    'mmlu_redux':{
-        'subset_list': ['abstract_algebra']
-    },
-    'docmath':{
-        'subset_list': ['simpshort_testmini']
-    },
-    'bfcl_v3':{
-        'subset_list': ['simple', 'multiple']
-    },
-    'hle': {
-        'subset_list': ['Math', 'Other'],
-    },
-    'tau_bench': {
-        'extra_params': {
-            'user_model': 'qwen-plus',
-            'api_key': env.get('DASHSCOPE_API_KEY'),
-            'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-        },
-        'subset_list': ['airline'],
-    },
-}
-class TestRun(unittest.TestCase):
-    def test_benchmarks(self):
-        from evalscope.config import TaskConfig
-        task_cfg = TaskConfig(
-            model='qwen-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key= env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=datasets,
-            dataset_args=dataset_args,
-            eval_batch_size=1,
-            limit=1,
-            stream=True,
-            generation_config={
-                'temperature': 0,
-                'n': 1,
-                'max_tokens': 4096,
-            },
-            judge_worker_num=5,
-            judge_strategy=JudgeStrategy.AUTO,
-            judge_model_args={
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-            }
-        )
-        run_task(task_cfg=task_cfg)
-    def test_vlm_benchmark(self):
-        from evalscope.config import TaskConfig
-        task_cfg = TaskConfig(
-            model='qwen-vl-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key= env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=[
-                'mmmu',
-                # 'math_vista',
-            ],
-            dataset_args={
-                'mmmu': {
-                    'subset_list': ['Accounting']
-                },
-                'math_vista': {
-                    'subset_list': ['default']
-                }
-            },
-            eval_batch_size=1,
-            limit=1,
-            stream=True,
-            generation_config={
-                'temperature': 0,
-                'n': 1,
-                'max_tokens': 4096,
-                'image_height': 512,
-                'image_width': 512,
-                'image_num': 2,
-            },
-            judge_worker_num=5,
-            judge_strategy=JudgeStrategy.AUTO,
-            judge_model_args={
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-            }
-        )
-        run_task(task_cfg=task_cfg)
-    def test_ci_lite(self):
-        from evalscope.config import TaskConfig
-        api_key = env.get('DASHSCOPE_API_KEY')
-        task_cfg = TaskConfig(
-            model='qwen-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=api_key,
-            eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
-            datasets=[
-                'general_mcq',
-                'iquiz',
-            ],
-            dataset_args={
-                'general_mcq': {
-                    'local_path': 'custom_eval/text/mcq',
-                    'subset_list': [
-                        'example'
-                    ],
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',
-                    'subset_list': [
-                        'example'
-                    ]
-                }
-            },
-            eval_batch_size=1,
-            limit=1,
-            stream=True,
-            generation_config={
-                'temperature': 0,
-                'n': 1,
-                'max_tokens': 4096,
-            },
-            judge_worker_num=1,
-            judge_strategy=JudgeStrategy.AUTO,
-            judge_model_args={
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-            }
-        )
-        run_task(task_cfg=task_cfg)

tests/cli/test_collection.py DELETED Viewed

@@ -1,99 +0,0 @@
-from dotenv import dotenv_values
-env = dotenv_values('.env')
-import json
-import os
-import unittest
-from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
-from evalscope.constants import EvalType, JudgeStrategy
-from evalscope.utils.io_utils import dump_jsonl_data
-from tests.utils import test_level_list
-class TestCollection(unittest.TestCase):
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_create_collection(self):
-        schema = CollectionSchema(name='math&reasoning', datasets=[
-                    CollectionSchema(name='math', datasets=[
-                        CollectionSchema(name='generation', datasets=[
-                            DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
-                        ]),
-                        CollectionSchema(name='multiple_choice', datasets=[
-                            DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
-                            DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
-                        ]),
-                    ]),
-                    CollectionSchema(name='reasoning', datasets=[
-                        DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
-                        DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
-                        DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
-                    ]),
-                ])
-        print(schema.to_dict())
-        print(schema.flatten())
-        schema.dump_json('outputs/schema_test.json')
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_generate_data(self):
-        schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
-        print(schema.to_dict())
-        mixed_data = WeightedSampler(schema).sample(100)
-        dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_evaluate_collection(self):
-        from evalscope import TaskConfig, run_task
-        task_cfg = TaskConfig(
-            model='qwen-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=['data_collection'],
-            dataset_args={
-                'data_collection': {
-                    # 'local_path': 'outputs/test_mix.jsonl'
-                    'local_path': 'outputs/mixed_data_test.jsonl',
-                    'shuffle': True,
-                }
-            },
-            eval_batch_size=5,
-            generation_config = {
-                'max_tokens': 10000,
-                'temperature': 0.0,
-            },
-            limit=10,
-            # use_cache='outputs/20250822_161804'
-        )
-        run_task(task_cfg=task_cfg)
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_evaluate_collection_with_judge(self):
-        from evalscope import TaskConfig, run_task
-        task_cfg = TaskConfig(
-            model='qwen2.5-7b-instruct',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key= os.getenv('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=['data_collection'],
-            dataset_args={'data_collection': {
-                'local_path': 'outputs/mixed_data_test.jsonl'
-                # 'local_path': 'outputs/weighted_mixed_data.jsonl'
-            }},
-            limit=5,
-            judge_strategy=JudgeStrategy.AUTO,
-            judge_model_args={
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': os.getenv('DASHSCOPE_API_KEY'),
-            },
-            analysis_report=True,
-            ignore_errors=True,
-            # use_cache='outputs/20250522_204520'
-        )
-        res = run_task(task_cfg=task_cfg)
-        print(res)

tests/cli/test_custom.py DELETED Viewed

@@ -1,268 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-from tests.utils import test_level_list
-env = dotenv_values('.env')
-import os
-import subprocess
-import unittest
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, OutputType
-from evalscope.run import run_task
-from evalscope.utils.import_utils import is_module_installed
-from evalscope.utils.logger import get_logger
-os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-logger = get_logger()
-class TestRunCustom(unittest.TestCase):
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_custom_task(self):
-        from evalscope.config import TaskConfig
-        task_cfg = TaskConfig(
-            model='Qwen/Qwen3-0.6B',
-            datasets=[
-                'general_mcq',
-                'general_qa'
-            ],
-            dataset_args={
-                'general_mcq': {
-                    'local_path': 'custom_eval/text/mcq',  # 自定义数据集路径
-                    'subset_list': [
-                        'example'  # 评测数据集名称，上述 *_dev.csv 中的 *
-                    ],
-                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # 问题模板
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',  # 自定义数据集路径
-                    'subset_list': [
-                        'example'  # 评测数据集名称，上述 *_dev.csv 中的 *
-                    ]
-                }
-            },
-        )
-        res = run_task(task_cfg=task_cfg)
-        print(res)
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_local_dataset(self):
-        from evalscope.config import TaskConfig
-        task_cfg = TaskConfig(
-            model='qwen-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key= env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=[
-                # 'mmlu',
-                # 'race',
-                'trivia_qa',
-                # 'cmmlu',
-                # 'humaneval',
-                # 'gsm8k',
-                # 'bbh',
-                # 'competition_math',
-                # 'arc',
-                # 'ceval',
-            ],
-            dataset_args={
-                'mmlu': {
-                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                    'few_shot_num': 0,
-                    'dataset_id': 'data/data/mmlu',
-                },
-                'ceval': {
-                    'subset_list': [
-                        'computer_network', 'operating_system', 'computer_architecture'
-                    ],
-                    'few_shot_num': 0,
-                    'dataset_id': 'data/data/ceval',
-                },
-                'cmmlu': {
-                    'subset_list': ['elementary_chinese'],
-                    'dataset_id': 'data/data/cmmlu',
-                    'few_shot_num': 0
-                },
-                'bbh': {
-                    'subset_list': ['word_sorting', 'movie_recommendation'],
-                },
-                'humaneval': {
-                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                },
-                'trivia_qa': {
-                    'dataset_id': 'data/data/trivia_qa',
-                },
-            },
-            eval_batch_size=10,
-            limit=5,
-            debug=True,
-            stream=True,
-            generation_config={
-                'temperature': 0,
-                'n': 1,
-                'max_tokens': 4096,
-            },
-            ignore_errors=False,
-        )
-        run_task(task_cfg=task_cfg)
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_general_no_answer(self):
-        from evalscope.config import TaskConfig
-        task_cfg = TaskConfig(
-            model='qwen2.5-7b-instruct',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key= env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=[
-                'general_qa',
-            ],
-            dataset_args={
-                'general_qa': {
-                    'dataset_id': 'custom_eval/text/qa',
-                    'subset_list': [
-                        'arena',
-                        # 'example'
-                    ],
-                }
-            },
-            eval_batch_size=10,
-            limit=10,
-            debug=True,
-            stream=True,
-            generation_config={
-                'temperature': 0,
-                'n': 1,
-                'max_tokens': 4096,
-            },
-            ignore_errors=False,
-            judge_model_args={
-                'model_id': 'qwen2.5-7b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 4096
-                },
-                'score_type': 'numeric',
-                'prompt_template': """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
-Begin your evaluation by providing a short explanation. Be as objective as possible.
-After providing your explanation, you must rate the response on a scale of 0 (worst) to 100 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\"
-[Question]
-{question}
-[Response]
-{pred}
-"""
-            },
-            judge_worker_num=5,
-            judge_strategy=JudgeStrategy.LLM,
-        )
-        run_task(task_cfg=task_cfg)
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_general_with_answer(self):
-        from evalscope.config import TaskConfig
-        task_cfg = TaskConfig(
-            model='qwen-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key= env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=[
-                'general_qa',
-            ],
-            dataset_args={
-                'general_qa': {
-                    'dataset_id': 'custom_eval/text/qa',
-                    'subset_list': [
-                        'example'
-                    ],
-                }
-            },
-            eval_batch_size=10,
-            limit=10,
-            debug=True,
-            stream=True,
-            generation_config={
-                'temperature': 0,
-                'n': 1,
-                'max_tokens': 4096,
-            },
-            ignore_errors=False,
-            judge_model_args={
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 4096
-                },
-                'score_type': 'pattern',
-            },
-            judge_worker_num=1,
-            judge_strategy=JudgeStrategy.LLM_RECALL,
-            use_cache='outputs/20250818_170420'
-        )
-        run_task(task_cfg=task_cfg)
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_general_arena(self):
-        from evalscope.config import TaskConfig
-        task_cfg = TaskConfig(
-            model_id='Arena',
-            datasets=[
-                'general_arena',
-            ],
-            dataset_args={
-                'general_arena': {
-                    'extra_params':{
-                        'models':[
-                            {
-                                'name': 'qwen2.5-7b',
-                                'report_path': 'outputs/20250819_165034/reports/qwen2.5-7b-instruct'
-                            },
-                            {
-                                'name': 'qwen2.5-72b',
-                                'report_path': 'outputs/20250819_164926/reports/qwen2.5-72b-instruct'
-                            }
-                        ],
-                        'baseline': 'qwen2.5-72b'
-                    }
-                }
-            },
-            eval_batch_size=10,
-            limit=10,
-            debug=True,
-            stream=True,
-            ignore_errors=False,
-            judge_model_args={
-                'model_id': 'qwen-plus',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 8000
-                },
-            },
-            judge_worker_num=5,
-            # use_cache='outputs/20250819_173546'
-        )
-        run_task(task_cfg=task_cfg)

evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

Potentially problematic release.

evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl