evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +40 -30
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +77 -39
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +2 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +99 -16
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +91 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/utils.py +25 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +68 -34
- evalscope/config.py +8 -2
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +40 -28
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +80 -23
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +4 -2
- evalscope/perf/benchmark.py +16 -12
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +40 -6
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +84 -4
- evalscope/run.py +12 -0
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
- tests/aigc/test_t2i.py +48 -11
- tests/cli/test_all.py +14 -3
- tests/cli/test_collection.py +6 -4
- tests/cli/test_run.py +50 -25
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +51 -7
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
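The file list shows a new evalscope/app package (split out of evalscope/report) and new benchmark adapters (bfcl_v3, docmath, frames, needle_haystack). A minimal sketch of enabling the new benchmarks through TaskConfig, mirroring the test changes shown below; the model name, endpoint, subset choices, and limit are illustrative assumptions, not requirements of the release:

# Sketch only: mirrors tests/cli/test_all.py below; model, endpoint, and subsets are illustrative.
import os

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen-plus',  # any model served behind an OpenAI-compatible endpoint
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['frames', 'docmath', 'needle_haystack', 'bfcl_v3'],  # adapters added in 0.16.2
    dataset_args={
        'frames': {'subset_list': ['simpshort_testmini']},
        'bfcl_v3': {'subset_list': ['simple', 'multiple']},
    },
    limit=5,  # keep the smoke run small
)

run_task(task_cfg=task_cfg)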
tests/aigc/test_t2i.py
CHANGED
@@ -11,7 +11,7 @@ from evalscope.run import run_task
 from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger
 
-os.environ['
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 
 logger = get_logger()
 
@@ -28,15 +28,15 @@ class TestRun(unittest.TestCase):
             dataset_args={
                 'general_t2i': {
                     'metric_list': [
-                        'PickScore',
+                        # 'PickScore',
                         'CLIPScore',
-                        'HPSv2Score',
-                        'HPSv2.1Score',
-                        'BLIPv2Score',
-                        'ImageRewardScore',
-                        'VQAScore',
-                        'FGA_BLIP2Score',
-                        'MPS'
+                        # 'HPSv2Score',
+                        # 'HPSv2.1Score',
+                        # 'BLIPv2Score',
+                        # 'ImageRewardScore',
+                        # 'VQAScore',
+                        # 'FGA_BLIP2Score',
+                        # 'MPS'
                     ],
                     'dataset_id': 'custom_eval/multimodal/t2i/example.jsonl',
                 }
@@ -58,9 +58,9 @@ class TestRun(unittest.TestCase):
                 'torch_dtype': 'torch.float16',
             },
             datasets=[
-                'tifa160',
+                # 'tifa160',
                 # 'genai_bench',
-
+                'evalmuse',
                 # 'hpdv2',
             ],
             dataset_args={
@@ -85,3 +85,40 @@ class TestRun(unittest.TestCase):
         )
 
         run_task(task_cfg=task_cfg)
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_benchmark_flux(self):
+
+        task_cfg = TaskConfig(
+            model='black-forest-labs/FLUX.1-dev',  # model on modelscope
+            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
+            model_args={
+                'torch_dtype': 'torch.float16',
+            },
+            datasets=[
+                # 'tifa160',
+                # 'genai_bench',
+                'evalmuse',
+                # 'hpdv2',
+            ],
+            dataset_args={
+                'tifa160': {
+                    'metric_list': [
+                        'PickScore',
+                        # 'CLIPScore',
+                        # 'HPSv2Score',
+                        # 'BLIPv2Score',
+                        # 'ImageRewardScore',
+                        # 'VQAScore',
+                        # 'FGA_BLIP2Score',
+                    ]
+                }
+            },
+            generation_config={
+                'num_inference_steps': 50,
+                'guidance_scale': 3.5
+            },
+            use_cache='outputs/20250520_112314'
+        )
+
+        run_task(task_cfg=task_cfg)
tests/cli/test_all.py
CHANGED
@@ -12,7 +12,7 @@ from evalscope.run import run_task
 from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger
 
-os.environ['
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 
 logger = get_logger()
 
@@ -49,6 +49,10 @@ datasets=[
     'drop',
     'winogrande',
     'tool_bench',
+    'frames',
+    'docmath',
+    'needle_haystack',
+    'bfcl_v3',
 ]
 
 dataset_args={
@@ -123,6 +127,12 @@ dataset_args={
     'mmlu_redux':{
         'subset_list': ['abstract_algebra']
     },
+    'frames':{
+        'subset_list': ['simpshort_testmini']
+    },
+    'bfcl_v3':{
+        'subset_list': ['simple', 'multiple']
+    }
 }
 
 class TestRun(unittest.TestCase):
@@ -131,7 +141,7 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
@@ -145,9 +155,10 @@ class TestRun(unittest.TestCase):
                 'n': 1,
                 'max_tokens': 4096,
             },
+            judge_worker_num=5,
             judge_strategy=JudgeStrategy.AUTO,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
             }
tests/cli/test_collection.py
CHANGED
@@ -72,14 +72,16 @@ class TestCollection(unittest.TestCase):
                 'local_path': 'outputs/mixed_data_test.jsonl'
                 # 'local_path': 'outputs/weighted_mixed_data.jsonl'
             }},
-            limit=
-            judge_strategy=JudgeStrategy.
+            limit=5,
+            judge_strategy=JudgeStrategy.AUTO,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': os.getenv('DASHSCOPE_API_KEY'),
             },
-
+            analysis_report=True,
+            ignore_errors=True,
+            # use_cache='outputs/20250522_204520'
         )
         res = run_task(task_cfg=task_cfg)
         print(res)
tests/cli/test_run.py
CHANGED
@@ -13,7 +13,7 @@ from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger
 
-os.environ['
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 
 logger = get_logger()
 
@@ -63,7 +63,7 @@ class TestRun(unittest.TestCase):
             f'--model {model} ' \
             f'--datasets {datasets} ' \
             f'--limit {limit} ' \
-            f'--generation-config do_sample=
+            f'--generation-config do_sample=true,temperature=0.6,max_length=65535,max_new_tokens=65535,max_tokens=65535,n=1,top_p=0.95,top_k=20 ' \
             f"""--dataset-args \'{dataset_args}\' """
 
         logger.info(f'Start to run command: {cmd_with_args}')
@@ -187,8 +187,11 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
-            datasets=[
+            model='Qwen/Qwen3-0.6B',
+            datasets=[
+                'general_mcq',
+                'general_qa'
+            ],
             dataset_args={
                 'general_mcq': {
                     'local_path': 'custom_eval/text/mcq',  # 自定义数据集路径
@@ -215,16 +218,14 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen3-1.7B',
             datasets=[
-                'iquiz',
+                # 'iquiz',
                 # 'math_500',
                 # 'aime24',
                 # 'competition_math',
                 # 'mmlu',
                 # 'simple_qa',
+                'truthful_qa',
             ],
-            model_args={
-                'device_map': 'auto',
-            },
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
@@ -304,14 +305,16 @@ class TestRun(unittest.TestCase):
                 # 'arc',
                 # 'ceval',
                 # 'hellaswag',
-
+                'general_mcq',
                 # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
                 # 'maritime_bench',
                 # 'drop',
                 # 'winogrande',
-                'tool_bench',
+                # 'tool_bench',
+                # 'frames',
+                # 'bfcl_v3',
             ],
             dataset_args={
                 'mmlu': {
@@ -369,24 +372,31 @@ class TestRun(unittest.TestCase):
                     'metric_list': ['AverageRouge']
                 },
                 'super_gpqa': {
-
+                    'subset_list': ['Philosophy', 'Education'],
                     'few_shot_num': 0
                 },
                 'mmlu_redux':{
                     'subset_list': ['abstract_algebra']
                 },
+                'bfcl_v3': {
+                    'subset_list': ['parallel'],
+                    'extra_params': {
+                        # 'is_fc_model': False,
+                    }
+                },
             },
-            eval_batch_size=
-            limit=
+            eval_batch_size=10,
+            limit=5,
             debug=True,
-            stream=
+            stream=True,
             generation_config={
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
+                # 'extra_headers':{'key': 'value'},
             },
-
-            use_cache='outputs/
+            ignore_errors=False,
+            # use_cache='outputs/20250616_153756'
         )
 
         run_task(task_cfg=task_cfg)
@@ -427,26 +437,36 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
-
+                'math_500',
                 # 'aime24',
                 # 'competition_math',
                 # 'arc',
-                # 'gsm8k'
+                # 'gsm8k',
                 # 'truthful_qa',
                 # 'simple_qa',
-                'chinese_simpleqa',
+                # 'chinese_simpleqa',
                 # 'live_code_bench',
                 # 'humaneval',
                 # 'general_qa',
                 # 'alpaca_eval',
-                # 'arena_hard'
+                # 'arena_hard',
+                # 'frames',
+                # 'docmath',
+                # 'needle_haystack',
+                # 'ifeval',
             ],
             dataset_args={
+                'needle_haystack': {
+                    'subset_list': ['english'],
+                    'extra_params': {
+                        'show_score': True,
+                    }
+                },
                 'competition_math': {
                     'subset_list': ['Level 4']
                 },
@@ -469,13 +489,16 @@ class TestRun(unittest.TestCase):
                         '中华文化'
                     ]
                 },
+                'frames': {
+                    'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
+                }
             },
             eval_batch_size=10,
-            limit=
-            judge_strategy=JudgeStrategy.
+            limit=3,
+            judge_strategy=JudgeStrategy.LLM,
             judge_worker_num=5,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'generation_config': {
@@ -491,7 +514,9 @@ class TestRun(unittest.TestCase):
             },
             timeout=60000,
             stream=True,
-
+            # analysis_report=True,
+            # debug=True,
+            # use_cache='outputs/20250616_161931'
         )
 
         run_task(task_cfg=task_cfg)
tests/rag/test_clip_benchmark.py
CHANGED
@@ -39,7 +39,11 @@ class TestCLIPBenchmark(unittest.TestCase):
                     'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
                 }
             ],
-            'dataset_name': [
+            'dataset_name': [
+                'muge',
+                'mnist',
+                'flickr8k'
+            ],
             'split': 'test',
             'batch_size': 128,
             'num_workers': 1,
tests/rag/test_mteb.py
CHANGED
@@ -46,11 +46,11 @@ class TestMTEB(unittest.TestCase):
             ],
             'eval': {
                 'tasks': [
-
-
+                    'TNews',
+                    'CLSClusteringS2S',
                     'T2Reranking',
-
-
+                    'T2Retrieval',
+                    'ATEC',
                 ],
                 'verbosity': 2,
                 'overwrite_results': True,
@@ -85,7 +85,7 @@ class TestMTEB(unittest.TestCase):
                 ],
                 'verbosity': 2,
                 'overwrite_results': True,
-                'limits':
+                'limits': 10,
             },
         },
     )
@@ -121,10 +121,54 @@ class TestMTEB(unittest.TestCase):
                 },
             ],
             'eval': {
-                'tasks': [
+                'tasks': [
+                    'MedicalRetrieval',
+                    'T2Retrieval'
+                ],
+                'verbosity': 2,
+                'overwrite_results': True,
+                'limits': 10,
+                'top_k': 10,
+            },
+        },
+    }
+
+        run_task(task_cfg)
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_two_stage_api(self):
+        task_cfg = {
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'MTEB',
+                'model': [
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                ],
+                'eval': {
+                    'tasks': [
+                        'MedicalRetrieval',
+                        # 'T2Retrieval'
+                    ],
                 'verbosity': 2,
                 'overwrite_results': True,
-
+                'limits': 10,
                 'top_k': 10,
             },
         },