evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py

```diff
@@ -1,88 +1,138 @@
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import convert_numpy_types
 from evalscope.utils.logger import get_logger

 logger = get_logger()


-        prompt_template=
-        '### Question:\n{question_content}\n\n{format_prompt} ### Answer: (use the provided format with backticks)\n\n', # noqa: E501
+@register_benchmark(
+    BenchmarkMeta(
+        name='live_code_bench',
+        pretty_name='Live-Code-Bench',
+        tags=[Tags.CODING],
+        description=
+        'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.', # noqa: E501
+        dataset_id='AI-ModelScope/code_generation_lite',
+        subset_list=['release_latest'],
+        metric_list=['Pass@1'],
+        eval_split='test',
+        prompt_template=
+        '### Question:\n{question_content}\n\n{format_prompt} ### Answer: (use the provided format with backticks)\n\n',
+        extra_params={
+            'start_date': None,
+            'end_date': None,
+            'timeout': 6,
+            'debug': False
+        },
+    )
 )
-class LiveCodeBenchAdapter(
+class LiveCodeBenchAdapter(DefaultDataAdapter):
+    """
+    Live Code Bench adapter using the new data processing framework.
+    """

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        filtered_datasets = filter_date(datasets, start_date=self.start_date, end_date=self.end_date)
-        transformed_datasets = [transform(item) for item in tqdm(filtered_datasets, desc='Transforming data')]
-        new_dataset_dict[subset_key] = {self.eval_split: transformed_datasets}
-        return new_dataset_dict
-
-    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate the prompt for the model input.
-        """
-        format_prompt = input_d['format_prompt']
-        question_content = input_d['question_content']
+        self.timeout = self.extra_params.get('timeout', 6)
+        self.debug = self.extra_params.get('debug', False)
+        self.start_date = self.extra_params.get('start_date')
+        self.end_date = self.extra_params.get('end_date')
+
+        self.save_metadata = False  # Don't save metadata, since they are large
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        from .load_utils import transform
+
+        record = transform(record)
+
+        question_content = record['question_content']
+        format_prompt = record['format_prompt']
         full_prompt = self.prompt_template.format(question_content=question_content, format_prompt=format_prompt)

-        return
+        return Sample(
+            input=[ChatMessageUser(content=full_prompt)],
+            target='',
+            metadata={
+                'evaluation_sample': record['evaluation_sample'],
+                'contest_date': record['contest_date']
+            }
+        )

+    def sample_filter(self, sample):
+        from .load_utils import filter_date
+
+        return filter_date(sample.metadata['contest_date'], start_date=self.start_date, end_date=self.end_date)
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        """Extract code from the prediction."""
         from .extract_utils import extract_code_generation
+        return extract_code_generation(prediction)

-            num_process_evaluate=1,
-            timeout=self.timeout,
-            debug=self.debug,
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        from .evaluate_utils import codegen_metrics
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
         )
+
+        references = [{'input_output': task_state.metadata['evaluation_sample']}]
+        predictions = [[filtered_prediction]]
+
+        try:
+            metrics, eval_results, final_metadata = codegen_metrics(
+                references,
+                predictions,
+                k_list=[1],
+                num_process_evaluate=1,
+                timeout=self.timeout,
+                debug=self.debug,
+            )
+            pass_rate = metrics['pass@1'] / 100  # convert to point scale
+
+            score.value = {'pass': float(pass_rate > 0)}
+            score.explanation = f"Pass@1: {metrics['pass@1']}%"
+
+            # Convert numpy types to native Python types for JSON serialization
+            serializable_eval_results = convert_numpy_types(eval_results)
+            serializable_final_metadata = convert_numpy_types(final_metadata)
+
+            score.metadata = {
+                'pass_rate': float(pass_rate),
+                'timeout': self.timeout,
+                'debug': self.debug,
+                'eval_results': serializable_eval_results,
+                'final_metadata': serializable_final_metadata
+            }
+        except Exception as e:
+            score.value = {'pass': False}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+
+        score.main_score_name = 'pass'
+        return score
+
+    def aggregate_scores(self, sample_scores):
+        from evalscope.metrics.metric import PassAtK
+
+        # calculate pass@k here
+        agg_list = []
+        for metric in self.metric_list:
+            if metric.lower().startswith('pass@'):
+                k = int(metric.split('@')[1])
+                # Get the scores for this metric
+                agg = PassAtK(k)
+                agg_list.extend(agg(sample_scores))
+        return agg_list
```
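For reference, the pattern above (a `BenchmarkMeta` passed to `@register_benchmark`, with `DefaultDataAdapter` hooks such as `record_to_sample` and `extract_answer`) is what a custom benchmark would follow under the new API. The sketch below is not part of this diff; the benchmark name `my_qa` and dataset id `my-org/my-qa-dataset` are hypothetical placeholders, and only names that appear in the diff above are assumed to exist.

```python
# Minimal sketch of a custom benchmark using the registration pattern shown above.
# 'my_qa' and 'my-org/my-qa-dataset' are hypothetical placeholders.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_qa',                       # hypothetical benchmark name
        pretty_name='My-QA',
        tags=[Tags.KNOWLEDGE],
        description='Toy QA benchmark used to illustrate the adapter hooks.',
        dataset_id='my-org/my-qa-dataset',  # hypothetical dataset id
        metric_list=['acc'],
        eval_split='test',
        prompt_template='{question}',
    )
)
class MyQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record to a Sample, mirroring record_to_sample above.
        return Sample(input=record['question'], target=record['answer'])

    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
        # Post-process the raw model output before it is scored.
        return prediction.strip()
```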
evalscope/benchmarks/live_code_bench/load_utils.py

```diff
@@ -32,8 +32,8 @@ def transform(item):
         private_test_cases = json.loads(item['private_test_cases'])
     except Exception as e:  # noqa: F841
         private_test_cases = json.loads(
-            pickle.loads(zlib.decompress(base64.b64decode(private_test_cases.encode('utf-8'))
+            pickle.loads(zlib.decompress(base64.b64decode(private_test_cases.encode('utf-8'))))
+        )

     # load metadata
     metadata = json.loads(item['metadata'])
@@ -47,25 +47,17 @@ def transform(item):
     return item


-def filter_date(
-    new_dataset = []
-
-    for item in dataset:
-        contest_date = datetime.fromisoformat(item['contest_date'])
-        if start_date is not None:
-            p_start_date = datetime.strptime(start_date, '%Y-%m-%d')
-            if p_start_date > contest_date:
-                continue
+def filter_date(contest_date, start_date=None, end_date=None) -> bool:

+    contest_date = datetime.fromisoformat(contest_date)
+    if start_date is not None:
+        p_start_date = datetime.strptime(start_date, '%Y-%m-%d')
+        if p_start_date > contest_date:
+            return False

-    logger.info(
-        f'Filtered dataset with start_date: {start_date}, end_date: {end_date}, remaining items: {len(new_dataset)}'
-    )
-    return new_dataset
+    if end_date is not None:
+        p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
+        if p_end_date < contest_date:
+            return False
+
+    return True
```
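The old `filter_date` filtered a whole dataset list and logged the remaining count; the new version is called per sample (via `sample_filter` in the adapter above) and returns a bool. A quick illustration of the new semantics, assuming the module path implied by the file list:

```python
from evalscope.benchmarks.live_code_bench.load_utils import filter_date

# Contest date inside the start/end window -> sample is kept
filter_date('2024-03-15T00:00:00', start_date='2024-01-01', end_date='2024-06-30')  # True
# Contest ran before the window start -> sample is dropped
filter_date('2023-12-31T00:00:00', start_date='2024-01-01')                         # False
# No bounds given -> everything passes
filter_date('2023-12-31T00:00:00')                                                  # True
```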
evalscope/benchmarks/live_code_bench/testing_util.py

```diff
@@ -4,18 +4,22 @@ import faulthandler
 import json
 import numpy as np
 import platform
+
 # to run the solution files we're using a timing based approach
 import signal
 import sys
 import time
+
 # used for debugging to time steps
 from datetime import datetime
 from decimal import Decimal
 from enum import Enum
 from functools import partial
 from io import StringIO
+
 # from pyext import RuntimeModule
 from types import ModuleType
+
 # used for testing the code that reads from input
 from unittest.mock import mock_open, patch

@@ -342,8 +346,8 @@ def grade_stdio(
             return all_results, WA_send_args

         for output_line_idx, (
-
-
+            stripped_prediction_line,
+            stripped_gt_out_line,
         ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
             WA_send_args['error_message'] = (
                 f'Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}'
```
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py

```diff
@@ -1,82 +1,56 @@
 from typing import Any

-        '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}', # noqa: E501
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+
+MARITIME_PROMPT_TEMPLATE = '请回答单选题。要求只输出选项,不输出解释,将选项放在[]里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:[A]\n 当前题目\n {question}\n选项:\n{choices}' # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='maritime_bench',
+        pretty_name='MaritimeBench',
+        tags=[Tags.CHINESE, Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+        description=
+        'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.', # noqa: E501
+        dataset_id='HiDolphin/MaritimeBench',
+        metric_list=['acc'],
+        few_shot_num=0,
+        eval_split='test',
+        prompt_template=MARITIME_PROMPT_TEMPLATE,
+    )
 )
-class MaritimeBenchAdapter(
+class MaritimeBenchAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the raw model prediction (pred).
-
-        Args:
-            pred: model prediction. Depending on the model.
-
-        Returns:
-            The parsed prediction. e.g. model answer... Depending on the model.
-        """
-
-        return ResponseParser.parse_bracketed_answer(result, options=self.choices)
-
-    def match(self, gold: Any, pred: Any) -> Any:
-        """
-        Match the gold answer with the predicted answer.
-
-        Args:
-            gold: The gold answer.
-            pred: The predicted answer.
-
-        Returns:
-            The result of the match.
-        """
-        return exact_match(gold=gold, pred=pred)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record) -> Sample:
+        # Extract available choices from the record
+        choices = []
+        choice_letters = ['A', 'B', 'C', 'D']
+        for letter in choice_letters:
+            if letter in record and record[letter]:
+                choices.append(record[letter])
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=record['answer'],
+        )
+
+    def format_prompt_template(self, sample):
+        choices = '\n'.join([f'{chr(65 + i)}. {choice}' for i, choice in enumerate(sample.choices)])
+        return MARITIME_PROMPT_TEMPLATE.format(question=sample.input, choices=choices)
+
+    def extract_answer(self, prediction, task_state):
+        # use regex to extract the answer from the prediction
+        import re
+        match = re.search(r'\[([A-D])\]', prediction)
+        if match:
+            return match.group(1)
+        return ''
```
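The answer marker changes from `<A>` in the old prompt to `[A]` in the new one, and extraction is now a plain regex on the bracketed letter rather than `ResponseParser.parse_bracketed_answer`. A small standalone check of that extraction logic (re-stated here for illustration only, not a call into evalscope):

```python
import re

def extract_bracketed_choice(prediction: str) -> str:
    # Same pattern as MaritimeBenchAdapter.extract_answer above.
    match = re.search(r'\[([A-D])\]', prediction)
    return match.group(1) if match else ''

print(extract_bracketed_choice('答:[B]'))     # 'B'
print(extract_bracketed_choice('答:<B>'))     # '' -> old-style angle-bracket markers no longer match
```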
evalscope/benchmarks/math_500/math_500_adapter.py

```diff
@@ -1,58 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict

+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger

 logger = get_logger()


+@register_benchmark(
+    BenchmarkMeta(
+        name='math_500',
+        pretty_name='MATH-500',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        "MATH-500 is a benchmark for evaluating mathematical reasoning capabilities of AI models. It consists of 500 diverse math problems across five levels of difficulty, designed to test a model's ability to solve complex mathematical problems by generating step-by-step solutions and providing the correct final answer.", # noqa: E501
+        dataset_id='AI-ModelScope/MATH-500',
+        subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
 )
-class Math500Adapter(
+class Math500Adapter(DefaultDataAdapter):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Extract the gold answer from the input dict.
-        return strip_answer_string(input_d['answer'])
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-        """
-        # Note: Use same extraction method for both of checkpoint/service/custom
-        result = strip_answer_string(extract_answer(result))
-        return result
-
-    def match(self, gold: str, pred: str) -> float:
-        res = math_equal(pred, gold)
-        return 1.0 if res else 0.0
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['problem'],
+            target=record['answer'],
+            subset_key=f"Level {record['level']}",
+            metadata={
+                'question_id': record['unique_id'],
+                'solution': record['solution'],
+            },
+        )
```
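Answer parsing and matching (`strip_answer_string`, `extract_answer`, `math_equal`) no longer live in this adapter; it now only maps records to `Sample` objects and routes each problem to its difficulty subset via `subset_key`. For illustration, a hypothetical MATH-500 record would map as sketched below (field names are taken from `record_to_sample` above; the values are made up):

```python
# Hypothetical record; keys match those read by Math500Adapter.record_to_sample.
record = {
    'problem': 'What is $1+1$?',
    'answer': '2',
    'level': 3,
    'unique_id': 'test/algebra/0.json',
    'solution': '$1+1=2$.',
}
# record_to_sample(record) would yield a Sample with:
#   input      = 'What is $1+1$?'
#   target     = '2'
#   subset_key = 'Level 3'
#   metadata   = {'question_id': 'test/algebra/0.json', 'solution': '$1+1=2$.'}
```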
evalscope/benchmarks/math_vista/__init__.py — file without changes
evalscope/benchmarks/math_vista/math_vista_adapter.py (new file)

```diff
@@ -0,0 +1,129 @@
+# flake8: noqa: E501
+import re
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+logger = get_logger()
+
+SUBSET_LIST = ['default']
+
+OPEN_PROMPT = """
+Solve the following problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+{question}
+
+Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
+"""
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+MULTI_CHOICE_TYPE = 'multi_choice'
+OPEN_TYPE = 'free_form'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='math_vista',
+        pretty_name='MathVista',
+        dataset_id='evalscope/MathVista',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'MathVista is a consolidated Mathematical reasoning benchmark within Visual contexts. It consists of three newly created datasets, IQTest, FunctionQA, and PaperQA, which address the missing visual domains and are tailored to evaluate logical reasoning on puzzle test figures, algebraic reasoning over functional plots, and scientific reasoning with academic paper figures, respectively. It also incorporates 9 MathQA datasets and 19 VQA datasets from the literature, which significantly enrich the diversity and complexity of visual perception and mathematical reasoning challenges within our benchmark. In total, MathVista includes 6,141 examples collected from 31 different datasets.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='testmini',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class MathVistaAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        content_list, answers_list = MathVistaAdapter.create_content_and_answers_list(record)
+
+        if record['question_type'] == 'multi_choice':
+            label_answer = self.get_option_label(answers_list, record['answer'])
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                choices=answers_list,
+                target=label_answer,
+                metadata={
+                    'question_type': record['question_type'],
+                    'answer_type': record['answer_type'],
+                    **record['metadata'],
+                }
+            )
+        elif record['question_type'] == 'free_form':
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                target=record['answer'],
+                metadata={
+                    'precision': record['precision'],
+                    'question_type': record['question_type'],
+                    'answer_type': record['answer_type'],
+                    **record['metadata'],
+                }
+            )
+        else:
+            raise ValueError(f"Unexpected question_type: {record['question_type']}")
+
+    def get_option_label(self, options, value):
+        try:
+            index = options.index(value)
+            return chr(ord('A') + index)
+        except ValueError:
+            logger.warning(f"Answer '{value}' not found in options: {options}. This may cause evaluation issues.")
+            return value
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        question_type = task_state.metadata['question_type']
+        if question_type == MULTI_CHOICE_TYPE:
+            answers = parse_answers(task_state)
+            return ''.join(sorted(list(answers)))
+        elif question_type == OPEN_TYPE:
+            pattern = r'ANSWER:\s*(.*)'
+            match = re.search(pattern, prediction)
+            if match:
+                return match.group(1).strip()
+            return ''
+        else:
+            raise ValueError(f'Unsupported question type: {question_type}')
+
+    @staticmethod
+    def create_content_and_answers_list(record: dict[str, Any], ) -> tuple[list[Content], list[str]]:
+        """
+        Create a list of content elements and a list of answers from a record.
+
+        Args:
+            record (dict): The record containing question, images, and options.
+
+        Returns:
+            tuple: A tuple containing:
+                - content_list (list): A list of content elements (text and images).
+                - answers_list (list): A list of possible answers (for multiple-choice questions).
+        """
+        question_type = record['question_type']
+        if question_type == MULTI_CHOICE_TYPE:
+            answers_list = record['choices']
+            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: list[Content] = [ContentText(text=input_text)]
+        else:
+            answers_list: list[str] = []
+            content_list: list[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+        image = record['decoded_image']
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return content_list, answers_list
```
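For free-form questions the adapter expects the model to end with an `ANSWER: ...` line, and for multiple-choice answers it maps the raw answer string back to its choice letter. The snippet below re-states those two steps in isolation (standalone helpers for illustration, not calls into evalscope), assuming nothing beyond what the code above shows:

```python
import re

def extract_free_form_answer(prediction: str) -> str:
    # Same pattern used by MathVistaAdapter.extract_answer for OPEN_TYPE.
    match = re.search(r'ANSWER:\s*(.*)', prediction)
    return match.group(1).strip() if match else ''

def option_label(options: list, value) -> str:
    # Same mapping as MathVistaAdapter.get_option_label, minus the logging fallback.
    try:
        return chr(ord('A') + options.index(value))
    except ValueError:
        return value

print(extract_free_form_answer('The area is 12.\nANSWER: 12'))  # '12'
print(option_label(['(0, 2)', '(2, 0)', '(4, 4)'], '(2, 0)'))   # 'B'
```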