evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/process_bench/process_bench_adapter.py

@@ -1,100 +1,168 @@
+# flake8: noqa: E501
 import re
-from typing import Any, List
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
-cur_path = os.path.dirname(os.path.abspath(__file__))
-@Benchmark.register(
-    name='process_bench',
-    pretty_name='ProcessBench',
-    tags=['Mathematical', 'Reasoning'],
-    description=
-    'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
-    dataset_id='Qwen/ProcessBench',
-    subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
-    metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-)
-class ProcessBenchAdapter(DataAdapter):
+from typing import Any, Dict, List

+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.metric.scorer import AggScore, SampleScore
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger

+logger = get_logger()

-metric_registry.register(Metric(name='error_acc', object=mean))
-metric_registry.register(Metric(name='correct_acc', object=mean))
-metric_registry.register(Metric(name='simple_f1_score', object=simple_f1_score))
+CRITIQUE_TEMPLATE = """The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):

-        # default load all levels
-        kwargs['split_as_subset'] = True
-        data_dict = super().load(**kwargs)
-        return data_dict
+[Math Problem]

+{problem}

-        steps = input_d['steps']
-        tagged_response = ''
-        for sdx, step in enumerate(steps):
-            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
-        tagged_response = tagged_response.strip()
+[Solution]

+{tagged_response}

+Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").

+Please put your final answer (i.e., the index) in \boxed{{}}.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='process_bench',
+        pretty_name='ProcessBench',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
+        dataset_id='Qwen/ProcessBench',
+        subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
+        metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
+        eval_split='test',
+        prompt_template=CRITIQUE_TEMPLATE
+    )
+)
+class ProcessBenchAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.split_as_subset = True  # Use split as subset

+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
         """
+        problem = record['problem']
+        steps = record['steps']
+        tagged_response = ''
+        for sdx, step in enumerate(steps):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        return Sample(
+            input=problem,
+            target=str(record['label']),
+            metadata={
+                'steps': steps,
+                'tagged_response': tagged_response,
+                'final_answer_correct': record['final_answer_correct']
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        """Format the prompt template with problem and tagged response."""
+        problem = sample.input
+        tagged_response = sample.metadata['tagged_response']
+        return self.prompt_template.format(problem=problem, tagged_response=tagged_response)
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
+        """Extract the answer from the model prediction."""
+        pred = self._extract_answer_from_text(prediction)
         try:
             pred = int(pred)
         except Exception:
             pred = None
         return pred

+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Calculate evaluation scores by comparing prediction with reference."""
+        score = Score(
+            extracted_prediction=str(filtered_prediction) if filtered_prediction is not None else None,
+            prediction=original_prediction,
+        )
+
+        # Convert filtered_prediction to int if possible
+        try:
+            pred_int = int(filtered_prediction) if filtered_prediction is not None else None
+        except (ValueError, TypeError):
+            pred_int = None
+
+        # Calculate accuracy
+        reference = int(reference) if reference is not None else None
+        accuracy = 1.0 if reference == pred_int else 0.0
+
+        # Determine metric name based on label
+        if reference == -1:
+            metric_name = 'correct_acc'
+        else:
+            metric_name = 'error_acc'
+
+        score.value = {metric_name: accuracy}
+        score.main_score_name = metric_name
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores to compute final metrics including F1 score."""
+        correct_scores = []
+        error_scores = []
+
+        for sample_score in sample_scores:
+            score = sample_score.score
+            if 'correct_acc' in score.value:
+                correct_scores.append(score.value['correct_acc'])
+            elif 'error_acc' in score.value:
+                error_scores.append(score.value['error_acc'])
+
+        agg_list = []
+
+        if correct_scores:
+            agg_list.append(
+                AggScore(
+                    metric_name='correct_acc', score=sum(correct_scores) / len(correct_scores), num=len(correct_scores)
+                )
+            )
+
+        if error_scores:
+            agg_list.append(
+                AggScore(metric_name='error_acc', score=sum(error_scores) / len(error_scores), num=len(error_scores))
+            )
+
+        # Calculate simple F1 score
+        if correct_scores and error_scores:
+            from evalscope.metrics import simple_f1_score
+            agg_list.append(
+                AggScore(
+                    metric_name='simple_f1_score',
+                    score=simple_f1_score((correct_scores, error_scores)),
+                    num=len(correct_scores) + len(error_scores)
+                )
+            )
+
+        return agg_list

     @staticmethod
+    def _extract_answer_from_text(solution_text: str):
+        """Extract answer from solution text using boxed pattern."""
         boxed_pattern = r'\\boxed\{([^}]*)\}'
         matches = re.findall(boxed_pattern, solution_text)
         if matches:
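The hunk above illustrates the 1.0 migration pattern: the legacy `@Benchmark.register(...)` decorator and `DataAdapter` base class give way to `register_benchmark(BenchmarkMeta(...))` plus `DefaultDataAdapter`, with per-sample conversion moved into `record_to_sample`. Below is a minimal sketch of a custom adapter in this style, inferred from the diff; the benchmark name, dataset id, and record fields are hypothetical placeholders, and the exact `BenchmarkMeta` signature is assumed from the arguments shown here.

```python
# Sketch of a custom benchmark under the new adapter API. The evalscope imports
# and keyword arguments mirror the hunk above; the benchmark name, dataset_id,
# and record fields are hypothetical placeholders.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_custom_qa',  # hypothetical benchmark name
        pretty_name='MyCustomQA',
        tags=[Tags.REASONING],
        description='Illustrative question-answering benchmark.',
        dataset_id='my-org/my-custom-qa',  # hypothetical dataset id
        metric_list=['acc'],
        eval_split='test',
        prompt_template='{question}',
    )
)
class MyCustomQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record onto the unified Sample structure.
        return Sample(
            input=record['question'],  # hypothetical field names
            target=record['answer'],
            metadata={'source': record.get('source')},
        )
```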
evalscope/benchmarks/race/race_adapter.py

@@ -1,135 +1,49 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
-from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate

 # flake8: noqa

 logger = get_logger()


+@register_benchmark(
+    BenchmarkMeta(
+        name='race',
+        pretty_name='RACE',
+        tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+        description=
+        'RACE is a benchmark for testing reading comprehension and reasoning abilities of neural models. It is constructed from Chinese middle and high school examinations.',  # noqa: E501
+        dataset_id='evalscope/race',
+        metric_list=['acc'],
+        subset_list=['high', 'middle'],
+        few_shot_num=3,
+        train_split='train',
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
+class RACEAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
-        few_shot_num = kwargs.get('few_shot_num', 3)
-        if few_shot_num > 3:
-            logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
-            kwargs['few_shot_num'] = 3
         super().__init__(**kwargs)

-        data_dict = {}
-        for subset_name in subset_list:
-            data_dict[subset_name] = {}
-            for split in [self.train_split, self.eval_split]:
-                if os.path.exists(dataset_name_or_path):
-                    file_path = os.path.join(dataset_name_or_path, subset_name, f'{split}.jsonl')
-                else:
-                    file_path = os.path.join(work_dir, dataset_name_or_path, subset_name, f'{split}.jsonl')
-                if os.path.exists(file_path):
-                    data_dict[subset_name][split] = jsonl_to_list(file_path)
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw input, unify the prompt format for RACE benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the RACE:
-
-            {'example_id': 'high3680.txt',
-             'article': 'Astronauts on shorter shuttle missions often work very long days. Tasks are scheduled so tightly that break times are often used to finish the day's work. This type of schedule is far too demanding for long missions on the International Space Station(ISS). ISS crewmembers usually live in space for at least a quarter of a year. They work five days on and two days off to _ the normal way they do things on Earth as much as possible. Weekends give the crew valuable time to rest and do a few hours of housework. They can communicate with family and friends by email , internet phone and through private video conferences. While astronauts cannot go to a baseball game or a movie in orbit, there are many familiar activities that they can still enjoy . Before a mission, the family and friends of each ISS crewmember put together a collection of family photos, messages, videos and reading material for the astronauts to look at when they will be floating 370 kilometers above the Earth. During their mission, the crew also receives care packages with CDs, books, magazines, photos and letters . And as from early 2010, the internet became available on the ISS , giving astronauts the chance to do some "web surfing "in their personal time. Besides relaxing with these more common entertainments, astronauts can simply enjoy the experience of living in space. Many astronauts say that one of the most relaxing things to do in space is to look out the window and stare at the universe and the Earth's vast land mass and oceans.',
-             'answer': 'C',
-             'question': 'The passage mainly discusses how astronauts _ .',
-             'options': [
-                 "work for longer missions in space",
-                 "connect with people on the Earth",
-                 "spend their free time in space",
-                 "observe the Earth from space"]}
-
-        Returns:
-            {'data': [(context, continuation), ...]}
-
-        """
-        prompt = 'The following are multiple choice reading comprehension questions (with answers).\n\n'.format(
-            self._format_subject(subset_name))
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
-        context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context
-
-        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('answer', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: The evaluation type. e.g. 'checkpoint' or 'service' or 'custom'.
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
-
-        input_choices: list = input_d['options']
-
-        example: str = 'Article:\n{}\nQuestion:\n{}'.format(input_d['article'], input_d['question'])
-        for j in range(len(self.choices)):
-            example += '\n{}. {}'.format(self.choices[j], input_choices[j])
-
-        example += '\nAnswer:'
-        if include_answer:
-            example += ' {}\n\n'.format(input_d['answer'])
+        if self.few_shot_num > 3:
+            logger.warning(f'few_shot_num <= 3 for RACE, but got {self.few_shot_num}. Use 3-shot by default.')
+            self.few_shot_num = 3

+    def record_to_sample(self, record) -> Sample:
+        # Format the article and question as context
+        context = f"Article:\n{record['article']}\nQuestion:\n{record['question']}"

-        return s
+        return Sample(
+            input=context,
+            choices=record['options'],
+            target=record['answer'],
+            metadata={'example_id': record.get('example_id', 'unknown')},
+        )
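In the RACE hunk, the multiple-choice path follows the same pattern but subclasses `MultiChoiceAdapter` and returns `Sample` objects that carry a `choices` list, while prompt construction (previously hand-built in `gen_prompt` and `_generate_prompt`) appears to be delegated to the base class and the shared `MultipleChoiceTemplate`. A hedged sketch with hypothetical dataset fields:

```python
# Sketch of the multiple-choice variant: choices live on the Sample, and the
# selected MultipleChoiceTemplate drives prompt rendering. Names marked
# "hypothetical" are not part of evalscope.
from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',  # hypothetical benchmark name
        pretty_name='MyMCQ',
        tags=[Tags.MULTIPLE_CHOICE],
        description='Illustrative multiple-choice benchmark.',
        dataset_id='my-org/my-mcq',  # hypothetical dataset id
        metric_list=['acc'],
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        # Question stem as input, option strings as choices, gold letter as target.
        return Sample(
            input=record['question'],  # hypothetical field names
            choices=record['options'],
            target=record['answer'],
        )
```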
evalscope/benchmarks/simple_qa/simple_qa_adapter.py

@@ -1,13 +1,15 @@
+import ast
 import re
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

-# flake8: noqa
 logger = get_logger()

 GRADER_TEMPLATE = """

@@ -76,7 +78,7 @@ Also note the following things:
 - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".


-Here is a new example. Simply reply with either CORRECT, INCORRECT,
+Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT_ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
 ```
 Question: {question}
 Gold target: {target}

@@ -92,76 +94,76 @@ Just return the letters "A", "B", or "C", with no text around it.
 """.strip()  # noqa: E501


+@register_benchmark(
+    BenchmarkMeta(
+        name='simple_qa',
+        pretty_name='SimpleQA',
+        tags=[Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'SimpleQA is a benchmark designed to evaluate the performance of language models on simple question-answering tasks. It includes a set of straightforward questions that require basic reasoning and understanding capabilities.',  # noqa: E501
+        dataset_id='AI-ModelScope/SimpleQA',
+        metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template='Answer the question:\n\n{question}'
+    )
+)
+class SimpleQAAdapter(DefaultDataAdapter):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        question = input_d['problem']
-        return self.gen_prompt_data(question)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
-        return result.strip()
-
-    def match(self, gold: str, pred: str) -> float:
-        # simple match
-        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
-        is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
-        is_incorrect = not is_correct
-        is_not_attempted = 0
-        return {
-            'is_correct': is_correct,
-            'is_incorrect': is_incorrect,
-            'is_not_attempted': is_not_attempted,
-        }
+        self._use_llm_judge = True  # Use LLM as a judge by default
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.

+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        question = record['problem']
+        answer = record['answer']
+        metadata = record.get('metadata')
+
+        return Sample(input=question, target=answer, metadata=ast.literal_eval(metadata))
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.input_text
+
+        # Request judge and obtain score
+        prompt = GRADER_TEMPLATE.format(question=question, target=reference, predicted_answer=filtered_prediction)
+        judge_response = self.llm_judge.judge(prompt)
         # parse grading response
+        match = re.search(r'(A|B|C)', judge_response)
         res = match.group(0) if match else 'C'
+
+        # Set score based on the match result
+        score.value = {
             'is_correct': 1 if res == 'A' else 0,
             'is_incorrect': 1 if res == 'B' else 0,
             'is_not_attempted': 1 if res == 'C' else 0,
-            'judge_response': grading_response,
         }
-
-        # zip dict answers
-        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
-
-        return super().compute_metric(res_dict, **kwargs)
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+        score.main_score_name = 'is_correct'
+        return score