evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/process_bench/process_bench_adapter.py

@@ -1,100 +1,168 @@
+# flake8: noqa: E501
 import re
-from typing import Any, List
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
-cur_path = os.path.dirname(os.path.abspath(__file__))
-@Benchmark.register(
-    name='process_bench',
-    pretty_name='ProcessBench',
-    tags=['Mathematical', 'Reasoning'],
-    description=
-    'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
-    dataset_id='Qwen/ProcessBench',
-    subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
-    metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-)
-class ProcessBenchAdapter(DataAdapter):
+from typing import Any, Dict, List

+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.metric.scorer import AggScore, SampleScore
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger

+logger = get_logger()

-metric_registry.register(Metric(name='error_acc', object=mean))
-metric_registry.register(Metric(name='correct_acc', object=mean))
-metric_registry.register(Metric(name='simple_f1_score', object=simple_f1_score))
+CRITIQUE_TEMPLATE = """The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):

-        # default load all levels
-        kwargs['split_as_subset'] = True
-        data_dict = super().load(**kwargs)
-        return data_dict
+[Math Problem]

+{problem}

-        steps = input_d['steps']
-        tagged_response = ''
-        for sdx, step in enumerate(steps):
-            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
-        tagged_response = tagged_response.strip()
+[Solution]

+{tagged_response}

+Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").

+Please put your final answer (i.e., the index) in \boxed{{}}.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='process_bench',
+        pretty_name='ProcessBench',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
+        dataset_id='Qwen/ProcessBench',
+        subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
+        metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
+        eval_split='test',
+        prompt_template=CRITIQUE_TEMPLATE
+    )
+)
+class ProcessBenchAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.split_as_subset = True  # Use split as subset

+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
         """
+        problem = record['problem']
+        steps = record['steps']
+        tagged_response = ''
+        for sdx, step in enumerate(steps):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        return Sample(
+            input=problem,
+            target=str(record['label']),
+            metadata={
+                'steps': steps,
+                'tagged_response': tagged_response,
+                'final_answer_correct': record['final_answer_correct']
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        """Format the prompt template with problem and tagged response."""
+        problem = sample.input
+        tagged_response = sample.metadata['tagged_response']
+        return self.prompt_template.format(problem=problem, tagged_response=tagged_response)
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
+        """Extract the answer from the model prediction."""
+        pred = self._extract_answer_from_text(prediction)
         try:
             pred = int(pred)
         except Exception:
             pred = None
         return pred

+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Calculate evaluation scores by comparing prediction with reference."""
+        score = Score(
+            extracted_prediction=str(filtered_prediction) if filtered_prediction is not None else None,
+            prediction=original_prediction,
+        )
+
+        # Convert filtered_prediction to int if possible
+        try:
+            pred_int = int(filtered_prediction) if filtered_prediction is not None else None
+        except (ValueError, TypeError):
+            pred_int = None
+
+        # Calculate accuracy
+        reference = int(reference) if reference is not None else None
+        accuracy = 1.0 if reference == pred_int else 0.0
+
+        # Determine metric name based on label
+        if reference == -1:
+            metric_name = 'correct_acc'
+        else:
+            metric_name = 'error_acc'
+
+        score.value = {metric_name: accuracy}
+        score.main_score_name = metric_name
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores to compute final metrics including F1 score."""
+        correct_scores = []
+        error_scores = []
+
+        for sample_score in sample_scores:
+            score = sample_score.score
+            if 'correct_acc' in score.value:
+                correct_scores.append(score.value['correct_acc'])
+            elif 'error_acc' in score.value:
+                error_scores.append(score.value['error_acc'])
+
+        agg_list = []
+
+        if correct_scores:
+            agg_list.append(
+                AggScore(
+                    metric_name='correct_acc', score=sum(correct_scores) / len(correct_scores), num=len(correct_scores)
+                )
+            )
+
+        if error_scores:
+            agg_list.append(
+                AggScore(metric_name='error_acc', score=sum(error_scores) / len(error_scores), num=len(error_scores))
+            )
+
+        # Calculate simple F1 score
+        if correct_scores and error_scores:
+            from evalscope.metrics import simple_f1_score
+            agg_list.append(
+                AggScore(
+                    metric_name='simple_f1_score',
+                    score=simple_f1_score((correct_scores, error_scores)),
+                    num=len(correct_scores) + len(error_scores)
+                )
+            )
+
+        return agg_list

     @staticmethod
+    def _extract_answer_from_text(solution_text: str):
+        """Extract answer from solution text using boxed pattern."""
         boxed_pattern = r'\\boxed\{([^}]*)\}'
         matches = re.findall(boxed_pattern, solution_text)
         if matches:
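The hunk above illustrates the 1.0 migration pattern: the legacy `@Benchmark.register(...)` decorator and `DataAdapter` base class give way to `register_benchmark(BenchmarkMeta(...))` plus `DefaultDataAdapter`, with per-sample conversion moved into `record_to_sample`. Below is a minimal sketch of a custom adapter in this style, inferred from the diff; the benchmark name, dataset id, and record fields are hypothetical placeholders, and the exact `BenchmarkMeta` signature is assumed from the arguments shown here.

```python
# Sketch of a custom benchmark under the new adapter API. The evalscope imports
# and keyword arguments mirror the hunk above; the benchmark name, dataset_id,
# and record fields are hypothetical placeholders.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_custom_qa',  # hypothetical benchmark name
        pretty_name='MyCustomQA',
        tags=[Tags.REASONING],
        description='Illustrative question-answering benchmark.',
        dataset_id='my-org/my-custom-qa',  # hypothetical dataset id
        metric_list=['acc'],
        eval_split='test',
        prompt_template='{question}',
    )
)
class MyCustomQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record onto the unified Sample structure.
        return Sample(
            input=record['question'],  # hypothetical field names
            target=record['answer'],
            metadata={'source': record.get('source')},
        )
```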
evalscope/benchmarks/race/race_adapter.py

@@ -1,135 +1,49 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
-from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate

 # flake8: noqa

 logger = get_logger()


+@register_benchmark(
+    BenchmarkMeta(
+        name='race',
+        pretty_name='RACE',
+        tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+        description=
+        'RACE is a benchmark for testing reading comprehension and reasoning abilities of neural models. It is constructed from Chinese middle and high school examinations.',  # noqa: E501
+        dataset_id='evalscope/race',
+        metric_list=['acc'],
+        subset_list=['high', 'middle'],
+        few_shot_num=3,
+        train_split='train',
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
+class RACEAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
-        few_shot_num = kwargs.get('few_shot_num', 3)
-        if few_shot_num > 3:
-            logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
-            kwargs['few_shot_num'] = 3
         super().__init__(**kwargs)

-        data_dict = {}
-        for subset_name in subset_list:
-            data_dict[subset_name] = {}
-            for split in [self.train_split, self.eval_split]:
-                if os.path.exists(dataset_name_or_path):
-                    file_path = os.path.join(dataset_name_or_path, subset_name, f'{split}.jsonl')
-                else:
-                    file_path = os.path.join(work_dir, dataset_name_or_path, subset_name, f'{split}.jsonl')
-                if os.path.exists(file_path):
-                    data_dict[subset_name][split] = jsonl_to_list(file_path)
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw input, unify the prompt format for RACE benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the RACE:
-
-            {'example_id': 'high3680.txt',
-             'article': 'Astronauts on shorter shuttle missions often work very long days. Tasks are scheduled so tightly that break times are often used to finish the day's work. This type of schedule is far too demanding for long missions on the International Space Station(ISS). ISS crewmembers usually live in space for at least a quarter of a year. They work five days on and two days off to _ the normal way they do things on Earth as much as possible. Weekends give the crew valuable time to rest and do a few hours of housework. They can communicate with family and friends by email , internet phone and through private video conferences. While astronauts cannot go to a baseball game or a movie in orbit, there are many familiar activities that they can still enjoy . Before a mission, the family and friends of each ISS crewmember put together a collection of family photos, messages, videos and reading material for the astronauts to look at when they will be floating 370 kilometers above the Earth. During their mission, the crew also receives care packages with CDs, books, magazines, photos and letters . And as from early 2010, the internet became available on the ISS , giving astronauts the chance to do some "web surfing "in their personal time. Besides relaxing with these more common entertainments, astronauts can simply enjoy the experience of living in space. Many astronauts say that one of the most relaxing things to do in space is to look out the window and stare at the universe and the Earth's vast land mass and oceans.',
-             'answer': 'C',
-             'question': 'The passage mainly discusses how astronauts _ .',
-             'options': [
-                 "work for longer missions in space",
-                 "connect with people on the Earth",
-                 "spend their free time in space",
-                 "observe the Earth from space"]}
-
-        Returns:
-            {'data': [(context, continuation), ...]}
-
-        """
-        prompt = 'The following are multiple choice reading comprehension questions (with answers).\n\n'.format(
-            self._format_subject(subset_name))
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
-        context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context
-
-        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('answer', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: The evaluation type. e.g. 'checkpoint' or 'service' or 'custom'.
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
-
-        input_choices: list = input_d['options']
-
-        example: str = 'Article:\n{}\nQuestion:\n{}'.format(input_d['article'], input_d['question'])
-        for j in range(len(self.choices)):
-            example += '\n{}. {}'.format(self.choices[j], input_choices[j])
-
-        example += '\nAnswer:'
-        if include_answer:
-            example += ' {}\n\n'.format(input_d['answer'])
+        if self.few_shot_num > 3:
+            logger.warning(f'few_shot_num <= 3 for RACE, but got {self.few_shot_num}. Use 3-shot by default.')
+            self.few_shot_num = 3

+    def record_to_sample(self, record) -> Sample:
+        # Format the article and question as context
+        context = f"Article:\n{record['article']}\nQuestion:\n{record['question']}"

-        return s
+        return Sample(
+            input=context,
+            choices=record['options'],
+            target=record['answer'],
+            metadata={'example_id': record.get('example_id', 'unknown')},
+        )
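In the RACE hunk, the multiple-choice path follows the same pattern but subclasses `MultiChoiceAdapter` and returns `Sample` objects that carry a `choices` list, while prompt construction (previously hand-built in `gen_prompt` and `_generate_prompt`) appears to be delegated to the base class and the shared `MultipleChoiceTemplate`. A hedged sketch with hypothetical dataset fields:

```python
# Sketch of the multiple-choice variant: choices live on the Sample, and the
# selected MultipleChoiceTemplate drives prompt rendering. Names marked
# "hypothetical" are not part of evalscope.
from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',  # hypothetical benchmark name
        pretty_name='MyMCQ',
        tags=[Tags.MULTIPLE_CHOICE],
        description='Illustrative multiple-choice benchmark.',
        dataset_id='my-org/my-mcq',  # hypothetical dataset id
        metric_list=['acc'],
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        # Question stem as input, option strings as choices, gold letter as target.
        return Sample(
            input=record['question'],  # hypothetical field names
            choices=record['options'],
            target=record['answer'],
        )
```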
evalscope/benchmarks/simple_qa/simple_qa_adapter.py

@@ -1,13 +1,15 @@
+import ast
 import re
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

-# flake8: noqa
 logger = get_logger()

 GRADER_TEMPLATE = """

@@ -76,7 +78,7 @@ Also note the following things:
 - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".


-Here is a new example. Simply reply with either CORRECT, INCORRECT,
+Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT_ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
 ```
 Question: {question}
 Gold target: {target}

@@ -92,76 +94,76 @@ Just return the letters "A", "B", or "C", with no text around it.
 """.strip()  # noqa: E501


+@register_benchmark(
+    BenchmarkMeta(
+        name='simple_qa',
+        pretty_name='SimpleQA',
+        tags=[Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'SimpleQA is a benchmark designed to evaluate the performance of language models on simple question-answering tasks. It includes a set of straightforward questions that require basic reasoning and understanding capabilities.',  # noqa: E501
+        dataset_id='AI-ModelScope/SimpleQA',
+        metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template='Answer the question:\n\n{question}'
+    )
+)
+class SimpleQAAdapter(DefaultDataAdapter):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        question = input_d['problem']
-        return self.gen_prompt_data(question)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
-        return result.strip()
-
-    def match(self, gold: str, pred: str) -> float:
-        # simple match
-        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
-        is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
-        is_incorrect = not is_correct
-        is_not_attempted = 0
-        return {
-            'is_correct': is_correct,
-            'is_incorrect': is_incorrect,
-            'is_not_attempted': is_not_attempted,
-        }
+        self._use_llm_judge = True  # Use LLM as a judge by default
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.

+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        question = record['problem']
+        answer = record['answer']
+        metadata = record.get('metadata')
+
+        return Sample(input=question, target=answer, metadata=ast.literal_eval(metadata))
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.input_text
+
+        # Request judge and obtain score
+        prompt = GRADER_TEMPLATE.format(question=question, target=reference, predicted_answer=filtered_prediction)
+        judge_response = self.llm_judge.judge(prompt)
         # parse grading response
+        match = re.search(r'(A|B|C)', judge_response)
         res = match.group(0) if match else 'C'
+
+        # Set score based on the match result
+        score.value = {
             'is_correct': 1 if res == 'A' else 0,
             'is_incorrect': 1 if res == 'B' else 0,
             'is_not_attempted': 1 if res == 'C' else 0,
-            'judge_response': grading_response,
         }
-
-        # zip dict answers
-        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
-
-        return super().compute_metric(res_dict, **kwargs)
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+        score.main_score_name = 'is_correct'
+        return score