evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98

@@ -1,16 +1,19 @@
+# flake8: noqa: E501
 import glob
 import os
 from collections import defaultdict
-from typing import Any, List
-
-from evalscope.…
-from evalscope.…
-from evalscope.…
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import DatasetDict, DictDataLoader, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.report import Report, ReportKey
 from evalscope.utils.logger import get_logger
 
-# flake8: noqa
-
 logger = get_logger()
 
 GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"." # noqa: E501

@@ -19,59 +22,77 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
 ) # noqa: E501
 
 
-@…
[old lines 23-49 removed; content not captured]
+@register_benchmark(
+    BenchmarkMeta(
+        name='general_arena',
+        pretty_name='GeneralArena',
+        tags=[Tags.CUSTOM, Tags.ARENA],
+        description=
+        'GeneralArena is a custom benchmark designed to evaluate the performance of large language models in a competitive setting, '
+        'where models are pitted against each other in custom tasks to determine their relative strengths and weaknesses. You should '
+        'provide the model outputs in the format of a list of dictionaries, where each dictionary contains the model name and its report path. '
+        'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/arena.html).',
+        dataset_id='general_arena',
+        metric_list=['winrate'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        system_prompt=GRADER_SYSTEM_PROMPT,
+        prompt_template=GRADER_TEMPLATE,
+        extra_params={
+            'models': [{
+                'name': 'qwen-plus',
+                'report_path': 'outputs/20250627_172550/reports/qwen-plus'
+            }, {
+                'name': 'qwen2.5-7b',
+                'report_path': 'outputs/20250627_172817/reports/qwen2.5-7b-instruct'
+            }],
+            'baseline':
+            'qwen2.5-7b'
+        }
+    )
+)
+class GeneralArenaAdapter(DefaultDataAdapter):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-
-        metric_registry.register(Metric(name='winrate', object=mean))
+        self._use_llm_judge = True
 
-
-        self.…
+        self.models = self.extra_params.get('models', [])
+        self.baseline = self.extra_params.get('baseline', None)
 
-
-
-        self.baseline = extra_params.get('baseline', None)
-
-    def load(self, **kwargs):
+    def load(self):
+        """Load dataset by processing model reports."""
         self._check_names()
         self._check_reports()
         self._check_datasets()
         logger.info(f'Overall datasets: {self.overall_datasets}')
         dataset_model_dict = self._load_common_datasets()
[old lines 70-74 removed; content not captured]
+        datasets = self._build_pair_wise_data(dataset_model_dict)
+
+        # Convert to DatasetDict format
+        dataset_dict = {}
+        for subset_name, samples in datasets.items():
+            dataset = DictDataLoader(
+                dict_list=samples, limit=self.limit, repeats=self.repeats, sample_fields=self.record_to_sample
+            ).load()
+            dataset_dict[subset_name] = dataset
+
+        test_dataset = DatasetDict(dataset_dict)
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['question'])],
+            target=record['answer_2'],  # baseline answer
+            metadata={
+                'answer_1': record['answer_1'],
+                'model_1': record['model_1'],
+                'model_2': record['model_2'],
+            }
+        )
 
     def _check_names(self):
         """Check the names of the models and baseline."""

@@ -119,7 +140,8 @@ class GeneralArenaAdapter(DataAdapter):
 
     def _load_common_datasets(self):
         """Load common datasets from the local path."""
-        from evalscope.utils import OutputsStructure
+        from evalscope.utils import OutputsStructure
+        from evalscope.utils.io_utils import jsonl_to_list
 
         dataset_dict = defaultdict(dict)
         for dataset_name, subset_name in self.overall_datasets:

@@ -128,7 +150,8 @@
             dataset_file_path = os.path.join(dataset_path, f'{dataset_name}_{subset_name}.jsonl')
             if not os.path.exists(dataset_file_path):
                 raise ValueError(
-                    f'Dataset {dataset_name} with subset {subset_name} not found in model {model["name"]}.'
+                    f'Dataset {dataset_name} with subset {subset_name} not found in model {model["name"]}.'
+                )
             dataset = jsonl_to_list(dataset_file_path)
             # sort by index
             dataset.sort(key=lambda x: x.get('index'))

@@ -138,9 +161,10 @@
 
     def _build_pair_wise_data(self, dataset_dict):
         """Build pairwise data for the models."""
+        from evalscope.api.evaluator import ReviewResult
         from .utils import process_review_item
 
-        pairwise_data = defaultdict(…
+        pairwise_data = defaultdict(list)
         for (dataset_name, subset_name), model_data in dataset_dict.items():
             if len(model_data) < 2:
                 logger.warning(f'Not enough models for dataset {dataset_name} with subset {subset_name}. Skipping.')

@@ -152,8 +176,13 @@
                 continue
             pairs = []
             for model_item, baseline_item in zip(model_data[name], model_data[self.baseline]):
+                # Convert to ReviewResult objects like in get_model_prediction
+                model_review = ReviewResult.model_validate(model_item)
+                baseline_review = ReviewResult.model_validate(baseline_item)
+
                 for model_choice, baseline_choice in zip(
-                    …
+                    process_review_item(model_review), process_review_item(baseline_review)
+                ):
                     pairs.append({
                         'question': model_choice['Question'],
                         'answer_1': model_choice['Generated'],

@@ -161,23 +190,26 @@
                         'model_1': name,
                         'model_2': self.baseline
                     })
-            pairwise_data[f'{dataset_name}&{subset_name}@{name}&{self.baseline}']…
+            pairwise_data[f'{dataset_name}&{subset_name}@{name}&{self.baseline}'] = pairs
 
         return pairwise_data
 
-    def …
+    def llm_match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Use LLM as a judge to evaluate the predicted answer against the baseline."""
         from .utils import get_judge_score, post_process_result
 
[old lines 171-180 removed; content not captured]
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.input_text
+        answer_1 = task_state.metadata['answer_1']
+        answer_2 = reference  # baseline answer
+        model_1 = task_state.metadata['model_1']
+        model_2 = task_state.metadata['model_2']
 
         system_template = self.system_prompt
         prompt_template = self.prompt_template

@@ -185,9 +217,11 @@
         prompt1 = prompt_template.format(question=question, answer_1=answer_1, answer_2=answer_2)
         # reverse the order
         prompt2 = prompt_template.format(question=question, answer_1=answer_2, answer_2=answer_1)
+
         # get grading response
-        game1_response = judge(prompt1, system_prompt=system_template)
-        game2_response = judge(prompt2, system_prompt=system_template)
+        game1_response = self.llm_judge.judge(prompt1, system_prompt=system_template)
+        game2_response = self.llm_judge.judge(prompt2, system_prompt=system_template)
+
         # parse grading response
         # game1
         res1 = post_process_result(game1_response)

@@ -195,9 +229,9 @@
         # game2
         res2 = post_process_result(game2_response)
         score2 = get_judge_score(res2, reverse=True)
-
-
-
+
+        battle_result = {
+            'score': (score1 + score2) / 2,
             'games': [
                 {
                     'model_a': model_1,

@@ -214,31 +248,38 @@
             ]
         }
 
-
-
-
-
+        score.value = {'score': battle_result['score']}
+        score.explanation = f'LLM judge battles: Game1: {game1_response[:100]}... Game2: {game2_response[:100]}...'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+            'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown',
+            'battle_result': battle_result
+        }
+        score.main_score_name = 'score'
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores to compute winrate."""
         import numpy as np
         import pandas as pd
 
         from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
 
-
-        review_res_list = [item for sublist in review_res_list for item in sublist]
-
-        battles = pd.concat([get_battles_from_row(res) for res in review_res_list])
+        battles = pd.concat([get_battles_from_row(res.score.metadata['battle_result']) for res in sample_scores])
 
         bt_model_coef = compute_mle_elo(battles, baseline_model=self.baseline)
 
         bootstrap_model_coef = get_bootstrap_result(
-            battles, func_compute_elo=compute_mle_elo, num_round=100, baseline_model=self.baseline
+            battles, func_compute_elo=compute_mle_elo, num_round=100, baseline_model=self.baseline
+        )
 
         stats = pd.DataFrame()
         stats['results'] = None
         stats['results'] = stats['results'].astype('object')
 
         for i, model in enumerate(bt_model_coef.index):
-            # assert model in bootstrap_elo_lu.columns
             stats.at[i, 'model'] = model
             stats.at[i, 'score'] = bt_model_coef[model]
             stats.at[i, 'lower'] = np.percentile(bootstrap_model_coef[model], 2.5)

@@ -249,20 +290,25 @@
         metrics_dict['winrate_lower'] = get_win_rate_column(stats, 'lower', self.baseline).to_dict()
         metrics_dict['winrate_upper'] = get_win_rate_column(stats, 'upper', self.baseline).to_dict()
 
-
+        agg_scores = []
         for metric_name, models in metrics_dict.items():
-            for model_name,…
+            for model_name, score_val in models.items():
                 if model_name == self.baseline:
                     continue
-
-
+                agg_scores.append(AggScore(score=score_val, metric_name=metric_name, num=len(sample_scores)))
+
+        return agg_scores
 
-    def …
+    def extract_answer(self, prediction, task_state):
+        # NOTE: This is a hacky way to extract the answer from the prediction
+        return task_state.metadata['answer_1']
+
+    def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
         """Post-process the report to convert it to a DataFrame with winrate leaderboards."""
         import pandas as pd
         import tabulate
 
-        report_path = …
+        report_path = output_dir
         leaderboard_file = os.path.join(report_path, 'leaderboard.txt')
 
         # Ensure report directory exists

@@ -288,7 +334,8 @@
         """Format DataFrame as leaderboard with CI."""
         # Pivot to get winrate, winrate_lower, winrate_upper as columns
         pivot_df = data_df.pivot_table(
-            index=[ReportKey.model_name], columns=ReportKey.metric_name, values=ReportKey.score, aggfunc='first'
+            index=[ReportKey.model_name], columns=ReportKey.metric_name, values=ReportKey.score, aggfunc='first'
+        )
 
         # Add baseline model with 50% winrate
         baseline_data = {'winrate': 0.5, 'winrate_lower': 0.5, 'winrate_upper': 0.5}

@@ -392,20 +439,11 @@
             subset_df = parsed_df[(parsed_df['dataset_name'] == dataset_name)
                                   & (parsed_df['subset_name'] == subset_name)]
             leaderboard_outputs.append(
-                format_leaderboard(subset_df, f'=== SUBSET LEADERBOARD: {dataset_name} - {subset_name} ===')
+                format_leaderboard(subset_df, f'=== SUBSET LEADERBOARD: {dataset_name} - {subset_name} ===')
+            )
 
         # Write all leaderboard outputs to file
         with open(leaderboard_file, 'w', encoding='utf-8') as f:
             f.write('\n'.join(leaderboard_outputs))
 
         logger.info(f'Leaderboard results saved to: {leaderboard_file}')
-
-    def get_gold_answer(self, input_d):
-        return f"model_1: {input_d['model_1']}\n---\n" + input_d['answer_1']
-
-    def llm_parse_pred_result(self, result, raw_input_d=None, eval_type=EvalType.CHECKPOINT):
-        return f"model_2: {raw_input_d['model_2']}\n---\n" + raw_input_d['answer_2']
-
-    def match(self, gold, pred):
-        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
-        return
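The two-game judging flow above delegates verdict parsing and scoring to `post_process_result` and `get_judge_score` in `general_arena/utils.py`, whose bodies are not shown in this extract. A self-contained sketch of the idea, using the verdict labels defined in `GRADER_SYSTEM_PROMPT`; the regex and the numeric mapping are illustrative, not the shipped implementation:

```python
import re
from typing import Optional

# Verdict labels from GRADER_SYSTEM_PROMPT: [[A>>B]], [[A>B]], [[A=B]], [[B>A]], [[B>>A]].
# The score mapping below is illustrative; the real get_judge_score may weight verdicts differently.
VERDICT_SCORES = {'A>>B': 1.0, 'A>B': 0.75, 'A=B': 0.5, 'B>A': 0.25, 'B>>A': 0.0}


def parse_verdict(judge_response: str) -> Optional[str]:
    """Extract the last [[...]] verdict emitted by the judge model."""
    matches = re.findall(r'\[\[(A>>B|A>B|A=B|B>A|B>>A)\]\]', judge_response)
    return matches[-1] if matches else None


def battle_score(game1_response: str, game2_response: str) -> float:
    """Average both games; game 2 swaps the answer order, so its verdict is mirrored (reverse=True)."""
    score1 = VERDICT_SCORES[parse_verdict(game1_response) or 'A=B']
    score2 = 1.0 - VERDICT_SCORES[parse_verdict(game2_response) or 'A=B']
    return (score1 + score2) / 2


# Model wins game 1 slightly and ties the order-reversed game 2:
print(battle_score('My final verdict is: [[A>B]]', 'My final verdict is tie: [[A=B]]'))  # 0.625
```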
evalscope/benchmarks/general_arena/utils.py +23 -27

@@ -7,44 +7,39 @@ from collections import defaultdict
 from sklearn.linear_model import LogisticRegression
 from tqdm import tqdm
 
+from evalscope.api.evaluator import ReviewResult
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
-def process_review_item(…
+def process_review_item(review_result: ReviewResult) -> list:
     """
-    Process a …
+    Process a ReviewResult object to extract relevant information.
 
     Args:
-        …
+        review_result: ReviewResult object or dict (for backward compatibility)
 
     Returns:
-        …
+        list: List of processed review items with necessary information.
     """
-    res = []
-    raw_input = review_item['raw_input']
-    sample_index = review_item['index']
-    question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
-    # Find the first non-empty question key in raw_input
-    question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
-    for choice_index, choice in enumerate(review_item['choices']):
-        raw_pred_answer = choice['message']['content']
-        parsed_gold_answer = choice['review']['gold']
-        parsed_pred_answer = choice['review']['pred']
-        score = choice['review']['result']
-        raw_d = {
-            'Index': f'{sample_index}_{choice_index}',
-            'Input': raw_input,
-            'Question': question if question else '*No Question*',
-            'Generated': raw_pred_answer,
-            'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
-            'Pred': parsed_pred_answer,
-            'Score': score,
-        }
-        res.append(raw_d)
 
-
+    # New format using ReviewResult
+    sample_score = review_result.sample_score
+    prediction = sample_score.score.prediction
+    target = review_result.target
+    extracted_prediction = sample_score.score.extracted_prediction
+
+    raw_d = {
+        'Index': str(review_result.index),
+        'Input': review_result.input,
+        'Question': review_result.input,  # Use input as question
+        'Generated': prediction if prediction != extracted_prediction else extracted_prediction,
+        'Gold': target,
+        'Pred': extracted_prediction,
+        'Score': sample_score.score.model_dump(exclude_none=True),
+    }
+    return [raw_d]
 
 
 def post_process_result(completion):

@@ -179,7 +174,8 @@ def compute_mle_elo(df, scale=400, base=10, init_rating=1000, baseline_model='gp
         return elo_scores.sort_values(ascending=False)
 
     lr = LogisticRegression(
-        fit_intercept=False, penalty=None, tol=1e-8
+        fit_intercept=False, penalty=None, tol=1e-8
+    )  # May need to set a small value when not use GPT4 as judge model
     lr.fit(X, Y)
 
     elo_scores = scale * lr.coef_[0] + init_rating
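`compute_mle_elo` fits Bradley-Terry coefficients with logistic regression and rescales them to Elo-style ratings via `scale * lr.coef_[0] + init_rating` (scale=400, base=10, init_rating=1000 per the hunk header). Under that parameterization, the head-to-head win probability implied by two ratings is the standard Elo formula. A small worked example; the ratings are made up and `get_win_rate_column` itself is not shown in this extract:

```python
def expected_winrate(rating: float, baseline_rating: float, scale: float = 400.0, base: float = 10.0) -> float:
    """Win probability implied by two Bradley-Terry/Elo ratings under scale=400, base=10."""
    return 1.0 / (1.0 + base**((baseline_rating - rating) / scale))


# Hypothetical ratings from compute_mle_elo, with 'qwen2.5-7b' as the baseline (init_rating=1000)
ratings = {'qwen-plus': 1080.0, 'qwen2.5-7b': 1000.0}

for model, rating in ratings.items():
    print(model, round(expected_winrate(rating, ratings['qwen2.5-7b']), 3))
# qwen-plus 0.613   -> reported as its winrate against the baseline
# qwen2.5-7b 0.5    -> the baseline itself stays at 0.5, matching the 50% baseline row in the adapter diff above
```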
evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101

@@ -2,118 +2,57 @@
 import os
 from collections import defaultdict
 
-from evalscope.…
-from evalscope.…
-from evalscope.…
-from evalscope.…
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.io_utils import csv_to_list, jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
 
 # flake8: noqa
 
 logger = get_logger()
 
 
-@…
[old lines 18-32 removed; content not captured]
-class GeneralMCQAdapter(…
+@register_benchmark(
+    BenchmarkMeta(
+        name='general_mcq',
+        pretty_name='General-MCQ',
+        description='A general multiple-choice question answering dataset for custom evaluation. '
+        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#mcq).',
+        tags=[Tags.MULTIPLE_CHOICE, Tags.CUSTOM],
+        dataset_id='general_mcq',
+        subset_list=['default'],
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split='dev',
+        eval_split='val',
+        prompt_template=MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE,
+    )
+)
+class GeneralMCQAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
 
-    def load_from_disk(self,…
[old lines 41-57 removed; content not captured]
-        """
-        Generate model prompt from raw input, unify the prompt format for C-Eval benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the C-Eval:
-
-            {'id': 0,
-            'question': '下列关于税法基本原则的表述中,不正确的是____。',
-            'A': '税收法定原则包括税收要件法定原则和税务合法性原则',
-            'B': '税收公平原则源于法律上的平等性原则',
-            'C': '税收效率原则包含经济效率和行政效率两个方面',
-            'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定',
-            'answer': 'D'}
-
-        Returns:
-            {'data': ['prompt ...']}
-        """
-
-        few_shot_prompts = [self._format_example(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        if len(few_shot_prompts) > 0:
-            context: str = '\n'.join(few_shot_prompts) + '\n'
-        else:
-            context = ''
-        context = context.strip() + self._format_example(input_d=input_d, include_answer=False)
-
-        full_prompt = self.prompt_template.format(query=context)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('answer', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d (dict): The raw input. Depending on the dataset.
-            eval_type: `checkpoint` or `service` or `custom`. Default is `checkpoint`.
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    def _format_example(self, input_d: dict, include_answer=True):
-        choices_str = '\n'.join([f'{choice}. {input_d[choice]}' for choice in self.choices if choice in input_d])
-
-        if include_answer:
-            return self.query_template.format(
-                question=input_d['question'], choices=choices_str, answer=input_d['answer'])
-        else:
-            return self.query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip()
+    def load_from_disk(self, **kwargs):
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        # Extract choices from the record (A, B, C, D, etc.)
+        choices = []
+        for choice_key in self.choices:
+            if choice_key in record:
+                choices.append(record[choice_key])
+            else:
+                break  # Stop when we reach a choice key that doesn't exist
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=record['answer'],
+            metadata={'id': record.get('id', 'unknown')},
+        )