evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
@@ -7,44 +7,39 @@ from collections import defaultdict
 from sklearn.linear_model import LogisticRegression
 from tqdm import tqdm

+from evalscope.api.evaluator import ReviewResult
 from evalscope.utils.logger import get_logger

 logger = get_logger()


-def process_review_item(
+def process_review_item(review_result: ReviewResult) -> list:
     """
-    Process a
+    Process a ReviewResult object to extract relevant information.

     Args:
+        review_result: ReviewResult object or dict (for backward compatibility)

     Returns:
+        list: List of processed review items with necessary information.
     """
-    res = []
-    raw_input = review_item['raw_input']
-    sample_index = review_item['index']
-    question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
-    # Find the first non-empty question key in raw_input
-    question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
-    for choice_index, choice in enumerate(review_item['choices']):
-        raw_pred_answer = choice['message']['content']
-        parsed_gold_answer = choice['review']['gold']
-        parsed_pred_answer = choice['review']['pred']
-        score = choice['review']['result']
-        raw_d = {
-            'Index': f'{sample_index}_{choice_index}',
-            'Input': raw_input,
-            'Question': question if question else '*No Question*',
-            'Generated': raw_pred_answer,
-            'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
-            'Pred': parsed_pred_answer,
-            'Score': score,
-        }
-        res.append(raw_d)

+    # New format using ReviewResult
+    sample_score = review_result.sample_score
+    prediction = sample_score.score.prediction
+    target = review_result.target
+    extracted_prediction = sample_score.score.extracted_prediction
+
+    raw_d = {
+        'Index': str(review_result.index),
+        'Input': review_result.input,
+        'Question': review_result.input,  # Use input as question
+        'Generated': prediction if prediction != extracted_prediction else extracted_prediction,
+        'Gold': target,
+        'Pred': extracted_prediction,
+        'Score': sample_score.score.model_dump(exclude_none=True),
+    }
+    return [raw_d]


 def post_process_result(completion):
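The reworked process_review_item above reads its fields from a ReviewResult object instead of a raw review dict. Below is a minimal self-contained sketch of that flow; the dataclasses are simplified stand-ins (not evalscope's actual ReviewResult / Score classes) exposing only the fields the function touches, and the sample values are invented.

    from dataclasses import dataclass, field

    @dataclass
    class StubScore:
        # Stand-in for a Score object; field names follow the hunk above.
        prediction: str
        extracted_prediction: str
        value: dict = field(default_factory=dict)

        def model_dump(self, exclude_none: bool = True) -> dict:
            return {'prediction': self.prediction,
                    'extracted_prediction': self.extracted_prediction,
                    'value': self.value}

    @dataclass
    class StubSampleScore:
        score: StubScore

    @dataclass
    class StubReviewResult:
        index: int
        input: str
        target: str
        sample_score: StubSampleScore

    review = StubReviewResult(
        index=3,
        input='What is 2 + 2?',
        target='4',
        sample_score=StubSampleScore(StubScore(prediction='The answer is 4.',
                                               extracted_prediction='4',
                                               value={'acc': 1.0})),
    )

    # Same row layout the new function returns for the dashboard table.
    row = {
        'Index': str(review.index),
        'Input': review.input,
        'Question': review.input,  # input doubles as the question
        'Generated': review.sample_score.score.prediction,
        'Gold': review.target,
        'Pred': review.sample_score.score.extracted_prediction,
        'Score': review.sample_score.score.model_dump(exclude_none=True),
    }
    print(row)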
@@ -179,7 +174,8 @@ def compute_mle_elo(df, scale=400, base=10, init_rating=1000, baseline_model='gp
         return elo_scores.sort_values(ascending=False)

     lr = LogisticRegression(
-        fit_intercept=False, penalty=None, tol=1e-8
+        fit_intercept=False, penalty=None, tol=1e-8
+    )  # May need to set a small value when not use GPT4 as judge model
     lr.fit(X, Y)

     elo_scores = scale * lr.coef_[0] + init_rating
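For context, compute_mle_elo fits a Bradley-Terry model with logistic regression and maps the coefficients onto an Elo scale, as the touched lines show. The following is a rough standalone sketch of that idea; the battle data, model names, and design-matrix construction are illustrative assumptions, not evalscope's actual implementation.

    import math

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # Hypothetical pairwise battles: (first_model, second_model, 1 if the first model won else 0).
    battles = [
        ('model_a', 'model_b', 1),
        ('model_a', 'model_b', 0),   # an upset, keeps the data non-separable
        ('model_a', 'model_c', 1),
        ('model_b', 'model_c', 1),
        ('model_b', 'model_c', 0),
        ('model_c', 'model_a', 0),
    ]
    models = sorted({m for a, b, _ in battles for m in (a, b)})
    idx = {m: i for i, m in enumerate(models)}

    scale, base, init_rating = 400, 10, 1000

    # One row per battle: +log(base) for the first model, -log(base) for the second.
    X = np.zeros((len(battles), len(models)))
    Y = np.zeros(len(battles))
    for row, (a, b, a_won) in enumerate(battles):
        X[row, idx[a]] = math.log(base)
        X[row, idx[b]] = -math.log(base)
        Y[row] = a_won

    # Same estimator settings as in the hunk; penalty=None needs scikit-learn >= 1.2.
    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
    lr.fit(X, Y)

    # Rescale the logistic coefficients onto an Elo-style scale, as above.
    elo_scores = scale * lr.coef_[0] + init_rating
    print(dict(zip(models, np.round(elo_scores, 1))))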
@@ -2,118 +2,57 @@
 import os
 from collections import defaultdict

-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.io_utils import csv_to_list, jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate

 # flake8: noqa

 logger = get_logger()


-@
-class GeneralMCQAdapter(
+@register_benchmark(
+    BenchmarkMeta(
+        name='general_mcq',
+        pretty_name='General-MCQ',
+        description='A general multiple-choice question answering dataset for custom evaluation. '
+        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#mcq).',
+        tags=[Tags.MULTIPLE_CHOICE, Tags.CUSTOM],
+        dataset_id='general_mcq',
+        subset_list=['default'],
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split='dev',
+        eval_split='val',
+        prompt_template=MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE,
+    )
+)
+class GeneralMCQAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

-    def load_from_disk(self,
-        """
-        Generate model prompt from raw input, unify the prompt format for C-Eval benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the C-Eval:
-
-            {'id': 0,
-            'question': '下列关于税法基本原则的表述中，不正确的是____。',
-            'A': '税收法定原则包括税收要件法定原则和税务合法性原则',
-            'B': '税收公平原则源于法律上的平等性原则',
-            'C': '税收效率原则包含经济效率和行政效率两个方面',
-            'D': '税务机关按法定程序依法征税，可以自由做出减征、停征或免征税款的决定',
-            'answer': 'D'}
-
-        Returns:
-            {'data': ['prompt ...']}
-        """
-
-        few_shot_prompts = [self._format_example(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        if len(few_shot_prompts) > 0:
-            context: str = '\n'.join(few_shot_prompts) + '\n'
-        else:
-            context = ''
-        context = context.strip() + self._format_example(input_d=input_d, include_answer=False)
-
-        full_prompt = self.prompt_template.format(query=context)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('answer', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d (dict): The raw input. Depending on the dataset.
-            eval_type: `checkpoint` or `service` or `custom`. Default is `checkpoint`.
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    def _format_example(self, input_d: dict, include_answer=True):
-        choices_str = '\n'.join([f'{choice}. {input_d[choice]}' for choice in self.choices if choice in input_d])
-
-        if include_answer:
-            return self.query_template.format(
-                question=input_d['question'], choices=choices_str, answer=input_d['answer'])
-        else:
-            return self.query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip()
+    def load_from_disk(self, **kwargs):
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        # Extract choices from the record (A, B, C, D, etc.)
+        choices = []
+        for choice_key in self.choices:
+            if choice_key in record:
+                choices.append(record[choice_key])
+            else:
+                break  # Stop when we reach a choice key that doesn't exist
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=record['answer'],
+            metadata={'id': record.get('id', 'unknown')},
+        )
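A short usage sketch of the new record_to_sample contract shown above, assuming evalscope >= 1.0 is installed. The record mirrors the general_mcq CSV/JSONL column layout; the question text is only an example.

    from evalscope.api.dataset import Sample  # import path as shown in the diff

    record = {
        'id': 0,
        'question': 'Which planet is known as the Red Planet?',
        'A': 'Venus',
        'B': 'Mars',
        'C': 'Jupiter',
        'answer': 'B',
    }

    # Collect contiguous option columns, the same rule the adapter applies.
    choice_keys = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    choices = []
    for key in choice_keys:
        if key not in record:
            break
        choices.append(record[key])

    sample = Sample(
        input=record['question'],
        choices=choices,
        target=record['answer'],
        metadata={'id': record.get('id', 'unknown')},
    )
    print(choices, sample.target)  # ['Venus', 'Mars', 'Jupiter'] B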
@@ -1,155 +1,94 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import
-from
-from evalscope.
-from evalscope.
-from evalscope.
+from typing import Any, Dict, List, Optional, Union
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageSystem, ChatMessageUser, dict_to_chat_message
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

 logger = get_logger()

+PROMPT_TEMPLATE = '请回答问题\n{question}'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='general_qa',
+        pretty_name='General-QA',
+        description='A general question answering dataset for custom evaluation. '
+        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).',  # noqa: E501
+        tags=[Tags.QA, Tags.CUSTOM],
+        dataset_id='general_qa',
+        metric_list=['BLEU', 'Rouge'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+    )
 )
-class GeneralQAAdapter(
+class GeneralQAAdapter(DefaultDataAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def
-        Load dataset from the given path or dataset name.
-        Args:
-            dataset_name_or_path (str): Path to dataset directory or file.
-            subset_list (list): List of subset names to load.
-        Returns:
-            dict: Loaded dataset organized by subset.
-        """
-        dataset_name_or_path = dataset_name_or_path or self.dataset_id
-        subset_list = subset_list or self.subset_list
-        data_file_dict = defaultdict(str)
-        data_item_dict = defaultdict(list)
-        # get data file path and subset name
-        if os.path.isdir(dataset_name_or_path):
-            for subset_name in subset_list:
-                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-        elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-            data_file_dict[cur_subset_name] = dataset_name_or_path
-        else:
-            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-        # load data from local disk
-        try:
-            for subset_name, file_path in data_file_dict.items():
-                data_item_dict[subset_name] = jsonl_to_list(file_path)
-        except Exception as e:
-            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-        return data_dict
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate prompt for the model based on input data.
-        Args:
-            input_d (dict): Input data dictionary.
-            subset_name (str): Name of the subset.
-            few_shot_list (list): List of few-shot examples.
-        Returns:
-            dict: Dictionary containing the generated prompt.
-        """
-        messages = input_d.get('messages')
-        query = input_d.get('question', '') or input_d.get('query', '')
-        system_prompt = input_d.get('system')
-        prompt = self.prompt_template.format(query=query)
-        return self.gen_prompt_data(prompt, system_prompt=system_prompt, messages=messages)
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Extract the gold (reference) answer from the input data.
-        Args:
-            input_d (dict): Input data dictionary.
-        Returns:
-            str: Gold answer string.
-        """
-        return input_d.get('answer') or input_d.get('response')
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the prediction result.
-        Args:
-            result (str): Model prediction result.
-            raw_input_d (dict, optional): Original input data.
-            eval_type (str): Evaluation type.
+    def load_from_disk(self, **kwargs):
+        return super().load_from_disk(use_local_loader=True)

-            str: Parsed prediction result.
-        """
-        return result
-    def match(self, gold: str, pred: str) -> dict:
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
+        Convert a data record to a Sample object.

         Args:
-            pred (str): Predicted answer.
+            record (Dict[str, Any]): Input data record.

         Returns:
-            
+            Sample: Sample object with input, target, and metadata.
         """
-        if
-        from evalscope.metrics import bleu_ngram_one_sample
+        query = record.get('question') or record.get('query')
+        answer = record.get('answer') or record.get('response')
+        system_prompt = record.get('system')
+        messages = record.get('messages')
+
+        message_list = []
+        if messages:
+            message_list = [dict_to_chat_message(m) for m in messages]
+        else:
+            if system_prompt:
+                message_list.append(ChatMessageSystem(content=system_prompt))
+            message_list.append(ChatMessageUser(content=query))

-            res.update(bleu_dict)
-        return res
+        return Sample(input=message_list, target=answer or '')

-    def
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
         """
-        Args:
-            review_res_list (list): List of metric score dictionaries.
-        Returns:
-            list: List of dictionaries with averaged metric results.
+        Calculate evaluation scores by comparing prediction with reference.
         """
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Calculate scores for each configured metric
+        for metric in self.metric_list:
+            try:
+                if metric == 'Rouge':
+                    from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+
+                    score.value.update(compute_rouge_score_one_sample_zh([filtered_prediction], [reference]))
+                elif metric == 'BLEU':
+                    from evalscope.metrics import bleu_ngram_one_sample
+
+                    score.value.update(bleu_ngram_one_sample(filtered_prediction, reference))
+            except Exception as e:
+                logger.error(f'Error calculating metric {metric}: {e}')
+                return None
+
+        score.main_score_name = 'Rouge-L-R'
+        return score
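The match_score method above merges per-sample metric values from the two helpers it imports. Below is a minimal sketch of that scoring step on its own, assuming evalscope >= 1.0 and its BLEU/ROUGE dependencies are installed; the prediction and reference strings are made up.

    from evalscope.metrics import bleu_ngram_one_sample
    from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh

    prediction = '杭州是浙江省的省会。'
    reference = '浙江省的省会是杭州。'

    # Both helpers return dicts of metric values, which match_score merges into Score.value.
    values = {}
    values.update(bleu_ngram_one_sample(prediction, reference))
    values.update(compute_rouge_score_one_sample_zh([prediction], [reference]))

    # 'Rouge-L-R' is the main score name the adapter reports above.
    print(values.get('Rouge-L-R'))
    print(values)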
@@ -1,63 +1,70 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import random
 import re
-from evalscope.
-from evalscope.
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import FEW_SHOT_TEMPLATE, MultipleChoiceTemplate
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='gpqa_diamond',
+        pretty_name='GPQA-Diamond',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'GPQA is a dataset for evaluating the reasoning ability of large language models (LLMs) on complex mathematical problems. It contains questions that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
+        dataset_id='AI-ModelScope/gpqa_diamond',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',  # only have train split
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class GPQAAdapter(
+class GPQAAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        self.
+        if self.few_shot_num > 0 and self.few_shot_num != 5:
+            logger.warning(
+                f'Only support few_shot_num 0 or 5 for {self.dataset_id}, but got {self.few_shot_num}. Use 5-shot by default.'  # noqa: E501
+            )
+            self.few_shot_num = 5
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        # Process the input to create shuffled choices and correct answer
+        processed_data = self._process_input(record)
+
+        return Sample(
+            input=record['Question'],
+            choices=processed_data['choices'],
+            target=processed_data['answer'],
+            subset_key=record.get('subset', ''),
+            metadata={
+                'correct_answer':
+                record['Correct Answer'],
+                'incorrect_answers':
+                [record['Incorrect Answer 1'], record['Incorrect Answer 2'], record['Incorrect Answer 3']],
+            },
+        )
+
+    def format_fewshot_template(self, fewshot, sample):
+        from .prompt import FEW_SHOT_SAMPLES
+
+        return FEW_SHOT_TEMPLATE.format(fewshot=FEW_SHOT_SAMPLES, ) + self.format_prompt_template(sample)
+
+    def _process_input(self, input_d: dict) -> dict:
+        """Process input to shuffle choices and determine correct answer letter."""

         def preprocess(text):
             if text is None:
@@ -77,53 +84,7 @@ class GPQAAdapter(DataAdapter):
         random.shuffle(choices)
         correct_answer_index = choices.index(preprocess(input_d['Correct Answer']))

-            'choices':
+        return {
+            'choices': choices,
             'answer': f'{chr(65 + correct_answer_index)}',
         }
-        return out_doc
-
-    def __form_options(self, options: list):
-        option_str = 'Choices:\n'
-        for opt, choice in zip(options, self.choices):
-            option_str += f'({choice}) {opt}' + '\n'
-        return option_str
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return GPQAAdapter.get_multiple_choice_answer(result)
-
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-        """
-        return exact_match(gold=gold, pred=pred)
-
-    @staticmethod
-    def get_multiple_choice_answer(pred: str):
-        tmp = re.findall(r'\b(A|B|C|D)\b', pred.upper())
-        if tmp:
-            pred = tmp
-        else:
-            pred = [pred.strip().strip('.')]
-
-        if len(pred) == 0:
-            pred = ''
-        else:
-            pred = pred[-1]
-
-        # Remove the period at the end, again!
-        pred = pred.rstrip('.').rstrip('/')
-
-        return pred