evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/arena_hard/arena_hard_adapter.py
@@ -0,0 +1,149 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+GRADER_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\".""" # noqa: E501
+
+GRADER_TEMPLATE = """<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>""".strip(
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='arena_hard',
+        pretty_name='ArenaHard',
+        tags=[Tags.INSTRUCTION_FOLLOWING, Tags.ARENA],
+        description=
+        'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
+        'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
+        'It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. '
+        'Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.',
+        dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
+        metric_list=['winrate'],
+        aggregation='elo',
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template='{question}'
+    )
+)
+class ArenaHardAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True  # Use LLM as a judge by default
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        question = record['question']
+        baseline_prediction = record['prediction']  # baseline model prediction
+
+        return Sample(
+            input=question, target=baseline_prediction, metadata={'capability': record.get('capability', 'unknown')}
+        )
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        from .utils import get_judge_score, post_process_arenahard
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.input_text
+
+        # reference is baseline answer 'A', filtered_prediction is model answer 'B'
+        prompt1 = GRADER_TEMPLATE.format(question=question, answer_1=reference, answer_2=filtered_prediction)
+        # reverse the order
+        prompt2 = GRADER_TEMPLATE.format(question=question, answer_1=filtered_prediction, answer_2=reference)
+
+        # get grading response
+        game1_response = self.llm_judge.judge(prompt1, system_prompt=GRADER_SYSTEM_PROMPT)
+        game2_response = self.llm_judge.judge(prompt2, system_prompt=GRADER_SYSTEM_PROMPT)
+
+        # parse grading response
+        res1 = post_process_arenahard(game1_response)
+        res2 = post_process_arenahard(game2_response)
+
+        score1 = get_judge_score(res1, reverse=True)
+        score2 = get_judge_score(res2, reverse=False)
+
+        battle_result = {
+            'model_a':
+            'gpt4-0314',
+            'model_b':
+            'test_model',
+            'games': [
+                {
+                    'user_prompt': prompt1,
+                    'judgment': game1_response,
+                    'score': res1
+                },
+                {
+                    'user_prompt': prompt2,
+                    'judgment': game2_response,
+                    'score': res2
+                },
+            ]
+        }
+
+        # Set score based on the battle result
+        score.value = {'score': (score1 + score2) / 2}
+        score.explanation = f'LLM judge battles: Game1: {game1_response[:100]}... Game2: {game2_response[:100]}...'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id,
+            'battle_result': battle_result
+        }
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        import pandas as pd
+
+        from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
+
+        battles = pd.concat([get_battles_from_row(res.score.metadata['battle_result']) for res in sample_scores])
+
+        bootstrap_online_elo = compute_mle_elo(battles)
+
+        stats = pd.DataFrame()
+        stats['results'] = None
+        stats['results'] = stats['results'].astype('object')
+
+        for i, model in enumerate(bootstrap_online_elo.index):
+            # assert model in bootstrap_elo_lu.columns
+            stats.at[i, 'model'] = model
+            stats.at[i, 'score'] = bootstrap_online_elo[model]
+
+        score = get_win_rate_column(stats, 'score', 'gpt4-0314').at['test_model']
+
+        return [AggScore(
+            score=score,
+            metric_name='winrate',
+            num=len(sample_scores),
+        )]
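
For illustration only (not part of the diff): given the helpers defined in arena_hard/utils.py (next hunk), a single sample's two judged games reduce to one scalar roughly as sketched below. The verdict strings are invented, and the absolute import path is inferred from the file list above.

from evalscope.benchmarks.arena_hard.utils import get_judge_score, post_process_arenahard

# Hypothetical judge outputs for the two orderings (baseline seated as A first, then swapped).
game1 = 'My final verdict is that Assistant B is slightly better: [[B>A]]'
game2 = 'My final verdict is that Assistant A is slightly better: [[A>B]]'

res1 = post_process_arenahard(game1)  # 'B>A'
res2 = post_process_arenahard(game2)  # 'A>B'

# Game 1 seats the baseline as A, so its score is reversed; game 2 is taken as-is.
score1 = get_judge_score(res1, reverse=True)   # 0.75
score2 = get_judge_score(res2, reverse=False)  # 0.75
print((score1 + score2) / 2)  # 0.75, i.e. what the adapter stores in score.value['score']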
evalscope/benchmarks/arena_hard/utils.py
@@ -0,0 +1,186 @@
+import math
+import numpy as np
+import pandas as pd
+import re
+from collections import defaultdict
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def post_process_arenahard(completion):
+    result = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
+    if result:
+        return result[0]
+    else:
+        return None
+
+
+def get_judge_score(result, reverse=False):
+    """
+    Calculate the judge score, considering confidence weight.
+
+    Args:
+        result: Judgment result ('A=B', 'A>B', 'A>>B', 'B>A', 'B>>A')
+        reverse: Whether to reverse the score
+
+    Returns:
+        float: Weighted score
+    """
+
+    # Base score mapping - using finer-grained scores
+    if not reverse:
+        score_mapping = {
+            'A=B': 0.5,  # Tie
+            'A>B': 0.75,  # A slightly wins
+            'A>>B': 1.0,  # A significantly wins
+            'B>A': 0.25,  # B slightly wins
+            'B>>A': 0.0,  # B significantly wins
+        }
+    else:
+        score_mapping = {
+            'A=B': 0.5,  # Tie
+            'A>B': 0.25,  # A slightly wins
+            'A>>B': 0.0,  # A significantly wins
+            'B>A': 0.75,  # B slightly wins
+            'B>>A': 1.0,  # B significantly wins
+        }
+
+    base_score = score_mapping.get(result, 0.5)
+
+    return base_score
+
+
+def get_battles_from_row(row, first_game_only=False, multiplier=3):
+    results = []
+    output = {'model_a': row['model_a'], 'model_b': row['model_b']}
+
+    game = row['games'][0]
+    weight = 1
+    if game['score'] == 'A=B':
+        output['winner'] = 'tie'
+    elif game['score'] == 'A>B':
+        output['winner'] = 'model_a'
+    elif game['score'] == 'A>>B':
+        output['winner'] = 'model_a'
+        weight = multiplier
+    elif game['score'] == 'B>A':
+        output['winner'] = 'model_b'
+    elif game['score'] == 'B>>A':
+        output['winner'] = 'model_b'
+        weight = multiplier
+    else:
+        weight = 0
+
+    if weight:
+        results += [output] * weight
+
+    if first_game_only:
+        return pd.DataFrame(results)
+
+    # game 2
+    output = {'model_a': row['model_a'], 'model_b': row['model_b']}
+
+    game = row['games'][1]
+
+    weight = 1
+    if game['score'] == 'A=B':
+        output['winner'] = 'tie'
+    elif game['score'] == 'A>B':
+        output['winner'] = 'model_b'
+    elif game['score'] == 'A>>B':
+        output['winner'] = 'model_b'
+        weight = multiplier
+    elif game['score'] == 'B>A':
+        output['winner'] = 'model_a'
+    elif game['score'] == 'B>>A':
+        output['winner'] = 'model_a'
+        weight = multiplier
+    else:
+        weight = 0
+
+    if weight:
+        results += [output] * weight
+
+    return pd.DataFrame(results)
+
+
+def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
+    models = pd.concat([df['model_a'], df['model_b']]).unique()
+    models = pd.Series(np.arange(len(models)), index=models)
+
+    # duplicate battles
+    df = pd.concat([df, df], ignore_index=True)
+    p = len(models.index)
+    n = df.shape[0]
+
+    X = np.zeros([n, p])
+    X[np.arange(n), models[df['model_a']]] = +math.log(BASE)
+    X[np.arange(n), models[df['model_b']]] = -math.log(BASE)
+
+    # one A win => two A win
+    Y = np.zeros(n)
+    Y[df['winner'] == 'model_a'] = 1.0
+
+    # one tie => one A win + one B win
+    # find tie + tie (both bad) index
+    tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
+    tie_idx[len(tie_idx) // 2:] = False
+    Y[tie_idx] = 1.0
+
+    if len(np.unique(Y)) < 2:
+        logger.info('Warning: Only one class in the data')
+        elo_scores = pd.Series(INIT_RATING, index=models.index)
+        if np.all(Y == 1.0):
+            elo_scores[df['model_a'].iloc[0]] += SCALE  # Boost the winning model
+        elif np.all(Y == 0.0):
+            elo_scores[df['model_b'].iloc[0]] += SCALE  # Boost the winning model
+        return elo_scores.sort_values(ascending=False)
+
+    lr = LogisticRegression(
+        fit_intercept=False, penalty=None, tol=1e-8
+    )  # May need to set a small value when not use GPT4 as judge model
+    lr.fit(X, Y)
+
+    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+
+    # set anchor as gpt4-0314 = 1000
+    if 'gpt4-0314' in models.index:
+        elo_scores += 1000 - elo_scores[models['gpt4-0314']]
+    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
+
+
+def get_bootstrap_result(battles, func_compute_elo, num_round):
+    rows = []
+    for _ in tqdm(range(num_round), desc='bootstrap'):
+        res = func_compute_elo(battles.sample(frac=1.0, replace=True))
+        if res is not None:
+            rows.append(res)
+    df = pd.DataFrame(rows)
+    return df[df.median().sort_values(ascending=False).index]
+
+
+def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
+    names = sorted(list(elo_ratings.keys()))
+    wins = defaultdict(lambda: defaultdict(lambda: 0))
+    for a in names:
+        for b in names:
+            ea = 1 / (1 + BASE**((elo_ratings[b] - elo_ratings[a]) / SCALE))
+            wins[a][b] = ea
+            wins[b][a] = 1 - ea
+
+    data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}
+
+    df = pd.DataFrame(data, index=names)
+    df.index.name = 'model_a'
+    df.columns.name = 'model_b'
+    return df.T
+
+
+def get_win_rate_column(df, column, baseline='gpt4-0314'):
+    to_dict = df[['model', column]].set_index('model').to_dict()[column]
+    win_rate_table = predict_win_rate(to_dict)
+    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x, 4))
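
A small worked example (illustrative only; the ratings are made up and the import path is inferred from the file list) of the Bradley-Terry win-rate formula these helpers implement, P(a beats b) = 1 / (1 + BASE**((R_b - R_a) / SCALE)):

import pandas as pd

from evalscope.benchmarks.arena_hard.utils import get_win_rate_column

# Hypothetical Elo scores, with the gpt4-0314 baseline anchored at 1000 as compute_mle_elo does.
stats = pd.DataFrame({'model': ['gpt4-0314', 'test_model'], 'score': [1000.0, 1100.0]})

# Expected win rate of test_model over the baseline:
# 1 / (1 + 10 ** ((1000 - 1100) / 400)) ~= 0.6401
winrate = get_win_rate_column(stats, 'score', baseline='gpt4-0314').at['test_model']
print(winrate)  # 0.6401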
evalscope/benchmarks/bbh/bbh_adapter.py
@@ -1,19 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-import json
 import os
-import random
 import re
+from typing import Any, Dict

-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

-# flake8: noqa
-
 logger = get_logger()

 # BBH multiple choice subset list
@@ -57,185 +54,148 @@ FREE_FORM_LIST = [
 TASK_TYPE = 'task_type'
 SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST

-
-
-
-
-
-
-
-
-
-
-
+PROMPT_TEMPLATE = """
+Q: {question}
+A: Let's think step by step. Put your final answer in the format of "So the answer is $ANSWER" (without quotes and markdown) where $ANSWER is the answer to the problem.
+""".lstrip()  # noqa: E501
+
+FEWSHOT_TEMPLATE = """
+{fewshot}
+
+""".lstrip() + PROMPT_TEMPLATE
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='bbh',
+        pretty_name='BBH',
+        dataset_id='evalscope/bbh',
+        tags=[Tags.REASONING],
+        description=
+        'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.',  # noqa: E501
+        subset_list=SUBSET_LIST,
+        few_shot_num=3,
+        train_split=None,
+        eval_split='test',
+        metric_list=['acc'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
 )
-class BBHAdapter(
+class BBHAdapter(DefaultDataAdapter):
     """
     Adapter for BBH free-form and multiple-choices sub-tasks.
     """

     def __init__(self, **kwargs):
-
         few_shot_num = kwargs.get('few_shot_num', 3)

         if few_shot_num != 3 and few_shot_num != 0:
-            logger.error(
-
+            logger.error(
+                f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
+                f'Use 3-shot by default.'
+            )
             kwargs['few_shot_num'] = 3

         super().__init__(**kwargs)

-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        {'data': ['xxx']}
-        """
-        # few_shot_list: should be ['xxxx']
-        cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
-        full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."
-
-        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
-
-    def gen_prompts(self, data_dict: dict) -> dict:
-        """
-        Generate dataset prompts from raw input, unify the prompt format for different datasets.
-
-        Args:
-            data_dict: Refer to the output of load method: evalscope.benchmarks.benchmark.Benchmark.load
-
-        Returns:
-            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
-            prompt_d_i (dict): refer to the output of gen_prompt method.
-
-        e.g. train -- few-shot data, test -- target dataset to evaluate.
-        """
-        res_dict: dict = {}
-
-        if self.few_shot_num < 0:
-            raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
-
-        logger.info(f'Use default settings: '
-                    f'> few_shot_num: {self.few_shot_num}, '
-                    f'> few_shot_split: {self.train_split}, '
-                    f'> target_eval_split: {self.eval_split}')
-
-        for sub_name, sub_data_dict in data_dict.items():
-            few_shot_data = []
-            if self.few_shot_num > 0:
-                with open(os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r') as f:
-                    cot_prompt_str = f.read()
-                few_shot_data = [cot_prompt_str]
-
-            res_dict[sub_name] = []
-            for sample_d in sub_data_dict[self.eval_split]:
-                prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=few_shot_data)
-                sample_d_new = sample_d.copy()
-                if sub_name in MULTIPLE_CHOICE_LIST:
-                    sample_d_new[TASK_TYPE] = MULTIPLE_CHOICE
-                elif sub_name in FREE_FORM_LIST:
-                    sample_d_new[TASK_TYPE] = FREE_FORM
-                else:
-                    raise ValueError(f'Invalid subset name: {sub_name}')
-
-                prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
-                res_dict[sub_name].append(prompt_d)
-
-        rnd = random.Random()
-        rnd.seed(42)
-        for k, v in res_dict.items():
-            rnd.shuffle(v)
-
-        return res_dict
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        gold = input_d.get('target')
-        if gold is None:
-            logger.error(f'BBHAdapter: gold is None.')
-        return gold
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d (dict): The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        # Note: to use same extraction method for both of checkpoint/service/custom.
-        task_type: str = raw_input_d.get(TASK_TYPE)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        input = record['input']
+        target = record['target'].replace('(', '').replace(')', '').strip()  # Clean up the target answer
+
+        # Determine task type based on subset name
+        task_type = None
+        subset_name = self.current_subset_name
+        if subset_name in MULTIPLE_CHOICE_LIST:
+            task_type = MULTIPLE_CHOICE
+        elif subset_name in FREE_FORM_LIST:
+            task_type = FREE_FORM
+
+        metadata = {TASK_TYPE: task_type}
+
+        return Sample(input=input, target=target, metadata=metadata, subset_key=subset_name)
+
+    def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+        # Load CoT prompts from file for BBH
+        subset_name = sample.subset_key
+        if subset_name:
+            cot_file_path = os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{subset_name}.txt')
+            if os.path.exists(cot_file_path):
+                with open(cot_file_path, 'r', encoding='utf-8') as f:
+                    fewshot = f.read().strip()
+        return self.few_shot_prompt_template.format(
+            fewshot=fewshot,
+            question=sample.input,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
+        task_type = task_state.metadata.get(TASK_TYPE)

         if task_type == MULTIPLE_CHOICE:
-            return self._extract_mc_answer(
+            return self._extract_mc_answer(prediction)
         elif task_type == FREE_FORM:
-            return self._extract_ff_answer(
+            return self._extract_ff_answer(prediction)
         else:
-
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
+            return prediction.strip()

     @classmethod
     def _extract_mc_answer(cls, ans: str) -> str:
         """
-        Extract
+        Extract normalized answer for BBH multiple-choice tasks.
+        Handles formats like:
+        - "answer is (A)"
+        - "The answer is A."
+        - Extra text after answer.
+        Always uses the *last* occurrence of "answer is".
         """
-
-
-
-
+        ans = ans.strip()
+
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()
+
+        # Remove trailing period
+        if ans.endswith('.'):
+            ans = ans[:-1].strip()
+
+        # Capture uppercase letter inside parentheses (A) (B) ...
+        match = re.search(r'\(([A-Z])\)', ans)
         if match:
             return match.group(1)
-
+
+        # Capture single uppercase letter
+        match = re.search(r'\b([A-Z])\b', ans)
         if match:
             return match.group(1)
+
         return ans

     @classmethod
     def _extract_ff_answer(cls, ans: str):
         """
-        Extract the answer
+        Extract the normalized answer for BBH free-form tasks.
+        Handles patterns like:
+        - "answer is XXX."
+        - "The answer is **valid**."
+        - Extra trailing dots / line breaks.
+        - Bold-marked answers (**xxx**).
+        Always uses the *last* occurrence of "answer is".
         """
-
-
-
-
-
-
-
-
+        ans = ans.strip()
+
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()
+
+        # Remove trailing period
         if ans.endswith('.'):
-            ans = ans[:-1]
+            ans = ans[:-1].strip()
+
+        # If answer is in bold (**xxx**), prefer the content inside
+        match = re.search(r'\*\*(.*?)\*\*', ans)
+        if match:
+            ans = match.group(1).strip()
+
         return ans
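
For illustration (not part of the diff), the rewritten extraction helpers behave as follows on typical chain-of-thought outputs; the example strings are invented and the import path is inferred from the file list above.

from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter

# Multiple-choice style output: the letter inside parentheses after "So the answer is" is kept.
mc_output = "Let's think step by step. The second option matches. So the answer is (B)."
# Free-form style output: trailing period is dropped and bold markers are stripped.
ff_output = "Let's think step by step. The statement holds. So the answer is **valid**."

print(BBHAdapter._extract_mc_answer(mc_output))  # 'B'
print(BBHAdapter._extract_ff_answer(ff_output))  # 'valid'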