evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/metrics/math_accuracy.py → evalscope/benchmarks/aime/math_normalize.py (renamed)

@@ -1,69 +1,53 @@
-
+"""
+This logic is largely copied from the Hendrycks' MATH release (math_equivalence).


-
-
-    if str1 is None and str2 is None:
-        print('WARNING: Both None')
-        return True
-    if str1 is None or str2 is None:
-        return False
+This file is adapted from OpenAI's PRM800K repository:
+https://github.com/openai/prm800k/blob/main/prm800k/grading/math_normalize.py

-
-
-
-
-
-
-
-
-
-
-
-    if '\\boxed ' in s:
-        left = '\\boxed '
-        assert s[:len(left)] == left
-        return s[len(left):]
-
-    left = '\\boxed{'
-
-    assert s[:len(left)] == left
-    assert s[-1] == '}'
-
-    return s[len(left):-1]
-
-
-def last_boxed_only_string(string):
-    idx = string.rfind('\\boxed')
-    if '\\boxed ' in string:
-        return '\\boxed ' + string.split('\\boxed ')[-1].split('$')[0]
-    if idx < 0:
-        idx = string.rfind('\\fbox')
-        if idx < 0:
-            return None
-
-    i = idx
-    right_brace_idx = None
-    num_left_braces_open = 0
-    while i < len(string):
-        if string[i] == '{':
-            num_left_braces_open += 1
-        if string[i] == '}':
-            num_left_braces_open -= 1
-            if num_left_braces_open == 0:
-                right_brace_idx = i
-                break
-        i += 1
-
-    if right_brace_idx is None:
-        retval = None
-    else:
-        retval = string[idx:right_brace_idx + 1]
+Original License:
+MIT License
+
+Copyright (c) 2023 OpenAI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:

-
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+# flake8: noqa
+import re
+from typing import Optional
+
+
+def normalize_answer(answer: Optional[str]) -> Optional[str]:
+    if answer is None:
+        return None
+    answer = answer.strip()
+    try:
+        # Remove enclosing `\text{}`.
+        m = re.search('^\\\\text\{(?P<text>.+?)\}$', answer)
+        if m is not None:
+            answer = m.group('text').strip()
+        return _strip_string(answer)
+    except:
+        return answer


-def
+def _fix_fracs(string):
     substrs = string.split('\\frac')
     new_str = substrs[0]
     if len(substrs) > 1:
@@ -75,7 +59,7 @@ def fix_fracs(string):
             else:
                 try:
                     assert len(substr) >= 2
-                except
+                except:
                     return string
                 a = substr[0]
                 b = substr[1]
@@ -95,7 +79,7 @@ def fix_fracs(string):
     return string


-def
+def _fix_a_slash_b(string):
     if len(string.split('/')) != 2:
         return string
     a = string.split('/')[0]
@@ -106,11 +90,11 @@ def fix_a_slash_b(string):
         assert string == '{}/{}'.format(a, b)
         new_string = '\\frac{' + str(a) + '}{' + str(b) + '}'
         return new_string
-    except
+    except:
         return string


-def
+def _remove_right_units(string):
     # "\\text{ " only ever occurs (at least in the val set) when describing units
     if '\\text{ ' in string:
         splits = string.split('\\text{ ')
@@ -120,7 +104,7 @@ def remove_right_units(string):
     return string


-def
+def _fix_sqrt(string):
     if '\\sqrt' not in string:
         return string
     splits = string.split('\\sqrt')
@@ -135,23 +119,28 @@ def fix_sqrt(string):
     return new_string


-def
+def _strip_string(string):
     # linebreaks
     string = string.replace('\n', '')
+    # print(string)

     # remove inverse spaces
     string = string.replace('\\!', '')
+    # print(string)

     # replace \\ with \
     string = string.replace('\\\\', '\\')
+    # print(string)

     # replace tfrac and dfrac with frac
     string = string.replace('tfrac', 'frac')
     string = string.replace('dfrac', 'frac')
+    # print(string)

     # remove \left and \right
     string = string.replace('\\left', '')
     string = string.replace('\\right', '')
+    # print(string)

     # Remove circ (degrees)
     string = string.replace('^{\\circ}', '')
@@ -161,11 +150,11 @@ def strip_string(string):
     string = string.replace('\\$', '')

     # remove units (on the right)
-    string =
+    string = _remove_right_units(string)

     # remove percentage
     string = string.replace('\\%', '')
-    string = string.replace('\%', '')
+    string = string.replace('\%', '')

     # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
     string = string.replace(' .', ' 0.')
@@ -182,19 +171,19 @@ def strip_string(string):
         string = string.split('=')[1]

     # fix sqrt3 --> sqrt{3}
-    string =
+    string = _fix_sqrt(string)

     # remove spaces
     string = string.replace(' ', '')

-    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
-    string =
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+    string = _fix_fracs(string)

     # manually change 0.5 --> \frac{1}{2}
     if string == '0.5':
         string = '\\frac{1}{2}'

     # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
-    string =
+    string = _fix_a_slash_b(string)

     return string

File without changes
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py (new file)

@@ -0,0 +1,133 @@
+import re
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+GRADER_SYSTEM_PROMPT = """You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers."""  # noqa: E501
+
+GRADER_TEMPLATE = """
+I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
+
+## Instruction
+
+{{
+    "instruction": "{instruction}"
+}}
+
+## Model Outputs
+
+Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
+
+{{
+    {{
+        "model_identifier": "m",
+        "output": "{output_1}"
+    }},
+    {{
+        "model_identifier": "M",
+        "output": "{output_2}"
+    }}
+}}
+
+## Task
+
+Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
+
+## Best Model Identifier
+""".strip()  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='alpaca_eval',
+        pretty_name='AlpacaEval2.0',
+        tags=[Tags.INSTRUCTION_FOLLOWING, Tags.ARENA],
+        description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
+        'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
+        'provide more accurate and cost-effective model assessments. '
+        'Currently not support `length-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-turbo`.',  # noqa: E501
+        dataset_id='AI-ModelScope/alpaca_eval',
+        subset_list=['alpaca_eval_gpt4_baseline'],
+        metric_list=['winrate'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='eval',
+        prompt_template='{question}'
+    )
+)
+class AlpacaEvalAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True  # Use LLM as a judge by default
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        instruction = record['instruction']
+        baseline_output = record['output']  # baseline model output
+
+        return Sample(
+            input=instruction,
+            target=baseline_output,
+            metadata={
+                'generator': record.get('generator', 'unknown'),
+                'dataset': record.get('dataset', 'unknown')
+            }
+        )
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        instruction = task_state.input_text
+
+        # Request judge and obtain score
+        # reference is baseline answer 'm', filtered_prediction is model answer 'M'
+        prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=reference, output_2=filtered_prediction)
+        judge_response = self.llm_judge.judge(prompt, system_prompt=GRADER_SYSTEM_PROMPT)
+
+        # parse grading response
+        match = re.search(r'(m|M)', judge_response)
+        res = match.group(0) if match else None
+
+        if res:
+            winrate = 1 if res == 'M' else 0
+        else:
+            logger.info(f'Failed to parse grading response: {prompt=}\n {judge_response=}')
+            winrate = 0
+
+        # Set score based on the match result
+        score.value = {'winrate': winrate}
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+        score.main_score_name = 'winrate'
+        return score

File without changes
evalscope/benchmarks/amc/amc_adapter.py (new file)

@@ -0,0 +1,51 @@
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='amc',
+        pretty_name='AMC',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'AMC (American Mathematics Competitions) is a series of mathematics competitions for high school students.',
+        dataset_id='evalscope/amc_22-24',
+        subset_list=['amc22', 'amc23', 'amc24'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
+)
+class AMCAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Use split as subset
+        self.split_as_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['problem'],
+            target=record['answer'],
+            metadata={
+                'year': record['year'],
+                'url': record['url'],
+                'solution': record.get('solution', '')
+            },
+        )
+
+    def extract_answer(self, prediction: str, task_state):
+        from evalscope.metrics.math_parser import extract_answer
+
+        return extract_answer(prediction)
evalscope/benchmarks/arc/arc_adapter.py

@@ -1,161 +1,46 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-import
-import
-
-from evalscope.
-from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy, exact_match
-from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-
-# flake8: noqa
+from evalscope.utils.multi_choices import MultipleChoiceTemplate

 logger = get_logger()


-@
-
-
-
-
-
-
-
-
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='arc',
+        pretty_name='ARC',
+        tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+        description=
+        'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.',  # noqa: E501
+        dataset_id='allenai/ai2_arc',
+        subset_list=['ARC-Easy', 'ARC-Challenge'],
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split='train',
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+    )
 )
-class ARCAdapter(
-
-    choices = ['A', 'B', 'C', 'D']
+class ARCAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
-        few_shot_num = kwargs.get('few_shot_num', None)
-        if few_shot_num is None:
-            # Use 0-shot by default
-            logger.info(f'Set 0-shot examples by system for ARC.')
-            few_shot_num = 0
-
-        if few_shot_num != 0:
-            logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')
-
         super().__init__(**kwargs)

-    def
-
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            subset_path = os.path.join(work_dir, dataset_name_or_path, subset_name)
-            for split_name in ['Train', 'Test']:
-                split_path = os.path.join(subset_path, f'{subset_name}-{split_name}.jsonl')
-                if os.path.exists(split_path):
-                    with open(split_path, 'r', errors='ignore') as in_f:
-                        rows = []
-                        for line in in_f:
-                            item = json.loads(line.strip())
-                            raw_choices = item['question']['choices']
-                            rows.append({
-                                'id': item['id'],
-                                'question': item['question']['stem'],
-                                'choices': {
-                                    'text': [d['text'] for d in raw_choices],
-                                    'label': [d['label'] for d in raw_choices]
-                                },
-                                'answerKey': item['answerKey'],
-                            })
-
-                    if subset_name in data_dict:
-                        data_dict[subset_name].update({split_name.lower(): rows})
-                    else:
-                        data_dict[subset_name] = {split_name.lower(): rows}
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw data, unify the prompt format for ARC benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the ARC:
-
-            {
-                'id': 'Mercury_7220990',
-                'question': 'Which factor will most likely cause a person to develop a fever?',
-                'choices':
-                {
-                    'text':['a leg muscle relaxing after exercise',
-                    'a bacterial population in the bloodstream',
-                    'several viral particles on the skin',
-                    'carbohydrates being digested in the stomach'],
-                    'label': ['A', 'B', 'C', 'D']
-                },
-                'answerKey': 'B'
-            }
-
-        Returns:
-            {'data': ['xxx'], 'multi_choices': ['A', 'B', 'C', 'D']}
-        """
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context: str = '\n'.join(few_shot_prompts)
-
-        # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
-        full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)
-
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('answerKey', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d (dict): The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if eval_type == EvalType.CHECKPOINT:
-            return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
-        else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    @classmethod
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
-
-        example: str = input_d['question']
-
-        choices_texts: list = input_d['choices']['text']
-        choices_labels: list = input_d['choices']['label']
-        choices_prompts: str = '\n'.join([label + '. ' + text for text, label in zip(choices_texts, choices_labels)])
-        example += '\n' + choices_prompts
-
-        example += '\nAnswer:'
-        if include_answer:
-            example += ' {}\n\n'.format(input_d['answerKey'])
-
-        return example
+    def record_to_sample(self, record) -> Sample:
+        # Convert choice labels to indices (A->0, B->1, etc.)
+        choice_texts = record['choices']['text']
+        answer_key = record['answerKey']
+
+        return Sample(
+            input=record['question'],
+            choices=choice_texts,
+            target=answer_key,
+            metadata={
+                'id': record.get('id', ''),
+            },
+        )

File without changes
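
Note on the new adapter API: the adapter diffs above all follow the same registration pattern introduced in 1.x, where a BenchmarkMeta is passed to @register_benchmark and a record_to_sample method maps raw dataset records to Sample objects. Below is a minimal sketch of a custom multiple-choice adapter written against that pattern, for orientation only; the benchmark name, dataset_id, and record field names are hypothetical and not part of this release.

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',                        # hypothetical benchmark name
        pretty_name='MyMCQ',
        tags=[Tags.MULTIPLE_CHOICE],
        description='Example multiple-choice benchmark registered via the 1.x API.',
        dataset_id='org/my_mcq_dataset',      # hypothetical dataset id
        subset_list=['default'],
        metric_list=['acc'],
        few_shot_num=0,
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw record to the unified Sample structure;
        # 'question', 'choices', and 'answer' are placeholder field names.
        return Sample(
            input=record['question'],
            choices=record['choices'],
            target=record['answer'],
        )

The structure mirrors the ARC adapter above; DefaultDataAdapter can be subclassed instead of MultiChoiceAdapter for free-form tasks, as in the AMC and AlpacaEval adapters.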