evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
```diff
--- a/evalscope/benchmarks/cmmlu/cmmlu_adapter.py
+++ b/evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -1,35 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import …
-import …
-
-from evalscope.…
-from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy, exact_match
-from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser, normalize_score
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
 
 # flake8: noqa
 
 logger = get_logger()
 
-SUBSET_LIST = [
-    'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam',
-    'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
-    'chinese_teacher_qualification', 'college_actuarial_science', 'college_education', 'college_engineering_hydrology',
-    'college_law', 'college_mathematics', 'college_medical_statistics', 'clinical_knowledge', 'college_medicine',
-    'computer_science', 'computer_security', 'conceptual_physics', 'construction_project_management', 'economics',
-    'education', 'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology',
-    'electrical_engineering', 'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts',
-    'high_school_biology', 'high_school_chemistry', 'high_school_geography', 'high_school_mathematics',
-    'high_school_physics', 'high_school_politics', 'human_sexuality', 'international_law', 'journalism',
-    'jurisprudence', 'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing',
-    'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting', 'professional_law',
-    'professional_medicine', 'professional_psychology', 'public_relations', 'security_study', 'sociology',
-    'sports_science', 'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions'
-]
-
 SUBJECT_MAPPING = {
     'agronomy': ['other', 'Other'],
     'anatomy': ['biology', 'STEM'],
@@ -101,128 +82,41 @@ SUBJECT_MAPPING = {
 }
 
 
-@…
-…
+@register_benchmark(
+    BenchmarkMeta(
+        name='cmmlu',
+        pretty_name='C-MMLU',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.CHINESE],
+        description=
+        'C-MMLU is a benchmark designed to evaluate the performance of AI models on Chinese language tasks, including reading comprehension, text classification, and more.',
+        dataset_id='evalscope/cmmlu',
+        metric_list=['acc'],
+        subset_list=list(SUBJECT_MAPPING.keys()),
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+    )
 )
-class CMMLUAdapter(…
-…
-    choices = ['A', 'B', 'C', 'D']
+class CMMLUAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
+        self.reformat_subset = True
         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
 
-    def …
-        data_dict = {}
-        for subset_name in subset_list:
-            data_dict[subset_name] = {}
-            for split_name in [self.train_split, self.eval_split]:
-                file_path = os.path.join(work_dir, dataset_name_or_path, split_name, f'{subset_name}.csv')
-                if os.path.exists(file_path):
-                    with open(file_path, encoding='utf-8') as f:
-                        rows = []
-                        reader = csv.reader(f)
-                        for row in reader:
-                            if len(row) != 7:
-                                logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
-                                continue
-                            rows.append({
-                                'Question': row[1],
-                                'A': row[2],
-                                'B': row[3],
-                                'C': row[4],
-                                'D': row[5],
-                                'Answer': row[6],
-                            })
-
-                data_dict[subset_name].update({split_name: rows})
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw input, unify the prompt format for CMMLU benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the CMMLU:
-
-            {'Question': '下列关于重力的说法正确的是',
-            'A': '在地球周围的物体都要受到重力作用,与其运动状态无关',
-            'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变',
-            'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下',
-            'D': '在地球表面各处的重力方向都是相同的',
-            'Answer': 'A'}
-
-        Returns:
-            {'data': [(context, continuation), ...]}
-
-        """
-        prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
-        context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context
-
-        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
-
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('Answer', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: The evaluation type. 'checkpoint', 'service', 'custom'.
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if eval_type == EvalType.CHECKPOINT:
-            return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
-        else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    @classmethod
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
-
-        input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
-
-        example: str = input_d['Question']
-        for j in range(len(cls.choices)):
-            example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
-
-        example += '\nAnswer:'
-        if include_answer:
-            example += ' {}\n\n'.format(input_d['Answer'])
+    def record_to_sample(self, record) -> Sample:
 
-…
+        # choices: ["(A) 农业生产工具","(B) 土地","(C) 劳动力","(D) 资金"]
+        # remove the leading (A), (B), (C), (D)
+        raw_choices = record['choices']
+        choice_list = [choice[3:].strip() for choice in raw_choices]
 
-…
+        return Sample(
+            input=record['question'],
+            choices=choice_list,
+            target=record['answer'][1],  # answer is like "A"
+            subset_key=record['category'],
+            metadata={'subject': record['category']},
+        )
```
(Lines marked `…` were truncated by the extraction and are not recoverable.)
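Taken together with the two new adapters below, this rewrite shows the migration pattern of the 1.x `evalscope.api` surface: the old adapter classes with hand-rolled `load`/`gen_prompt`/`parse_pred_result`/`match` methods (built on the removed `evalscope/benchmarks/data_adapter.py` base) collapse into a declarative `BenchmarkMeta` registered through `@register_benchmark`, plus a single `record_to_sample` hook. A minimal sketch of that pattern, assuming only what this diff shows (the `my_mcq` name, dataset id, and record field names are hypothetical):

```python
from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',                # hypothetical benchmark name
        pretty_name='My-MCQ',
        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
        description='A minimal custom multiple-choice benchmark.',
        dataset_id='my-org/my-mcq',   # hypothetical dataset id
        metric_list=['acc'],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        # Map one raw dataset record to the framework's Sample type;
        # 'question', 'choices' and 'answer' are assumed field names.
        return Sample(
            input=record['question'],
            choices=record['choices'],
            target=record['answer'],
            metadata={},
        )
```

Judging from the removed methods above, prompt construction, answer parsing, and scoring are now inherited from `MultiChoiceAdapter` and driven by the `prompt_template`, rather than reimplemented per benchmark.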
evalscope/benchmarks/coin_flip/__init__.py: file without changes
```diff
--- /dev/null
+++ b/evalscope/benchmarks/coin_flip/coin_flip_adapter.py
@@ -0,0 +1,128 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentText
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+DESCRIPTION = (
+    "CoinFlip is a symbolic reasoning dataset that tests an LLM's ability "
+    'to track binary state changes through a sequence of actions. '
+    'Each example describes whether a coin is flipped or not by different person, '
+    'requiring logical inference to determine the final state (heads or tails).'
+)  # noqa: E501
+
+PROMPT_TEMPLATE = """
+Solve the following coin flip problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+{question}
+
+Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer YES or NO to the problem.
+
+Reasoning:
+"""  # noqa: E501
+
+FEWSHOT_TEMPLATE = """
+Here are some examples of how to solve similar problems:
+
+{fewshot}
+
+""".lstrip() + PROMPT_TEMPLATE  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='coin_flip',
+        pretty_name='CoinFlip',
+        tags=[Tags.REASONING, Tags.YES_NO],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/coin-flip',
+        metric_list=['accuracy', 'precision', 'recall', 'f1_score', 'yes_ratio'],
+        aggregation='f1',
+        few_shot_num=0,
+        train_split='validation',
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CoinFlipAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_overall_metric = False
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        answer = record['answer']
+        input_text = self.prompt_template.format(question=question)
+        content_list: List[Content] = [ContentText(text=input_text)]
+        answer = str(answer).upper()  # 'YES' or 'NO'
+        return Sample(input=[ChatMessageUser(content=content_list)], target=answer, metadata={
+            'answer': answer,
+        })
+
+    def extract_answer(self, prediction, task_state):
+        import re
+
+        match = re.search(r'ANSWER:\s*(.*)', prediction)
+        return match.group(1) if match else prediction
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        # Check for an exact match against the extracted answer.
+        result = 1 if reference in filtered_prediction else 0
+        score.value = {'acc': result}
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Custom aggregation to compute accuracy, precision, recall, f1_score, and yes_ratio.
+        """
+
+        tp = fp = tn = fn = 0
+        yes_count = 0
+        total_count = len(sample_scores)
+
+        for ss in sample_scores:
+            gt = ss.sample_metadata['answer'].strip().upper()
+            pred = ss.score.extracted_prediction.strip().upper()
+
+            if pred == 'YES':
+                yes_count += 1
+            if pred == 'YES' and gt == 'YES':
+                tp += 1
+            elif pred == 'YES' and gt == 'NO':
+                fp += 1
+            elif pred == 'NO' and gt == 'NO':
+                tn += 1
+            elif pred == 'NO' and gt == 'YES':
+                fn += 1
+
+        accuracy = (tp + tn) / total_count if total_count > 0 else 0.0
+        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+        yes_ratio = yes_count / total_count if total_count > 0 else 0.0
+
+        overall_metrics = {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1_score': f1_score,
+            'yes_ratio': yes_ratio
+        }
+
+        agg_scores = []
+        for metric_name, value in overall_metrics.items():
+            agg_scores.append(AggScore(metric_name=metric_name, score=value, num=len(sample_scores), metadata={}))
+
+        return agg_scores
```
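The custom `aggregate_scores` above is ordinary confusion-matrix arithmetic over YES/NO labels, with `yes_ratio` tracking how often the model answers YES regardless of correctness. A self-contained sanity check of the same formulas (plain Python, no evalscope imports):

```python
# Toy (prediction, gold) pairs standing in for extracted predictions and targets.
pairs = [('YES', 'YES'), ('YES', 'NO'), ('NO', 'NO'), ('NO', 'YES'), ('YES', 'YES')]

tp = sum(p == 'YES' and g == 'YES' for p, g in pairs)  # true positives  -> 2
fp = sum(p == 'YES' and g == 'NO' for p, g in pairs)   # false positives -> 1
tn = sum(p == 'NO' and g == 'NO' for p, g in pairs)    # true negatives  -> 1
fn = sum(p == 'NO' and g == 'YES' for p, g in pairs)   # false negatives -> 1

total = len(pairs)
accuracy = (tp + tn) / total                               # 0.6
precision = tp / (tp + fp)                                 # ~0.667
recall = tp / (tp + fn)                                    # ~0.667
f1_score = 2 * precision * recall / (precision + recall)   # ~0.667
yes_ratio = sum(p == 'YES' for p, _ in pairs) / total      # 0.6

print(accuracy, precision, recall, f1_score, yes_ratio)
```

Note that `match_score` uses a substring test (`reference in filtered_prediction`), so a prediction like `YES.` still scores as a per-sample hit, while the aggregation compares the stripped, upper-cased strings exactly.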
evalscope/benchmarks/commonsense_qa/__init__.py: file without changes
```diff
--- /dev/null
+++ b/evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py
@@ -0,0 +1,32 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+DESCRIPTION = 'CommonsenseQA requires different types of commonsense knowledge to predict the correct answers.'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='commonsense_qa',
+        pretty_name='CommonsenseQA',
+        tags=[Tags.REASONING, Tags.COMMONSENSE, Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/commonsense-qa',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='validation',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+    )
+)
+class CommonsenseQAAdapter(MultiChoiceAdapter):
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
```
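A benchmark registered this way is addressable by its `name`. Assuming the long-standing `TaskConfig`/`run_task` entry points keep their documented shape in 1.2.0 (this diff shows `evalscope/__init__.py` changing by only a few lines, so treat the exact signature as unverified), selecting the adapter above would look roughly like:

```python
from evalscope import TaskConfig, run_task

# Sketch only: the model id is a placeholder, and `limit` is assumed to
# cap the number of evaluated samples as in earlier releases.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    datasets=['commonsense_qa'],  # resolved through the register_benchmark registry
    limit=10,
)
run_task(task_cfg=task_cfg)
```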
```diff
--- a/evalscope/benchmarks/competition_math/competition_math_adapter.py
+++ b/evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -1,126 +1,78 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI, Inc. and its affiliates.
-
-import …
-
-
-from evalscope.…
-from evalscope.…
-from evalscope.…
-from evalscope.…
+
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-…
+PROMPT_TEMPLATE = """
+Problem:
+{question}
+
+Please reason step by step, and put your final answer within \\boxed{{}}.
+""".lstrip()
+
+FEWSHOT_TEMPLATE = """
+Here are some examples of how to solve similar problems:
+
+{fewshot}
+""".lstrip() + PROMPT_TEMPLATE
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='competition_math',
+        pretty_name='MATH',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
+        dataset_id='evalscope/competition_math',
+        subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        few_shot_num=4,
+        train_split='train',
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
 )
-class CompetitionMathAdapter(…
-    """ To be tested for all models. """
+class CompetitionMathAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
+        super().__init__(**kwargs)
 
-…
-        if few_shot_num != 4 and few_shot_num != 0:
-            logger.error(f'The MATH benchmark ONLY supports 4-shot by system or 0-shot settings, '
-                         f'but got {few_shot_num}. Use 4-shot by default.')
-            kwargs['few_shot_num'] = 4
+        self.reformat_subset = True
 
-…
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        from evalscope.metrics.math_parser import extract_answer
+
+        return Sample(
+            input=record['problem'],
+            target=extract_answer(record['solution']),
+            subset_key=record['level'],
+            metadata={
+                'reasoning': record.get('solution', ''),
+                'type': record.get('type', ''),
+            },
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        return f'Problem:\n{sample.input}\nSolution:\n{sample.target}'
+
+    def extract_answer(self, prediction: str, task_state):
+        from evalscope.metrics.math_parser import extract_answer
 
-…
-        data_dict: dict = {}
-        for subset_name in subset_list:
-            for split_name in [self.train_split, self.eval_split]:
-                if os.path.exists(dataset_name_or_path):
-                    split_dir = os.path.join(dataset_name_or_path, split_name)
-                else:
-                    split_dir = os.path.join(work_dir, dataset_name_or_path, split_name)
-                split_files = glob.glob(os.path.join(split_dir, '**', '*.json'))
-                split_data = []
-                for file_path in split_files:
-                    if os.path.exists(file_path):
-                        with open(file_path, 'r') as f:
-                            split_data.append(json.load(f))
-                if subset_name in data_dict:
-                    data_dict[subset_name].update({split_name: split_data})
-                else:
-                    data_dict[subset_name] = {split_name: split_data}
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate the prompt for the model input.
-
-        Args:
-            input_d: raw input dict.
-            {"problem": "How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?", "level": "Level 3", "type": "Algebra", "solution": "The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$. Therefore, the graph has $\\boxed{2}$ vertical asymptotes."}
-
-            few_shot_list: few shot list. Each item is a raw input dict.
-            **kwargs:
-
-        Returns:
-            {'data': [prompt]}
-        """
-        use_fewshot = self.few_shot_num > 0
-        full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)
-
-        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Extract the gold answer from the input dict.
-        return remove_boxed(last_boxed_only_string(input_d['solution']))
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d (dict): The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        # Note: Use same extraction method for both of checkpoint/service/custom
-        try:
-            result = remove_boxed(last_boxed_only_string(result))
-        except Exception:
-            return None
-        return result
-
-    def match(self, gold: str, pred: str) -> float:
-        res = 0
-        if is_equiv(pred, gold):
-            res = 1
-
-        return res
-
-    @classmethod
-    def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
-        problem: str = input_d['problem']
-
-        if use_fewshot:
-            # Use 4-shot examples by system
-            context = (
-                'Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'
-                'Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:\nWe have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'
-                'Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'
-                'Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'
-                f'Problem:\n{problem}\nSolution:\n')
-        else:
-            context = 'Problem:\n' + problem + '\nSolution:\n'
-        return context
+        return extract_answer(prediction)
```
(Lines marked `…` were truncated by the extraction and are not recoverable.)
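Both the removed path (`remove_boxed(last_boxed_only_string(...))`) and its replacement (`extract_answer` from the new `evalscope/metrics/math_parser.py`, +545 lines in this release) center on pulling the final `\boxed{...}` span out of a solution string. A minimal standalone illustration of that idea, not the actual `math_parser` implementation, which presumably also handles many normalization cases:

```python
from typing import Optional


def last_boxed_content(text: str) -> Optional[str]:
    """Return the contents of the last \\boxed{...} in `text`, honoring nested braces."""
    start = text.rfind('\\boxed{')
    if start == -1:
        return None
    i = start + len('\\boxed{')
    depth = 1
    chars = []
    while i < len(text):
        ch = text[i]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return ''.join(chars)
        chars.append(ch)
        i += 1
    return None  # unbalanced braces


solution = ('The denominator factors into $(x-2)(x+3)$, '
            'so the graph has $\\boxed{2}$ vertical asymptotes.')
print(last_boxed_content(solution))  # -> 2
```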
evalscope/benchmarks/data_collection/__init__.py: file without changes