evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/utils/logger.py
CHANGED

@@ -1,37 +1,74 @@
+import colorlog
 import importlib.util as iutil
 import logging
 import os
-from typing import Optional
+from logging import Logger
+from typing import List, Optional

 init_loggers = {}
+# Define log formats
+data_format = '%Y-%m-%d %H:%M:%S'
+# For console output
+color_detailed_format = '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(log_color)s%(levelname)s%(reset)s: %(message)s'  # noqa:E501
+color_simple_format = '%(asctime)s - %(name)s - %(log_color)s%(levelname)s%(reset)s: %(message)s'
+color_detailed_formatter = colorlog.ColoredFormatter(color_detailed_format, datefmt=data_format)
+color_simple_formatter = colorlog.ColoredFormatter(color_simple_format, datefmt=data_format)
+# For file output
+detailed_format = '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(levelname)s: %(message)s'  # noqa:E501
+simple_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
+plain_detailed_formatter = logging.Formatter(detailed_format, datefmt=data_format)
+plain_simple_formatter = logging.Formatter(simple_format, datefmt=data_format)
+
+DEFAULT_LEVEL = logging.DEBUG if os.getenv('EVALSCOPE_LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
+
+logging.basicConfig(format=simple_format, level=logging.INFO, force=True)
+
+# set logging level
+logging.getLogger('datasets').setLevel(logging.WARNING)
+logging.getLogger('httpx').setLevel(logging.WARNING)
+logging.getLogger('modelscope').setLevel(logging.ERROR)

-
-
+info_set = set()
+warning_set = set()

-detailed_formatter = logging.Formatter(detailed_format)
-simple_formatter = logging.Formatter(simple_format)
-DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO

-
+def info_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in info_set:
+        return
+    info_set.add(hash_id)
+    self.info(msg)

-
-
-
-
+
+def warning_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in warning_set:
+        return
+    warning_set.add(hash_id)
+    self.warning(msg)


-def get_logger(
+def get_logger(
+    log_file: Optional[str] = None,
+    name: Optional[str] = None,
+    log_level: int = DEFAULT_LEVEL,
+    file_mode: str = 'w',
+    force: bool = False,
+):
     """Get logging logger

     Args:
-        log_file: Log filename
-
-        log_level: Logging level.
-        file_mode:
-
+        log_file: Log filename. If specified, a file handler will be added to the logger.
+        name: Logical component name. Used to derive the logger name.
+        log_level: Logging level to set.
+        file_mode: Mode to open the file when log_file is provided (default 'w').
+        force: If True, reconfigure the existing logger (levels, formatters, handlers).
     """

-
+    if name:
+        logger_name = f"evalscope.{name.split('.')[-1]}"
+    else:
+        logger_name = 'evalscope'
     logger = logging.getLogger(logger_name)
     logger.propagate = False

@@ -40,7 +77,16 @@ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, f
     logger.setLevel(log_level)
     for handler in logger.handlers:
         handler.setLevel(log_level)
-
+        # Select formatter by handler type
+        if isinstance(handler, logging.FileHandler):
+            handler.setFormatter(
+                plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter
+            )
+        else:
+            handler.setFormatter(
+                color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter
+            )
+    # Ensure file handler points to current log_file (replace if needed)
     add_file_handler_if_needed(logger, log_file, file_mode, log_level)
     return logger

@@ -62,11 +108,15 @@ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, f
     handlers = [stream_handler]

     if is_worker0 and log_file is not None:
-        file_handler = logging.FileHandler(log_file, file_mode)
+        file_handler = logging.FileHandler(log_file, file_mode, encoding='utf-8')
         handlers.append(file_handler)

     for handler in handlers:
-
+        # Use the matching formatter for each handler type
+        if isinstance(handler, logging.FileHandler):
+            handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
+        else:
+            handler.setFormatter(color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter)
         handler.setLevel(log_level)
         logger.addHandler(handler)

@@ -88,20 +138,60 @@ def configure_logging(debug: bool, log_file: Optional[str] = None):
     get_logger(log_level=logging.DEBUG, force=True)


-def add_file_handler_if_needed(
-
-
-
+def add_file_handler_if_needed(
+    logger: logging.Logger,
+    log_file: Optional[str],
+    file_mode: str,
+    log_level: int,
+) -> None:
+    """Ensure logger has a FileHandler targeting log_file.
+    - If no FileHandler exists, add one.
+    - If a FileHandler exists but points to a different file, replace it.
+    """
+    if log_file is None:
+        return

+    # Only worker-0 writes files
     if iutil.find_spec('torch') is not None:
         from modelscope.utils.torch_utils import is_master
-
         is_worker0 = is_master()
     else:
         is_worker0 = True

-    if
-
-
-
-
+    if not is_worker0:
+        return
+
+    target_path = os.path.abspath(log_file)
+    existing_file_handlers = [h for h in logger.handlers if isinstance(h, logging.FileHandler)]
+
+    # If there is a FileHandler already pointing to the target file, nothing to do.
+    for fh in existing_file_handlers:
+        try:
+            if os.path.abspath(getattr(fh, 'baseFilename', '')) == target_path:
+                return
+        except Exception:
+            # If any issue retrieving baseFilename, fall through to replacement
+            pass
+
+    # Replace all existing FileHandlers with the new one
+    for fh in existing_file_handlers:
+        try:
+            logger.removeHandler(fh)
+            fh.flush()
+            fh.close()
+        except Exception:
+            pass
+
+    file_handler = logging.FileHandler(target_path, file_mode, encoding='utf-8')
+    file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
+    file_handler.setLevel(log_level)
+    logger.addHandler(file_handler)
+
+
+def warn_once(logger: Logger, message: str) -> None:
+    if message not in _warned:
+        logger.warning(message)
+        _warned.append(message)
+
+
+_warned: List[str] = []
evalscope/utils/model_utils.py
CHANGED

@@ -1,5 +1,9 @@
+import numpy as np
+import random
 from enum import Enum
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+
+from evalscope.utils.import_utils import check_import

 if TYPE_CHECKING:
     from transformers import GenerationConfig
@@ -22,3 +26,55 @@ def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
     generation_config.temperature = 1.
     generation_config.top_p = 1.
     generation_config.top_k = 50
+
+
+def get_device() -> str:
+    from transformers.utils import is_torch_cuda_available, is_torch_mps_available, is_torch_npu_available
+
+    if is_torch_npu_available():
+        device = 'npu'
+    elif is_torch_mps_available():
+        device = 'mps'
+    elif is_torch_cuda_available():
+        device = 'cuda'
+    else:
+        device = 'cpu'
+
+    return device
+
+
+def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
+    """
+    Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
+    converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
+    string, which can then be stored in the json format.
+
+    Refer to: https://github.com/huggingface/transformers/pull/16065/files for details.
+    """
+    if d.get('torch_dtype', None) is not None and not isinstance(d['torch_dtype'], str):
+        d['torch_dtype'] = str(d['torch_dtype']).split('.')[1]
+
+    for value in d.values():
+        if isinstance(value, dict):
+            dict_torch_dtype_to_str(value)
+
+    return d
+
+
+def seed_everything(seed: int):
+    """Set all random seeds to a fixed value for reproducibility.
+
+    Args:
+        seed (int): The seed value.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+
+    if check_import('torch', raise_warning=False):
+        import torch
+
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
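A short sketch exercising the new helpers, assuming numpy, transformers and torch are installed (get_device imports its availability checks from transformers.utils); the config dict is invented:

import torch

from evalscope.utils.model_utils import dict_torch_dtype_to_str, get_device, seed_everything

seed_everything(42)   # seeds random, numpy and, when importable, torch (incl. CUDA)
print(get_device())   # one of 'npu', 'mps', 'cuda', 'cpu'

cfg = {'torch_dtype': torch.float32, 'sub': {'torch_dtype': torch.bfloat16}}
print(dict_torch_dtype_to_str(cfg))
# {'torch_dtype': 'float32', 'sub': {'torch_dtype': 'bfloat16'}}  -- now JSON-serializable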
evalscope/utils/multi_choices.py
ADDED

@@ -0,0 +1,303 @@
+# flake8: noqa: E501
+import re
+from typing import List, Optional, Union
+
+from evalscope.api.evaluator import Choices, Target, TaskState
+
+FEW_SHOT_TEMPLATE = r"""Here are some examples of how to answer similar questions:
+
+{fewshot}
+
+""".lstrip()
+
+CHINESE_FEW_SHOT_TEMPLATE = r"""以下是一些示例问题:
+
+{fewshot}
+
+""".lstrip()
+
+CHINESE_SINGLE_ANSWER_TEMPLATE = r"""回答下面的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 {letters} 中的一个。
+
+问题:{question}
+选项:
+{choices}
+""".lstrip()
+
+CHINESE_SINGLE_ANSWER_TEMPLATE_COT = r"""回答下面的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 {letters} 中的一个。请在回答前进行一步步思考。
+
+问题:{question}
+选项:
+{choices}
+""".lstrip()
+
+SINGLE_ANSWER_TEMPLATE = r"""
+Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+{question}
+
+{choices}
+""".strip()
+
+SINGLE_ANSWER_TEMPLATE_COT = r"""
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
+
+{question}
+
+{choices}
+""".strip()
+
+MULTIPLE_ANSWER_TEMPLATE = r"""
+Answer the following multiple choice question where multiple answers may be correct. The entire content of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}.
+
+{question}
+
+{choices}
+""".strip()
+
+MULTIPLE_ANSWER_TEMPLATE_COT = r"""
+Answer the following multiple choice question where multiple answers may be correct. The last line of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}. Think step by step before answering.
+
+{question}
+
+{choices}
+""".strip()
+
+
+def unshuffle_choices(choices: Choices) -> Choices:
+    # `sorted` returns `list[Choice]`, but for consistency we wrap this back
+    # into a `Choices` object
+    return Choices(sorted(choices, key=lambda choice: choice.original_position))
+
+
+def answer_options(choices: Choices) -> str:
+    r"""
+    Returns the `choices` formatted as a multiple choice question, e.g.:
+
+    ["choice 1", "choice 2", "choice 3"] ->
+    "A) choice 1\nB) choice 2\nC) choice 3"
+    """
+    indexes = list(range(len(choices)))
+
+    return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])
+
+
+def format_letter_choices(choices: Union[Choices, List[str]]) -> str:
+    """
+    Returns the `choices` formatted as a letter list, e.g.:
+
+    ["choice 1", "choice 2", "choice 3"] ->
+    "A,B,C"
+    """
+    if isinstance(choices, list):
+        choices = Choices(choices)
+
+    indexes = list(range(len(choices)))
+
+    return ','.join([f'{answer_character(i)}' for i in indexes])
+
+
+def prompt(question: str, choices: Union[Choices, List[str]], template: str, fewshot: Optional[str] = None) -> str:
+    if isinstance(choices, list):
+        choices = Choices(choices)
+
+    choices_text = answer_options(choices)
+    letters = format_letter_choices(choices)
+    if not fewshot:
+        return template.format(
+            choices=choices_text,
+            letters=letters,
+            question=question,
+        )
+    else:
+        return template.format(
+            choices=choices_text,
+            letters=letters,
+            question=question,
+            fewshot=fewshot,
+        )
+
+
+def format_example(
+    question: str,
+    choices: Choices,
+    answer: Target,
+) -> str:
+    """Format a single example for few-shot learning.
+
+    Args:
+        question (str): The question text.
+        choices (list[str]): The list of choices.
+        answer (list[str]): The correct answers.
+
+    Returns:
+        str: Formatted example string.
+    """
+    choices_text = answer_options(choices)
+    return f'{question}\n{choices_text}\nANSWER: {answer.text}'
+
+
+def _fallback_parse_answer(completion: str) -> Optional[set[str]]:
+    # Fallback to find the last upper case letter
+    for letter in reversed(completion):
+        if letter.isupper():
+            return {letter}
+    return None
+
+
+def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
+    """
+    Convenience function for extracting answers from the state output.
+
+    The generated response must be in the format 'ANSWER: <answers>',
+    otherwise we can't extract what the model thinks is "true". We can be a
+    bit flexible whether these are "AB" vs "A,B" vs "A B".
+
+    However, if the answer isn't in the expected format the model has
+    failed in the task so we'll ultimately just mark it as incorrect
+    """
+    # First check whether the string strictly ends with the expected answer
+    # In this case, we're looking for a single line which contains the expected
+    # ANSWER: <answer> string with only whitespace or a period/full stop at the end.
+    match = re.search(
+        r'(?i)^ANSWER\s*:\s*([A-Za-z\d ,]+)\s*(?:$|\n|\.)',
+        state.output.completion,
+        flags=re.MULTILINE,
+    )
+
+    # If we couldn't match the strict version, we can try the less strict
+    # version for backward compatibility
+    if match is None:
+        match = re.search(
+            r'(?i)ANSWER\s*:\s*([A-Za-z\d ,]+)(?:[^\w]|\n|$|\.)',
+            state.output.completion,
+        )
+
+    if match is None:
+        fallback_answer = _fallback_parse_answer(state.output.completion)
+        if fallback_answer:
+            return fallback_answer
+
+    if match is None:
+        return set()
+
+    matched = match.group(1)
+
+    # Strip trailing period / full stop
+    matched = matched.strip()
+    matched = matched.rstrip('.')
+
+    allowed_options = set(answer_character(i) for i in range(len(state.choices)))
+
+    if multiple_correct:
+        # Match must contain only the allowed choices
+        # (may be separated by commas, spaces, the word 'and', or nothing at all)
+
+        matched = matched.replace(' and ', '')
+
+        matched = matched.replace(' ', '')
+
+        split_comma = set(matched.split(','))
+        if split_comma.issubset(allowed_options):
+            answers = split_comma
+            return answers
+
+        split_nothing = set(matched)
+        if split_nothing.issubset(allowed_options):
+            answers = split_nothing
+            return answers
+
+    else:
+        # Match must contain a single letter in the allowed choices
+        if matched in allowed_options:
+            answers = {matched}
+            return answers
+
+    return set()
+
+
+def parse_answers_zh(state: TaskState, multiple_correct: bool = False) -> set[str]:
+    """
+    Convenience function for extracting answers from the state output in Chinese format.
+
+    The generated response must be in the format '答案:选项',
+    otherwise we can't extract what the model thinks is "true". We can be a
+    bit flexible whether these are "AB" vs "A,B" vs "A B".
+    """
+    # Simple pattern to capture answers with optional bold markdown
+    pattern = r'答案\s*[::]\s*([A-Za-z0-9,,]+)'
+    match = re.search(pattern, state.output.completion, flags=re.MULTILINE)
+
+    if match is None:
+        fallback_answer = _fallback_parse_answer(state.output.completion)
+        if fallback_answer:
+            return fallback_answer
+
+    if match is None:
+        return set()
+
+    matched = match.group(1).strip().rstrip('。.')
+    allowed_options = set(answer_character(i) for i in range(len(state.choices)))
+
+    if multiple_correct:
+        # Handle comma-separated or continuous letters
+        matched = matched.replace(' 和 ', '').replace(' ', '').replace(',', ',')
+        answers = set(matched.split(',')) if ',' in matched else set(matched)
+        return answers if answers.issubset(allowed_options) else set()
+    else:
+        # Single answer
+        return {matched} if matched in allowed_options else set()
+
+
+def set_choices_based_on_generated_response(state: TaskState, answers: set[str]) -> None:
+    true_answers = [answer_index(letter) for letter in answers]
+
+    for i in range(len(state.choices)):
+        if i in true_answers:
+            state.choices.mark_choice(i, True)
+        else:
+            state.choices.mark_choice(i, False)
+
+
+def valid_template(template: str) -> bool:
+    """Check if a template has the required capture groups for a multiple choice question"""
+    return bool(re.search(r'\{question\}', template) and re.search(r'\{choices\}', template))
+
+
+class MultipleChoiceTemplate:
+    """
+    Templates for multiple choice questions.
+    """
+
+    SINGLE_ANSWER = SINGLE_ANSWER_TEMPLATE
+    SINGLE_ANSWER_COT = SINGLE_ANSWER_TEMPLATE_COT
+    MULTIPLE_ANSWER = MULTIPLE_ANSWER_TEMPLATE
+    MULTIPLE_ANSWER_COT = MULTIPLE_ANSWER_TEMPLATE_COT
+    CHINESE_FEW_SHOT_TEMPLATE = CHINESE_FEW_SHOT_TEMPLATE
+    CHINESE_SINGLE_ANSWER_TEMPLATE = CHINESE_SINGLE_ANSWER_TEMPLATE
+    CHINESE_SINGLE_ANSWER_TEMPLATE_COT = CHINESE_SINGLE_ANSWER_TEMPLATE_COT
+
+
+def answer_character(index: int) -> str:
+    r"""
+    Helper to go from array index to char, for example:
+
+    0 -> 'A', 1 -> 'B', etc
+    """
+    if index < 26:
+        return chr(ord('A') + index)
+    else:
+        return str(index - 25)
+
+
+def answer_index(char: str) -> int:
+    r"""
+    Helper to go from char to array index, for example:
+
+    'A' -> 0, 'B' -> 1, etc
+    """
+    if char.isalpha() or char == ',' or char == ' ':
+        return ord(char.upper()) - ord('A')
+    elif char.isnumeric():
+        return 25 + int(char)
+    else:
+        raise ValueError(f'Unepxected multiple choice answer: {char} (must be a letter or number)')
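A minimal sketch of the prompt-building helpers in the new module; the question and options are invented, and only functions shown in the diff above are used (a plain list of strings is accepted because prompt wraps it into Choices internally):

from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, answer_index, prompt

options = ['red', 'green', 'blue']
text = prompt(
    question='Which color is the sky on a clear day?',
    choices=options,
    template=MultipleChoiceTemplate.SINGLE_ANSWER,
)
# text contains the instruction with letters 'A,B,C', the question, and
# the options rendered as 'A) red\nB) green\nC) blue'.
print(text)

print(answer_character(2))  # 'C'
print(answer_index('C'))    # 2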