evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/benchmarks/math_qa/math_qa_adapter.py (new file):

```diff
@@ -0,0 +1,35 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+DESCRIPTION = (
+    'MathQA dataset is gathered by using a new representation language to annotate over the '
+    'AQuA-RAT dataset with fully-specified operational programs.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='math_qa',
+        pretty_name='MathQA',
+        tags=[Tags.REASONING, Tags.MATH, Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/math-qa',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
+)
+class MathQAAdapter(MultiChoiceAdapter):
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={'reasoning': record['reasoning']},
+        )
```

File without changes
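
Benchmarks registered this way are selected by the `name` given in `BenchmarkMeta`. A minimal usage sketch, assuming the `TaskConfig`/`run_task` entry points from the project README; the model id and the `limit` value are illustrative placeholders, not part of this diff:

```python
from evalscope import TaskConfig, run_task

# Hypothetical usage sketch: evaluate a chat model on the newly added
# 'math_qa' benchmark registered by the adapter above.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id (assumption)
    datasets=['math_qa'],                # matches name='math_qa' in BenchmarkMeta
    limit=5,                             # run only a few samples as a smoke test
)

run_task(task_cfg=task_cfg)
```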

evalscope/benchmarks/math_verse/math_verse_adapter.py (new file):

```diff
@@ -0,0 +1,105 @@
+# flake8: noqa: E501
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+MULTI_CHOICE_TYPE = 'multi-choice'
+OPEN_TYPE = 'free-form'
+
+OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.'
+
+MULT_CHOICE_PROMPT = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+{question}
+"""
+
+SUBSET_LIST = ['Text Dominant', 'Text Lite', 'Vision Intensive', 'Vision Dominant', 'Vision Only']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='math_verse',
+        pretty_name='MathVerse',
+        dataset_id='evalscope/MathVerse',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'MathVerse, an all-around visual math benchmark designed for an equitable and in-depth evaluation of MLLMs. 2,612 high-quality, multi-subject math problems with diagrams from publicly available sources. Each problem is then transformed by human annotators into six distinct versions, each offering varying degrees of information content in multi-modality, contributing to 15K test samples in total. This approach allows MathVerse to comprehensively assess whether and how much MLLMs can truly understand the visual diagrams for mathematical reasoning.',
+        subset_list=SUBSET_LIST,
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        default_subset='testmini',
+        eval_split='testmini',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class MathVerseAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+        self._use_llm_judge = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a dataset record to a Sample. Unifies handling for both multi-choice and free-form.
+        Builds the content list inline and appends image content if provided.
+
+        Args:
+            record: Raw dataset record.
+
+        Returns:
+            Sample: The standardized sample ready for evaluation.
+        """
+        question_type = record.get('question_type', OPEN_TYPE)
+        question: str = record.get('question', '')
+        content_list: list[Content] = []
+
+        # Choose prompt text based on type; keep a single unified flow for creating Sample
+        if question_type == MULTI_CHOICE_TYPE:
+            prompt_text = MULT_CHOICE_PROMPT.format(question=question).strip()
+        else:
+            prompt_text = OPEN_PROMPT.format(question=question).strip()
+
+        content_list.append(ContentText(text=prompt_text))
+
+        # Append image if exists
+        image = record.get('image')
+        if image and isinstance(image, dict):
+            image_bytes = image.get('bytes')
+            if image_bytes:
+                image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata: Dict[str, Any] = {
+            'sample_index': record.get('sample_index'),
+            'problem_index': record.get('problem_index'),
+            'problem_version': record.get('problem_version'),
+            'question_type': question_type,
+            'query_wo': record.get('query_wo'),
+            'query_cot': record.get('query_cot'),
+            'question_for_eval': record.get('question_for_eval'),
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            subset_key=record['problem_version'],
+            metadata=metadata,
+        )
+
+    def extract_answer(self, prediction: str, task_state):
+        from evalscope.metrics.math_parser import extract_answer
+
+        return extract_answer(prediction)
```

File without changes
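
Several of the math adapters in this release (MathVerse, MathVision, MathVista, Minerva-Math) delegate final-answer extraction to `evalscope.metrics.math_parser.extract_answer`, which pairs with the `\boxed{}` instruction in their prompts; the real implementation lives in the new `evalscope/metrics/math_parser.py` (+545). The snippet below is only an illustrative sketch of what boxed-answer extraction involves, not the library's code:

```python
from typing import Optional


def extract_boxed_answer(text: str) -> Optional[str]:
    """Return the contents of the last \\boxed{...} span, respecting nested braces.

    Illustrative sketch only; the adapters actually call
    evalscope.metrics.math_parser.extract_answer, which covers many more formats.
    """
    start = text.rfind('\\boxed{')
    if start == -1:
        return None
    i = start + len('\\boxed{')
    depth = 1
    chars = []
    while i < len(text):
        ch = text[i]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return ''.join(chars)  # closing brace of \boxed reached
        chars.append(ch)
        i += 1
    return None  # unbalanced braces


# e.g. extract_boxed_answer('so the result is \\boxed{\\frac{1}{2}}') == '\\frac{1}{2}'
```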

evalscope/benchmarks/math_vision/math_vision_adapter.py (new file):

```diff
@@ -0,0 +1,116 @@
+# flake8: noqa: E501
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+logger = get_logger()
+
+OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}} without units.'
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+SUBSET_LIST = ['level 1', 'level 2', 'level 3', 'level 4', 'level 5']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='math_vision',
+        pretty_name='MathVision',
+        dataset_id='evalscope/MathVision',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'The MATH-Vision (MATH-V) dataset, a meticulously curated collection of 3,040 high-quality mathematical problems with visual contexts sourced from real math competitions.',
+        subset_list=SUBSET_LIST,
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class MathVisionAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        if len(record['options']) > 0:
+            question_type = 'multi_choice'
+        else:
+            question_type = 'free_form'
+        content_list, answers_list = MathVisionAdapter.create_content_and_answers_list(record, question_type)
+        metadata = {
+            'id': record['id'],
+            'image': record['image'],
+            'solution': record['solution'],
+            'level': record['level'],
+            'question_type': question_type,
+            'subject': record['subject']
+        }
+        if question_type == 'multi_choice':
+            label_answer = record['answer']
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                choices=answers_list,
+                target=label_answer,
+                subset_key=f'level {record["level"]}',
+                metadata=metadata
+            )
+        elif question_type == 'free_form':
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                target=record['answer'],
+                subset_key=f'level {record["level"]}',
+                metadata=metadata
+            )
+        else:
+            raise ValueError(f'Unexpected question_type: {question_type}')
+
+    @staticmethod
+    def create_content_and_answers_list(record: Dict[str, Any], question_type) -> tuple[List[Content], List[str]]:
+        """
+        Create a list of content elements and a list of answers from a record.
+
+        Args:
+            record (dict): The record containing question, images, and options.
+            question_type (str): The type of this question
+
+
+        Returns:
+            tuple: A tuple containing:
+                - content_list (list): A list of content elements (text and images).
+                - answers_list (list): A list of possible answers (for multiple-choice questions).
+        """
+
+        # Replace <image1>, <image2> ... to [image1], [image2], ... from question text
+        question = re.sub(r'<image(\d+)>', r'[image\1]', record['question']).strip()
+
+        if question_type == 'multi_choice':
+            answers_list = record['options']
+            input_text = prompt(question=question, choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: List[Content] = [ContentText(text=input_text)]
+        else:
+            answers_list: List[str] = []
+            content_list: List[Content] = [ContentText(text=OPEN_PROMPT.format(question=question))]
+        image = record['decoded_image']
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return content_list, answers_list
+
+    def extract_answer(self, prediction: str, task_state):
+        from evalscope.metrics.math_parser import extract_answer
+
+        return extract_answer(prediction)
```

File without changes

evalscope/benchmarks/math_vista/math_vista_adapter.py (new file):

```diff
@@ -0,0 +1,114 @@
+# flake8: noqa: E501
+import re
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+logger = get_logger()
+
+OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}} without units.'
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+MULTI_CHOICE_TYPE = 'multi_choice'
+OPEN_TYPE = 'free_form'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='math_vista',
+        pretty_name='MathVista',
+        dataset_id='evalscope/MathVista',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'MathVista is a consolidated Mathematical reasoning benchmark within Visual contexts. It consists of three newly created datasets, IQTest, FunctionQA, and PaperQA, which address the missing visual domains and are tailored to evaluate logical reasoning on puzzle test figures, algebraic reasoning over functional plots, and scientific reasoning with academic paper figures, respectively. It also incorporates 9 MathQA datasets and 19 VQA datasets from the literature, which significantly enrich the diversity and complexity of visual perception and mathematical reasoning challenges within our benchmark. In total, MathVista includes 6,141 examples collected from 31 different datasets.',
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        eval_split='testmini',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class MathVistaAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        content_list, answers_list = MathVistaAdapter.create_content_and_answers_list(record)
+
+        if record['question_type'] == 'multi_choice':
+            label_answer = self.get_option_label(answers_list, record['answer'])
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                choices=answers_list,
+                target=label_answer,
+                metadata={
+                    'question_type': record['question_type'],
+                    'answer_type': record['answer_type'],
+                    **record['metadata'],
+                }
+            )
+        elif record['question_type'] == 'free_form':
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                target=record['answer'],
+                metadata={
+                    'precision': record['precision'],
+                    'question_type': record['question_type'],
+                    'answer_type': record['answer_type'],
+                    **record['metadata'],
+                }
+            )
+        else:
+            raise ValueError(f"Unexpected question_type: {record['question_type']}")
+
+    def get_option_label(self, options, value):
+        try:
+            index = options.index(value)
+            return chr(ord('A') + index)
+        except ValueError:
+            logger.warning(f"Answer '{value}' not found in options: {options}. This may cause evaluation issues.")
+            return value
+
+    @staticmethod
+    def create_content_and_answers_list(record: dict[str, Any], ) -> tuple[list[Content], list[str]]:
+        """
+        Create a list of content elements and a list of answers from a record.
+
+        Args:
+            record (dict): The record containing question, images, and options.
+
+
+        Returns:
+            tuple: A tuple containing:
+                - content_list (list): A list of content elements (text and images).
+                - answers_list (list): A list of possible answers (for multiple-choice questions).
+        """
+        question_type = record['question_type']
+        if question_type == MULTI_CHOICE_TYPE:
+            answers_list = record['choices']
+            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: list[Content] = [ContentText(text=input_text)]
+        else:
+            answers_list: list[str] = []
+            content_list: list[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+        image = record['decoded_image']
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return content_list, answers_list
+
+    def extract_answer(self, prediction: str, task_state):
+        from evalscope.metrics.math_parser import extract_answer
+
+        return extract_answer(prediction)
```

File without changes

evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py (new file):

```diff
@@ -0,0 +1,32 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+DESCRIPTION = 'MedMCQA is a large-scale MCQA dataset designed to address real-world medical entrance exam questions.'  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='med_mcqa',
+        pretty_name='Med-MCQA',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/medmcqa',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split='train',
+        eval_split='validation',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+    )
+)
+class MedMCQAAdapter(MultiChoiceAdapter):
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
```

File without changes

evalscope/benchmarks/minerva_math/minerva_math_adapter.py (new file):

```diff
@@ -0,0 +1,53 @@
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='minerva_math',
+        pretty_name='Minerva-Math',
+        tags=[Tags.MATH, Tags.REASONING],
+        description='Minerva-math is a benchmark designed to evaluate the mathematical and quantitative '
+        'reasoning capabilities of LLMs. It consists of **272 problems** '
+        'sourced primarily from **MIT OpenCourseWare** '
+        'courses, covering advanced STEM subjects such as solid-state chemistry, astronomy, differential '
+        'equations, and special relativity at the **university and graduate level**.',
+        dataset_id='knoveleng/Minerva-Math',
+        subset_list=['default'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        eval_split='train',
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
+)
+class MinervaMathAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self._use_llm_judge = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['problem'],
+            target=record['solution'],
+            metadata={
+                'type': record['type'],
+                'idx': record['idx'],
+            },
+        )
+
+    def extract_answer(self, prediction: str, task_state):
+        from evalscope.metrics.math_parser import extract_answer
+
+        return extract_answer(prediction)
```

File without changes

evalscope/benchmarks/mm_bench/mm_bench_adapter.py (new file):

```diff
@@ -0,0 +1,99 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, prompt
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='cc_bench',
+        pretty_name='CCBench',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'CCBench is an extension of MMBench with newly design questions about Chinese traditional culture, including Calligraphy Painting, Cultural Relic, Food & Clothes, Historical Figures, Scenery & Building, Sketch Reasoning and Traditional Show.',  # noqa: E501
+        dataset_id='lmms-lab/MMBench',
+        subset_list=['cc'],
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class CCBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
+        input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        label_answer = record.get('answer')
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            choices=answers_list,
+            target=label_answer,
+            metadata={
+                'index': record.get('index'),
+                'category': record.get('category'),
+                'source': record.get('source')
+            }
+        )
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mm_bench',
+        pretty_name='MMBench',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'MMBench is a comprehensive evaluation pipeline comprised of meticulously curated multimodal dataset and a novel circulareval strategy using ChatGPT. It is comprised of 20 ability dimensions defined by MMBench. It also contains chinese version with translated question.',  # noqa: E501
+        dataset_id='lmms-lab/MMBench',
+        subset_list=['cn', 'en'],
+        metric_list=['acc'],
+        eval_split='dev',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class MMBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
+        answers_list = [ans for ans in answers_list if (ans.strip() and ans != 'nan')]
```
|
|
79
|
+
question_hint = record['hint'] + record['question']
|
|
80
|
+
input_text = prompt(question=question_hint, choices=answers_list, template=self.prompt_template)
|
|
81
|
+
content_list: List[Content] = [ContentText(text=input_text)]
|
|
82
|
+
image = record.get('image')
|
|
83
|
+
if image:
|
|
84
|
+
image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
|
|
85
|
+
content_list.append(ContentImage(image=image_base64))
|
|
86
|
+
label_answer = record.get('answer')
|
|
87
|
+
return Sample(
|
|
88
|
+
input=[ChatMessageUser(content=content_list)],
|
|
89
|
+
choices=answers_list,
|
|
90
|
+
target=label_answer,
|
|
91
|
+
metadata={
|
|
92
|
+
'index': record.get('index'),
|
|
93
|
+
'category': record.get('category'),
|
|
94
|
+
'source': record.get('source'),
|
|
95
|
+
'L2-category': record.get('L2-category'),
|
|
96
|
+
'comment': record.get('comment'),
|
|
97
|
+
'split': record.get('split')
|
|
98
|
+
}
|
|
99
|
+
)
|
|
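Both adapters build the same multimodal user message: the formatted multiple-choice prompt as text, followed by the record's image as a base64 data URI. A standalone sketch of that construction, reusing only the calls shown in the diff (the file path and question text are placeholders):

from evalscope.api.messages import ChatMessageUser, ContentImage, ContentText
from evalscope.utils.io_utils import bytes_to_base64

# Placeholder image path; in the adapters the bytes come from record['image']['bytes'].
with open('example.jpg', 'rb') as f:
    raw_bytes = f.read()

# Illustrative prompt text standing in for the output of prompt(question, choices, template).
message = ChatMessageUser(content=[
    ContentText(text='Which dynasty does the calligraphy style shown belong to?\nA. Tang\nB. Song\nC. Ming\nD. Qing'),
    ContentImage(image=bytes_to_base64(raw_bytes, format='jpeg', add_header=True)),
])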
File without changes

@@ -0,0 +1,73 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = r"""
+Answer the following multiple choice question.
+The last line of your response should be of the following format:
+'ANSWER: $LETTER' (without quotes)
+where LETTER is one of A,B,C,D. Think step by step before answering.
+
+{question}
+""".strip()
+
+SUBSET_LIST = [
+    'coarse perception', 'fine-grained perception', 'instance reasoning', 'logical reasoning', 'math',
+    'science & technology'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mm_star',
+        pretty_name='MMStar',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'MMStar: an elite vision-indispensible multi-modal benchmark, aiming to ensure each curated sample exhibits visual dependency, minimal data leakage, and requires advanced multi-modal capabilities.',  # noqa: E501
+        dataset_id='evalscope/MMStar',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        default_subset='val',
+        eval_split='val',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class MMStarAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        input_text = MULT_CHOICE_PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        label_answer = record.get('answer')
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            choices=['A', 'B', 'C', 'D'],
+            target=label_answer,
+            subset_key=record.get('category'),
+            metadata={
+                'index': record.get('index'),
+                'category': record.get('category'),
+                'l2_category': record.get('l2_category'),
+                'source': record.get('meta_info', {}).get('source'),
+                'split': record.get('meta_info', {}).get('split'),
+                'image_path': record.get('meta_info', {}).get('image_path')
+            }
+        )
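Unlike the MMBench adapters, MMStarAdapter formats the question straight into MULT_CHOICE_PROMPT and fixes choices to the bare letters, which is consistent with MMStar questions carrying their lettered options inside the question text; each sample is also routed to a subset via subset_key=record.get('category'). A small sketch of the rendered prompt (the question text is made up for illustration):

# MULT_CHOICE_PROMPT copied from the file above so the sketch is self-contained.
MULT_CHOICE_PROMPT = r"""
Answer the following multiple choice question.
The last line of your response should be of the following format:
'ANSWER: $LETTER' (without quotes)
where LETTER is one of A,B,C,D. Think step by step before answering.

{question}
""".strip()

# Illustrative MMStar-style question with its options embedded in the text.
question = 'What is the shape highlighted in the image? (A) circle (B) square (C) triangle (D) hexagon'
print(MULT_CHOICE_PROMPT.format(question=question))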