evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/benchmarks/data_collection/data_collection_adapter.py (new file)
@@ -0,0 +1,215 @@
import copy
import os
from collections import defaultdict
from typing import Any, Dict, List

from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter
from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.metric.scorer import AggScore, SampleScore
from evalscope.api.registry import get_benchmark, register_benchmark
from evalscope.config import TaskConfig
from evalscope.constants import DataCollection, Tags
from evalscope.report.generator import ReportGenerator
from evalscope.report.report import Report
from evalscope.utils.logger import get_logger

logger = get_logger()


@register_benchmark(
    BenchmarkMeta(
        name=DataCollection.NAME,
        pretty_name='Data-Collection',
        dataset_id='',  # dataset_id need to be set
        description='Custom Data collection, mixing multiple evaluation datasets for '
        'a unified evaluation, aiming to use less data to achieve a more comprehensive '
        'assessment of the model\'s capabilities. '
        '[Usage Reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html)',
        tags=[Tags.CUSTOM],
        metric_list=['acc'],
        eval_split='test',
        prompt_template='',
    )
)
class DataCollectionAdapter(DefaultDataAdapter):

    def __init__(self, **kwargs):
        """
        Data adapter for collection dataset.
        """
        super().__init__(**kwargs)

    def load(self):
        # Try to load dataset from local disk
        dataset_name_or_path = self.dataset_id
        if os.path.exists(dataset_name_or_path):
            logger.info(f'Loading dataset from {dataset_name_or_path}')
            dataset_path = dataset_name_or_path
        else:
            from modelscope import dataset_snapshot_download

            # Load dataset from remote
            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
            # download dataset snapshot
            dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern='*.jsonl')

        dataset = LocalDataLoader(
            data_id_or_path=dataset_path,
            split=self.eval_split,
            sample_fields=self.record_to_sample,
            subset='test',  # NOTE: using hardcoded test subset
            limit=self.limit,
            repeats=self.repeats,
            shuffle=self.shuffle,
        ).load()

        test_dataset = DatasetDict({self.default_subset: dataset})

        return test_dataset, None

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """
        Convert a data record to a Sample object. Every record is a DatasetEntry.

        Args:
            record (Dict[str, Any]): Input data record.

        Returns:
            Sample: Sample object with input, target, and metadata.
        """
        from evalscope.collections import DatasetEntry

        entry = DatasetEntry.model_validate(record)
        sample = Sample.model_validate(entry.prompt)

        record_without_prompt = copy.deepcopy(record)
        del record_without_prompt['prompt']
        sample.metadata[DataCollection.INFO] = record_without_prompt  # keep all metadata
        return sample

    def _post_process_samples(self):
        """Post process of each sample"""
        self._initialize_adapters()

    def _initialize_adapters(self):
        """Init adapters for each dataset and create dataset id map"""
        self.dataset_adapters: Dict[str, DataAdapter] = {}
        self.dataset_name_map = defaultdict(lambda: defaultdict(list))

        # load dataset args
        dataset_args = copy.deepcopy(self._task_config.dataset_args)

        # Iterate through each sample in the dataset
        dataset = self.test_dataset[self.default_subset]
        for sample in dataset:
            collection_info = sample.metadata.get(DataCollection.INFO, {})
            dataset_name = collection_info.get('dataset_name', '')
            subset_name = collection_info.get('subset_name', '')
            # create id mapping
            self.dataset_name_map[dataset_name][subset_name].append(sample.id)

            # update dataset args
            cur_dataset_args = dataset_args.get(dataset_name, {})

            # Initialize dataset adapter
            if dataset_name not in self.dataset_adapters:
                config = TaskConfig(dataset_args={dataset_name: cur_dataset_args})
                self.dataset_adapters[dataset_name] = get_benchmark(dataset_name, config=config)

    def _get_adapter(self, metadata: Dict[str, Any]) -> DataAdapter:
        collection_info = metadata.get(DataCollection.INFO, {})
        dataset_name = collection_info.get('dataset_name', '')
        return self.dataset_adapters.get(dataset_name)

    def run_inference(self, model, sample, output_dir, **kwargs) -> TaskState:
        data_adapter = self._get_adapter(sample.metadata)
        if not data_adapter:
            raise ValueError(f'No data adapter found for sample: {sample}')

        return data_adapter.run_inference(model, sample, output_dir, **kwargs)

    def calculate_metrics(self, task_state) -> SampleScore:
        data_adapter = self._get_adapter(task_state.metadata)
        if not data_adapter:
            raise ValueError(f'No data adapter found for task state: {task_state}')

        return data_adapter.calculate_metrics(task_state)

    def aggregate_scores(self, sample_scores: List[SampleScore]):
        import pandas as pd
        from tabulate import tabulate

        data = []
        for sample_score in sample_scores:
            collection_info = sample_score.sample_metadata[DataCollection.INFO]
            main_score = sample_score.score.main_value
            main_metric = sample_score.score.main_score_name

            # use main score
            data.append(
                dict(
                    task_type=collection_info['task_type'],
                    categories=tuple(collection_info['categories']),
                    dataset_name=collection_info['dataset_name'],
                    subset_name=collection_info['subset_name'],
                    tags=collection_info['tags'],
                    sample_id=sample_score.sample_id,
                    metric=main_metric,
                    score=main_score
                )
            )

        df = pd.DataFrame(data)

        def aggregate_and_sort(df, group_by_cols):
            # aggregate by group_by_cols, and calculate average_score and count
            report_df = df.groupby(group_by_cols) \
                .agg(average_score=('score', 'mean'), count=('score', 'size')) \
                .reset_index()
            report_df['average_score'] = report_df['average_score'].round(4)
            report_df = report_df.sort_values(by='count', ascending=False) \
                .to_dict(orient='records')
            return report_df

        # multi-level aggregation
        subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
        dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
        task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])

        # explode tags to multiple rows
        df_exploded_tags = df.explode('tags')
        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])

        # process multi-level categories
        df_categories = df.copy()
        # multi-level aggregation for categories
        max_depth = df_categories['categories'].apply(len).max()
        for level in range(max_depth):
            df_categories[f'category{level}'] = df_categories['categories'].apply(
                lambda x: x[level] if len(x) > level else ''
            )
        category_report_df = aggregate_and_sort(
            df_categories, [f'category{level}' for level in range(max_depth)] + ['metric']
        )

        # convert to dict format
        report_dict = {
            'subset_level': subset_report_df,
            'dataset_level': dataset_report_df,
            'task_level': task_report_df,
            'tag_level': tag_report_df,
            'category_level': category_report_df,
        }

        # record report
        for level, data in report_dict.items():
            table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
            logger.info(f'{level} Report:\n{table}')

        return df

    def generate_report(self, scores, model_name, output_dir, **kwargs) -> Report:
        df = scores[self.default_subset]
        report = ReportGenerator.gen_collection_report(df, self.name, model_name)
        return report
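
For orientation, the multi-level aggregation in `aggregate_scores` above is an ordinary pandas group-by over per-sample rows. A minimal standalone sketch with made-up rows (plain dicts covering a subset of the fields, rather than real evalscope `SampleScore` objects):

```python
import pandas as pd
from tabulate import tabulate

# Made-up per-sample rows mirroring a subset of the fields collected in aggregate_scores
rows = [
    {'task_type': 'math', 'dataset_name': 'gsm8k', 'subset_name': 'main', 'metric': 'acc', 'score': 1.0},
    {'task_type': 'math', 'dataset_name': 'gsm8k', 'subset_name': 'main', 'metric': 'acc', 'score': 0.0},
    {'task_type': 'knowledge', 'dataset_name': 'mmlu', 'subset_name': 'stem', 'metric': 'acc', 'score': 1.0},
]
df = pd.DataFrame(rows)


def aggregate_and_sort(df, group_by_cols):
    # Same shape as the helper above: mean score plus sample count per group
    report = (
        df.groupby(group_by_cols).agg(average_score=('score', 'mean'), count=('score', 'size')).reset_index()
    )
    report['average_score'] = report['average_score'].round(4)
    return report.sort_values(by='count', ascending=False).to_dict(orient='records')


dataset_level = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
print(tabulate(dataset_level, headers='keys', tablefmt='pretty'))
```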

File without changes

evalscope/benchmarks/docmath/docmath_adapter.py (new file)
@@ -0,0 +1,143 @@
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.metric import Score
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.logger import get_logger

logger = get_logger()

TEMPLATE_0SHOT = """Please read the following text and answer the question below.

<text>
{context}
</text>

{question}

Format your response as follows: "Therefore, the answer is (insert answer here)"."""


@register_benchmark(
    BenchmarkMeta(
        name='docmath',
        pretty_name='DocMath',
        tags=[Tags.REASONING, Tags.MATH, Tags.LONG_CONTEXT],
        description=
        'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
        dataset_id='yale-nlp/DocMath-Eval',
        metric_list=['acc'],
        subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
        eval_split='test',
        prompt_template=TEMPLATE_0SHOT,
    )
)
class DocMathAdapter(DefaultDataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._use_llm_judge = True  # Enable LLM judge for DocMath
        self.split_as_subset = True  # Use split as subset for DocMath

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """
        Convert a data record to a Sample object.

        Args:
            record (Dict[str, Any]): Input data record.

        Returns:
            Sample: Sample object with input, target, and metadata.
        """
        ground_truth = record['ground_truth']

        return Sample(
            input=record['question'],
            target=str(ground_truth),
            metadata={
                'question_id': record.get('question_id', ''),
                'paragraphs': record['paragraphs'],
                'answer_type': type(ground_truth).__name__
            }
        )

    def format_prompt_template(self, sample):
        context = '\n'.join(sample.metadata['paragraphs'])
        question = sample.input
        return self.prompt_template.format(context=context, question=question)

    def extract_answer(self, prediction: str, task_state: TaskState):
        """
        Extract the answer from the model prediction.
        """
        from .utils import extract_answer

        extracted_answer = extract_answer(prediction)
        return extracted_answer

    def match_score(
        self,
        original_prediction: str,
        filtered_prediction: str,
        reference: str,
        task_state: TaskState,
    ) -> Score:
        """
        Calculate accuracy score by matching prediction with reference.
        """
        from .utils import get_acc

        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )

        answer_type = task_state.metadata.get('answer_type', 'unknown')
        accuracy = get_acc(prediction=filtered_prediction, gt=reference, answer_type=answer_type)
        score.value = {'acc': accuracy}
        score.main_score_name = 'acc'

        return score

    def llm_match_score(
        self,
        original_prediction: str,
        filtered_prediction: str,
        reference: str,
        task_state: TaskState,
    ) -> Score:
        """
        Use LLM judge to evaluate the prediction against the reference.
        """
        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE

        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )

        question = task_state.metadata.get('question', '')

        # Get grading response
        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=reference, answer_2=filtered_prediction)
        orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)

        # Parse grading response
        if 'YES' in orm_response:
            accuracy = 1.0
        else:
            accuracy = 0.0

        score.value = {'acc': accuracy}
        score.explanation = f'LLM judge: {orm_response}'
        score.metadata = {
            'source': 'llm_judge',
            'judge_strategy': self.judge_strategy,
            'model': self.llm_judge.model_id
        }
        score.main_score_name = 'acc'

        return score
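
As a rough illustration of how `format_prompt_template` above fills `TEMPLATE_0SHOT`, here is a toy record with invented values:

```python
# Toy DocMath-style record with invented values (real entries come from yale-nlp/DocMath-Eval)
record = {
    'question': 'What is the total revenue across both years, in millions?',
    'paragraphs': ['Revenue in 2022 was 10 million USD.', 'Revenue in 2023 was 12 million USD.'],
    'ground_truth': 22.0,
}

TEMPLATE_0SHOT = """Please read the following text and answer the question below.

<text>
{context}
</text>

{question}

Format your response as follows: "Therefore, the answer is (insert answer here)"."""

# Mirrors format_prompt_template: paragraphs are joined into the context block
prompt = TEMPLATE_0SHOT.format(context='\n'.join(record['paragraphs']), question=record['question'])
print(prompt)
```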

evalscope/benchmarks/docmath/utils.py (new file)
@@ -0,0 +1,219 @@
import math
import numpy as np
import re
from sympy import Rational

from evalscope.utils.logger import get_logger

logger = get_logger()

GENERAL_ORM_PROMPT = """You are an expert in verifying if two answers are the same.
Your input is a problem and two answers, Answer 1 and Answer 2. You need to check if they are equivalent.
Your task is to determine if two answers are equivalent, without attempting to solve the original problem.
Compare the answers to verify they represent identical values or meaning, even when written in different forms or notations.

Your output must follow the following format:
1) Provide an explanation for why the answers are equivalent or not.
2) Then provide your final answer in the form of: [[YES]] or [[NO]]
"""  # noqa: E501

ORM_USER_TEMPLATE = """
Problem: {problem}
Answer 1: {answer_1}
Answer 2: {answer_2}
"""


def round_up_to_decimal(number, decimals):
    factor = 10**decimals
    return math.ceil(number * factor) / factor


def is_number(string):
    pattern = r'^[-+]?(\d{1,3}(,\d{3})*|(\d+))(\.\d+)?$'
    match = re.match(pattern, string)
    return bool(match)


def is_scientific_number(string):
    pattern = r'^[-+]?\d+(\.\d+)?e[-]?\d+$'
    match = re.match(pattern, string)
    return bool(match)


def normalize(prediction: str):
    # Preprocessing the string [Stage 1]
    prediction = prediction.strip()
    prediction = prediction.rstrip('.')
    if not isinstance(prediction, str):
        prediction = str(prediction) if prediction is not None else '0'

    for money in ['£', '€', '¥', 'million', 'billion', 'thousand', 'US', 'USD', 'RMB']:
        prediction = prediction.replace(money, '')

    # Replace special tokens
    if '=' in prediction:
        prediction = prediction.split('=')[-1].strip()
    if '≈' in prediction:
        prediction = prediction.split('≈')[-1].strip()
    if '`' in prediction:
        prediction = prediction.replace('`', '')
    if '%' in prediction:
        prediction = prediction.replace('%', '')
    if '$' in prediction:
        prediction = prediction.replace('$', '')
    if '°' in prediction:
        prediction = prediction.replace('°', '')

    # Detect the boolean keyword in the generation
    if prediction in ['true', 'yes', 'false', 'no']:
        if prediction == 'true' or prediction == 'yes':
            prediction = 'True'
        else:
            prediction = 'False'
    if 'True' in prediction or 'False' in prediction:
        prediction = 'True' if 'True' in prediction else 'False'

    # Detect the approximation keyword
    if 'approximately' in prediction:
        prediction = prediction.replace('approximately', '').strip()
    if ' or ' in prediction:
        prediction = prediction.split(' or ')[0]

    # Drop the units before and after the number
    if re.match(r'[-+]?(?:[\d,]*\.*\d+) [^0-9 ]+$', prediction):
        prediction = re.search(r'([-+]?(?:[\d,]*\.*\d+)) [^0-9 ]+$', prediction).group(1)
    if re.match(r'[^0-9 ]+ [-+]?(?:[\d,]*\.*\d+)$', prediction):
        prediction = re.search(r'[^0-9 ]+ ([-+]?(?:[\d,]*\.*\d+))$', prediction).group(1)
    if re.match(r'[-+]?(?:[\d,]*\.*\d+)[^\d]{1,2}$', prediction):
        prediction = re.search(r'([-+]?(?:[\d,]*\.*\d+))[^\d]{1,2}$', prediction).group(1)
    if re.match(r'[^-+\d]{1,2}(?:[\d,]*\.*\d+)$', prediction):
        prediction = re.search(r'[^-+\d]{1,2}((?:[\d,]*\.*\d+))$', prediction).group(1)

    # Preprocessing the number [Stage 1]
    if '10^' in prediction:
        prediction = re.sub(r'10\^(-?\d+)', r'math.pow(10, \1)', prediction)
    if ' x ' in prediction:
        prediction = prediction.replace(' x ', '*')
    if ' × ' in prediction:
        prediction = prediction.replace(' × ', '*')
    if is_number(prediction):
        prediction = prediction.replace(',', '')

    # Preprocessing the option [Stage 3]
    if '(a)' in prediction or '(b)' in prediction or '(c)' in prediction or '(d)' in prediction:
        prediction = '"' + re.search(r'\([a-d]\)', prediction).group(0) + '"'

    # If the prediction is empty, use dummy '0'
    if not prediction:
        prediction = '0'

    # Converting the string answer to a number/list/bool/option
    try:
        prediction = eval(prediction)
    except Exception:
        # TO CHECK
        prediction = 0

    # Performing common type conversion
    if isinstance(prediction, (set, tuple)):
        prediction = list(prediction)
        if isinstance(prediction[0], complex):
            prediction = [tmp.real for tmp in prediction]
        elif isinstance(prediction[0], Rational):
            prediction = [float(tmp) for tmp in prediction]
    elif isinstance(prediction, np.ndarray):
        prediction = prediction.tolist()
    else:
        if isinstance(prediction, complex):
            prediction = prediction.real
        elif isinstance(prediction, Rational):
            prediction = float(prediction)

    return prediction


def extract_answer(response: str):
    """Parses the final answer from the model's response text.

    Args:
        response: Text extracted from the model's response

    Returns:
        The final answer as a numeric value (string), or None if not found
    """
    # Remove any asterisks or other unwanted characters
    response = response.replace('*', '')
    response = response.replace('(', '')
    response = response.replace(')', '')

    # Search for the pattern 'the answer is {final answer}.'
    match = re.search(r'the answer is (\=?\≈?\`?\%?\$?\°?\£?\€?\¥?-?[0-9\.,]+)', response, re.IGNORECASE)

    if match:
        # Remove commas from the matched number (if any)
        res = match.group(1).replace(',', '').rstrip('.')
        return res
    else:
        return response


def within_eps(pred: float, gt: float):
    eps = abs(gt) * 0.0015
    if pred >= gt - eps and pred <= gt + eps:
        return True
    else:
        return False


def compare_two_numbers(p, gt):
    if isinstance(p, int) or isinstance(p, float):
        pass
    elif isinstance(p, list) or isinstance(p, bool) or isinstance(p, str):
        return False
    elif isinstance(p, tuple) or isinstance(p, complex) or isinstance(p, dict):
        return False
    else:
        raise ValueError(p)

    v1, v2 = max(abs(gt), abs(p)), min(abs(gt), abs(p))
    if (v1 != 0 and v2 != 0) and int(math.log10(v1 / v2)) == math.log10(v1 / v2):
        return True

    if v2 <= v1 / 50 and within_eps(pred=v2 * 100, gt=v1):
        return True
    elif v2 <= v1 / 500 and within_eps(pred=v2 * 1000, gt=v1):
        return True
    elif v2 <= v1 / 50000 and within_eps(pred=v2 * 100000, gt=v1):
        return True

    if round_up_to_decimal(v1, 2) == round_up_to_decimal(v2, 2):
        return True

    return within_eps(pred=p, gt=gt)


def get_acc(prediction, gt, answer_type, cot=True):
    try:
        if cot:
            prediction = normalize(prediction)
        else:
            prediction = float(prediction)

        assert answer_type in ['int', 'float', 'float64', 'bool'], answer_type
        if isinstance(prediction, (str, int, float, bool)) or isinstance(prediction, list):
            # Comparing prediction against the reference
            if answer_type in ['bool']:
                acc = int(prediction == bool(gt))
            elif answer_type == 'int':
                acc = int(compare_two_numbers(prediction, int(gt)))
            elif answer_type == 'float' or answer_type == 'float64':
                acc = int(compare_two_numbers(prediction, float(gt)))
            else:
                acc = 0
        else:
            acc = 0
            logger.error('Error: ', prediction, type(prediction))
        return acc
    except Exception:
        return 0
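
A small worked example of the DocMath scoring path, assuming evalscope 1.2.0 is installed so the helpers above are importable; the response string and numbers are made up:

```python
# Assumes evalscope 1.2.0 is installed; module path follows the file layout above
from evalscope.benchmarks.docmath.utils import extract_answer, get_acc

# Made-up model response in the format requested by TEMPLATE_0SHOT
response = 'Adding both years gives 1,234.5. Therefore, the answer is 1,234.5.'

pred = extract_answer(response)  # strips formatting and commas -> '1234.5'

# get_acc normalizes the string, evaluates it to a float, then compares against
# the ground truth with a relative tolerance of about 0.15% (see within_eps)
print(get_acc(prediction=pred, gt='1234.3', answer_type='float'))  # 1 (within tolerance)
print(get_acc(prediction=pred, gt='1300', answer_type='float'))    # 0 (outside tolerance)
```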

File without changes

evalscope/benchmarks/docvqa/docvqa_adapter.py (new file)
@@ -0,0 +1,67 @@
import json
from typing import Any, Dict, List

from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator.state import TaskState
from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.io_utils import bytes_to_base64
from evalscope.utils.logger import get_logger

logger = get_logger()

PROMPT = """Answer the question according to the image using a single word or phrase.
{question}
The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501


@register_benchmark(
    BenchmarkMeta(
        name='docvqa',
        pretty_name='DocVQA',
        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
        description=
        'DocVQA (Document Visual Question Answering) is a benchmark designed to evaluate AI systems on their ability to answer questions based on the content of document images, such as scanned pages, forms, or invoices. Unlike general visual question answering, it requires understanding not just the text extracted by OCR, but also the complex layout, structure, and visual elements of a document.',  # noqa: E501
        dataset_id='lmms-lab/DocVQA',
        subset_list=['DocVQA'],
        metric_list=['anls'],
        eval_split='validation',
        prompt_template=PROMPT,
    )
)
class DocVQAAdapter(VisionLanguageAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.add_aggregation_name = False

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:

        input_text = PROMPT.format(question=record['question'])
        content_list: List[Content] = [ContentText(text=input_text)]
        image = record.get('image')
        if image:
            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
            content_list.append(ContentImage(image=image_base64))
        return Sample(
            input=[ChatMessageUser(content=content_list)],
            target=json.dumps(record.get('answers')),  # answers is a list
            metadata={
                'questionId': record.get('questionId'),
                'question_types': record.get('question_types'),
                'docId': record.get('docId'),
                'ucsf_document_id': record.get('ucsf_document_id'),
                'ucsf_document_page_no': record.get('ucsf_document_page_no'),
            }
        )

    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
        import re

        pattern = r'ANSWER:\s*(.*)'
        match = re.search(pattern, prediction)
        if match:
            return match.group(1).strip()
        return prediction.strip()
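
And a quick sketch of the `ANSWER:` extraction used by `DocVQAAdapter.extract_answer` above, applied to a made-up model response:

```python
import re

# Made-up model output following the "ANSWER: $ANSWER" instruction in PROMPT
prediction = 'The header of the scanned invoice names the vendor.\nANSWER: Acme Corp'

match = re.search(r'ANSWER:\s*(.*)', prediction)
answer = match.group(1).strip() if match else prediction.strip()
print(answer)  # Acme Corp
```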

File without changes