evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/metrics/metric.py
@@ -0,0 +1,611 @@
+import json
+import numpy as np
+import os
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.api.metric import Aggregator, AggScore, Metric, SampleScore, SingletonMetric, T2IMetric
+from evalscope.api.registry import register_aggregation, register_metric
+from evalscope.utils.import_utils import check_import
+from .metrics import calculate_pass_at_k, calculate_pass_hat_k, mean, normalize_text
+
+# ##################
+# NLP Metrics ######
+# ##################
+
+
+@register_metric(name='exact_match')
+class ExactMatch(Metric):
+
+    def apply(self, predictions, references):
+        return [
+            float(normalize_text(prediction) == normalize_text(reference))
+            for prediction, reference in zip(predictions, references)
+        ]
+
+
+@register_metric(name='acc')
+class Accuracy(ExactMatch):
+
+    def __init__(self, allow_inclusion: bool = False, numeric: bool = False):
+        self.allow_inclusion = allow_inclusion
+        self.numeric = numeric
+
+    def apply(self, predictions, references):
+        if self.allow_inclusion:
+            results = []
+            for prediction, reference in zip(predictions, references):
+                if prediction and prediction in reference:
+                    results.append(1.0)
+                else:
+                    results.append(0.0)
+            return results
+        elif self.numeric:
+            from .math_parser import math_equal, strip_answer_string
+
+            results = []
+            for prediction, reference in zip(predictions, references):
+                ref_answer = strip_answer_string(reference)
+                results.append(float(math_equal(prediction, ref_answer)))
+
+            return results
+        else:
+            return super().apply(predictions, references)
+
+
+@register_metric(name='numeric_match')
+class NumericMatch(Metric):
+
+    def apply(self, predictions, references):
+        return [float(prediction == reference) for prediction, reference in zip(predictions, references)]
+
+
+@register_metric(name='math_acc')
+class MathAcc(Metric):
+
+    def apply(self, predictions, references):
+        from .math_parser import extract_answer, math_equal, strip_answer_string
+
+        results = []
+        for prediction, reference in zip(predictions, references):
+            pred_answer = strip_answer_string(extract_answer(prediction))
+            ref_answer = strip_answer_string(reference)
+            results.append(float(math_equal(pred_answer, ref_answer)))
+
+        return results
+
+
+@register_metric(name='multi_choice_acc')
+class MultiChoiceAcc(Metric):
+
+    def apply(self, predictions, references):
+        """
+        Calculate accuracy for multiple-choice questions.
+
+        Args:
+            predictions (List[str]): List of predicted answers.
+            references (List[str]): List of correct answers.
+
+        Returns:
+            List[float]: List of accuracy scores (1.0 for correct, 0.0 for incorrect).
+        """
+        res = []
+        for prediction, reference in zip(predictions, references):
+            prediction = set(prediction.strip().upper())
+            reference = set(reference.strip().upper())
+            # if the prediction has answer that not in reference, it is wrong
+            if not prediction.issubset(reference):
+                res.append(0.0)
+                continue
+            common = prediction.intersection(reference)
+            res.append(len(common) / len(reference) if reference else 0.0)
+        return res
+
+
+@register_metric(name='anls')
+class ANLS(Metric):
+
+    def __init__(self, thresh_hold=0.5):
+        self.thresh_hold = thresh_hold
+
+    def apply(self, predictions, references):
+        """
+        Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
+        This implementation is adapted from
+        https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
+
+        Args:
+            references (List[str]): List of correct answers. Each answer can be a string of json.
+            predictions (List[str]): List of predicted answers.
+        """
+        from .metrics import levenshtein_distance
+
+        res = []
+        # Unwrap predictions if it's a nested list
+        for prediction, reference in zip(predictions, references):
+            # Parse the reference which is a json string
+            try:
+                answer = json.loads(reference)
+            except json.JSONDecodeError:
+                answer = reference
+            if isinstance(answer, str):
+                answer = [answer]
+            assert isinstance(answer, list), 'The reference answer should be a list of answers.'
+
+            # Calculate ANLS for each reference answer
+            values = []
+            for ans in answer:
+                # preprocess both the answers - gt and prediction
+                gt_answer = ' '.join(ans.strip().lower().split())
+                det_answer = ' '.join(prediction.strip().lower().split())
+
+                dist = levenshtein_distance(gt_answer, det_answer)
+                length = max(len(ans.upper()), len(prediction.upper()))
+                values.append(0.0 if length == 0 else float(dist) / float(length))
+
+            question_result = 0.0
+            if values:
+                question_result = 1 - min(values)
+                if question_result < self.thresh_hold:
+                    question_result = 0.0
+            res.append(question_result)
+        return res
+
+
+@register_metric(name='bertscore')
+class BertScore(SingletonMetric):
+
+    def _init_once(self, model_id_or_path: str = 'google-bert/bert-base-chinese', **kwargs):
+        """BertScore metric.
+
+        Args:
+            model_id_or_path (str, optional): The model ID on modelscope or path to the pre-trained model.
+                Defaults to 'google-bert/bert-base-chinese'.
+        """
+        check_import('torch', 'torch', raise_error=True, feature_name='BertScore Metric')
+
+        from .bert_score.scorer import BERTScorer
+        self.scorer = BERTScorer(model_id_or_path=model_id_or_path, batch_size=1024, **kwargs)
+
+    def apply(self, predictions: List[str], references: List[str]) -> List[float]:
+        _, _, F1 = self.scorer.score(predictions, references)
+        return [round(f1.item(), 6) for f1 in F1]
+
+
+@register_metric(name='comet')
+class COMETScore(SingletonMetric):
+
+    def _init_once(self, model_id_or_path: str = 'evalscope/wmt22-comet-da'):
+        """COMETScore metric.
+
+        Args:
+            model_name (str, optional): The model name on huggingface.
+                Defaults to 'evalscope/wmt22-comet-da'.
+        """
+        check_import('comet', 'unbabel-comet', raise_error=True, feature_name='COMETScore Metric')
+
+        from comet import load_from_checkpoint
+        from modelscope import snapshot_download
+
+        self.model_name = model_id_or_path
+        model_path = snapshot_download(model_id_or_path)
+        checkpoint_path = os.path.join(model_path, 'checkpoints', 'model.ckpt')
+        self.comet_scorer = load_from_checkpoint(checkpoint_path)
+
+    def apply(self, samples: List[Dict[str, str]]) -> List[float]:
+        """Apply COMET scoring."""
+        import torch
+
+        model_output = self.comet_scorer.predict(
+            samples=samples,
+            batch_size=1024,
+            gpus=1 if torch.cuda.is_available() else 0,
+            progress_bar=False,
+        )
+        scores = model_output.scores if hasattr(model_output, 'scores') else [model_output.system_score] * len(samples)
+
+        return [round(score, 6) for score in scores]
+
+
+# ##################
+# T2I Metrics ######
+# ##################
+@register_metric(name='VQAScore')
+class VQAScore(T2IMetric):
+
+    def _init_once(self, model: str = 'clip-flant5-xxl'):
+        from .t2v_metrics.vqascore import VQAScore
+        self.model = VQAScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='PickScore')
+class PickScore(T2IMetric):
+
+    def _init_once(self, model: str = 'pickscore-v1'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='CLIPScore')
+class CLIPScore(T2IMetric):
+
+    def _init_once(self, model: str = 'openai:ViT-L-14-336'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='BLIPv2Score')
+class BLIPv2Score(T2IMetric):
+
+    def _init_once(self, model: str = 'blip2-itm'):
+        from .t2v_metrics.itmscore import ITMScore
+        self.model = ITMScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='HPSv2Score')
+class HPSv2Score(T2IMetric):
+
+    def _init_once(self, model: str = 'hpsv2'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='HPSv2.1Score')
+class HPSv2_1Score(T2IMetric):
+
+    def _init_once(self, model: str = 'hpsv2.1'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='ImageRewardScore')
+class ImageRewardScore(T2IMetric):
+
+    def _init_once(self, model: str = 'image-reward-v1'):
+        from .t2v_metrics.itmscore import ITMScore
+        self.model = ITMScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='FGA_BLIP2Score')
+class FGA_BLIP2Score(T2IMetric):
+
+    def _init_once(self, model: str = 'fga_blip2'):
+        from .t2v_metrics.itmscore import ITMScore
+        self.model = ITMScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='MPS')
+class MPS(T2IMetric):
+
+    def _init_once(self, model: str = 'mps'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+# ##################
+# Aggregators ######
+# ##################
+@register_aggregation(name='mean')
+class Mean(Aggregator):
+
+    name = 'mean'
+
+    def agg_func(self, values: List[float]) -> float:
+        return mean(values)
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores by computing the mean for each metric.
+
+        Args:
+            scores: List of sample scores to aggregate
+
+        Returns:
+            List of aggregated scores with mean values
+        """
+        if not scores:
+            return []
+
+        # Group score values by metric name
+        metric_values = defaultdict(list)
+        metric_sample_ids = defaultdict(list)
+
+        for score in scores:
+
+            for metric_name, value in score.score.value.items():
+                metric_values[metric_name].append(value)
+                metric_sample_ids[metric_name].append(score.sample_id)
+
+        # Calculate mean for each metric
+        aggregated_scores = []
+        for metric_name, values in metric_values.items():
+            if values:  # Only process non-empty value lists
+                aggregated_scores.append(
+                    AggScore(
+                        score=self.agg_func(values),
+                        metric_name=metric_name,
+                        aggregation_name=self.name,
+                        num=len(values),
+                        ids=metric_sample_ids[metric_name]
+                    )
+                )
+
+        return aggregated_scores
+
+
+@register_aggregation(name='clipped_mean')
+class ClippedMean(Mean):
+
+    name = 'clipped_mean'
+
+    def __init__(self, clip_min: float = 0.0, clip_max: float = 1.0):
+        self.clip_min = clip_min
+        self.clip_max = clip_max
+
+    def agg_func(self, values: List[float]) -> float:
+        clipped_values = min(max(mean(values), self.clip_min), self.clip_max)
+        return clipped_values
+
+
+@register_aggregation(name='pass_at_k')
+class PassAtK(Aggregator):
+
+    def __init__(self, k: int = 1):
+        self.k = k
+        self.name = f'pass_at_{k}'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores by computing the pass@k for each metric using group_id.
+
+        Args:
+            scores: List of sample scores to aggregate
+
+        Returns:
+            List of aggregated scores with pass@k values
+        """
+        if not scores:
+            return []
+
+        # Group scores by metric name and group_id
+        metric_groups = defaultdict(lambda: defaultdict(list))
+
+        for score in scores:
+            group_id = getattr(score, 'group_id', score.sample_id)  # fallback to sample_id if no group_id
+
+            for metric_name, value in score.score.value.items():
+                metric_groups[metric_name][group_id].append(float(value))
+
+        # Calculate pass@k for each metric
+        aggregated_scores = []
+        for metric_name, groups in metric_groups.items():
+            if not groups:
+                continue
+
+            # Calculate pass@k for each group (problem)
+            num_samples = []
+            num_correct = []
+            all_sample_ids = []
+
+            for group_id, group_values in groups.items():
+                num_samples.append(len(group_values))
+                num_correct.append(sum(group_values))  # count how many passed in this group
+                all_sample_ids.extend([f'{group_id}_{i}' for i in range(len(group_values))])
+
+            if num_samples:
+                # Use the calculate_pass_at_k function from metrics
+                pass_at_k_values = calculate_pass_at_k(num_samples, num_correct, self.k)
+                overall_pass_at_k = float(np.mean(pass_at_k_values))
+
+                aggregated_scores.append(
+                    AggScore(
+                        score=overall_pass_at_k,
+                        metric_name=f'pass@{self.k}',
+                        aggregation_name='',
+                        num=len(scores),
+                        ids=all_sample_ids
+                    )
+                )
+
+        return aggregated_scores
+
+
+@register_aggregation(name='mean_and_pass_at_k')
+class MeanPassAtK(Aggregator):
+
+    def __init__(self):
+        self.name = 'mean_and_pass_at_k'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Add per-metric pass@k (computed via calculate_pass_at_k) to each sample, then mean-aggregate.
+
+        For each metric:
+        - Group scores by group_id
+        - Collect binary correctness values
+        - Infer k as (total samples / number of groups) assuming uniform repetitions
+        - Compute per-group pass@k via calculate_pass_at_k
+        - Annotate each sample with metric_pass@k for its group
+        Finally run Mean() over the augmented metric set.
+        """
+        if not scores:
+            return []
+
+        # Extract metric names present in score values
+        metrics = list(scores[0].score.value.keys())
+
+        for metric_name in metrics:
+            # group_id -> list[float] (0/1 correctness values)
+            group_values: Dict[str, List[float]] = defaultdict(list)
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                value = float(s.score.value[metric_name])
+                group_values[group_id].append(value)
+
+            if not group_values:
+                continue
+
+            # Infer k (assumes roughly uniform repeats)
+            k = int(len(scores) / len(group_values)) if len(group_values) > 0 else 1
+            if k <= 0:
+                k = 1
+
+            # Prepare inputs for calculate_pass_at_k
+            num_samples: List[int] = []
+            num_correct: List[int] = []
+            group_order: List[str] = []
+            for gid, vals in group_values.items():
+                group_order.append(gid)
+                num_samples.append(len(vals))
+                num_correct.append(int(sum(vals)))
+
+            # Compute per-group pass@k
+            pass_at_k_list = calculate_pass_at_k(num_samples, num_correct, k)
+            # Map back: group_id -> pass@k value
+            pass_at_k_map = {gid: float(v) for gid, v in zip(group_order, pass_at_k_list)}
+
+            # Annotate each sample with its group's pass@k
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                s.score.value[f'{metric_name}_pass@{k}'] = pass_at_k_map[group_id]
+
+        # Delegate mean aggregation over original + injected pass@k metrics
+        m = Mean()
+        return m(scores)
+
+
+@register_aggregation(name='mean_and_vote_at_k')
+class MeanVoteAtK(Aggregator):
+
+    def __init__(self):
+
+        self.name = 'mean_and_vote_at_k'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores by computing the vote@k for each metric using group_id.
+
+        Args:
+            scores: List of sample scores to aggregate
+
+        Returns:
+            List of aggregated scores with vote@k values
+        """
+        if not scores:
+            return []
+
+        metrics = list(scores[0].score.value.keys())
+
+        # Calculate vote@k for all metrics
+        for metric_name in metrics:
+
+            # Count of occurrences for each answer in each group_id
+            answer_groups = defaultdict(lambda: defaultdict(int))
+            # Score for each answer in each group_id
+            scores_groups = defaultdict(lambda: defaultdict(float))
+            # Score of the most frequently occurring answer
+            final_scores_groups = defaultdict(float)
+            # Count different answers for this metric
+            for score in scores:
+                group_id = getattr(score, 'group_id', score.sample_id)  # fallback to sample_id if no group_id
+                answer_prediction = getattr(score.score, 'extracted_prediction', None)
+                answer_groups[group_id][answer_prediction] += 1
+                scores_groups[group_id][answer_prediction] = score.score.value[metric_name]
+            # Calculate the repetition count k for each problem
+            k = int(len(scores) / len(answer_groups))
+
+            # Use the score of the most frequently occurring answer as the group's score
+            for group_id in answer_groups:
+                final_scores_groups[group_id] = scores_groups[group_id][
+                    max(answer_groups[group_id], key=answer_groups[group_id].get)]
+
+            # Add the corresponding vote@k for the metric to each score's value
+            for score in scores:
+                group_id = getattr(score, 'group_id', score.sample_id)
+                score.score.value.update({f'{metric_name}_vote@{k}': final_scores_groups[group_id]})
+
+        # Calculate the mean value for all metrics and their corresponding vote@k
+        m = Mean()
+        return m(scores)
+
+
+@register_aggregation(name='mean_and_pass_hat_k')
+class MeanPassHatK(Aggregator):
+
+    def __init__(self):
+        self.name = 'mean_and_pass_hat_k'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Add per-metric pass^k using calculate_pass_hat_k, then mean-aggregate.
+
+        For each metric:
+        - Group scores by group_id
+        - Collect binary correctness values
+        - Infer k as approximate repeats and clamp to min attempts across groups
+        - Compute per-group pass^k via calculate_pass_hat_k
+        - Annotate each sample with metric_pass^{k} for its group
+        Finally run Mean() over the augmented metric set.
+        """
+        if not scores:
+            return []
+
+        # Freeze metric names before augmenting values to avoid iterating injected keys
+        metrics = list(scores[0].score.value.keys())
+
+        for metric_name in metrics:
+            # group_id -> list[float] (0/1 correctness values)
+            group_values: Dict[str, List[float]] = defaultdict(list)
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                value = float(s.score.value[metric_name])
+                group_values[group_id].append(value)
+
+            if not group_values:
+                continue
+
+            # Infer repeats and clamp to the smallest group size to satisfy k <= n
+            approx_k = int(len(scores) / len(group_values)) if len(group_values) > 0 else 1
+            min_n = min(len(vals) for vals in group_values.values())
+            k = max(1, min(approx_k, min_n))
+
+            # Compute per-group pass^k
+            pass_hat_k_map: Dict[str, float] = {}
+            for gid, vals in group_values.items():
+                n = len(vals)
+                c = int(sum(vals))
+                # calculate_pass_hat_k requires k <= n; ensured by clamping above
+                pass_hat_k_map[gid] = float(calculate_pass_hat_k(n, c, k))
+
+            # Annotate each sample with its group's pass^k
+            suffix = f'pass^{k}'
+            injected_key = f'{metric_name}_{suffix}'
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                s.score.value[injected_key] = pass_hat_k_map[group_id]
+
+        # Mean aggregate over original + injected pass^k metrics
+        m = Mean()
+        return m(scores)