evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff compares two publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in that registry.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
--- /dev/null
+++ b/evalscope/api/benchmark/adapters/image_edit_adapter.py
@@ -0,0 +1,82 @@
+import os
+from typing import Optional
+
+from evalscope.constants import EvalType, FileConstants
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import thread_safe
+from evalscope.utils.io_utils import jsonl_to_list
+from .text2image_adapter import Text2ImageAdapter
+
+logger = get_logger()
+
+
+class ImageEditAdapter(Text2ImageAdapter):
+    """
+    Support two methods:
+    1. Inference using modelscope pipeline
+    2. Load local inference jsonl file with key to corresponding prompt
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.local_file = self.extra_params.get('local_file', None)
+        self.id_key = self.extra_params.get('id_key', FileConstants.ID)
+        self.image_key = self.extra_params.get('image_key', FileConstants.IMAGE_PATH)
+        self.local_data = self.load_local_file()
+
+    def load_local_file(self) -> Optional[dict]:
+        if not self.local_file:
+            return None
+
+        # Load file and check
+        data_list = jsonl_to_list(self.local_file)
+        data_dict = {}
+        for record in data_list:
+            if self.image_key not in record:
+                raise ValueError(f"Image key '{self.image_key}' not found in record: {record}, file {self.local_file}")
+            if self.id_key not in record:
+                raise ValueError(f"ID key '{self.id_key}' not found in record: {record}, file {self.local_file}")
+
+            image_path = record[self.image_key]
+            if not os.path.isabs(image_path):
+                image_path = os.path.join(os.path.dirname(self.local_file), image_path)
+            if not os.path.exists(image_path):
+                raise FileNotFoundError(f"Image file '{image_path}' not found.")
+
+            data_dict[record[self.id_key]] = record
+        return data_dict
+
+    def get_image_path_from_id(self, image_id) -> Optional[str]:
+        if not self.local_file:
+            return None
+
+        record = self.local_data.get(image_id)
+        if not record:
+            return None
+
+        return record[self.image_key]
+
+    def _post_process_samples(self):
+        super()._post_process_samples()
+
+        # Add local image path if exists
+        for subset in self.test_dataset.keys():
+            for sample in self.test_dataset[subset]:
+                local_image_path = self.get_image_path_from_id(sample.metadata.get(FileConstants.ID))
+                if local_image_path:
+                    sample.metadata[FileConstants.IMAGE_PATH] = local_image_path
+
+    def sample_filter(self, sample) -> bool:
+        """
+        Filter samples based on metadata availability.
+        If local file is not available, all samples are considered valid.
+        Otherwise, only samples with valid metadata and image path are kept.
+        """
+        if not self.local_data:
+            return True
+        else:
+            sample_id = sample.metadata.get(FileConstants.ID)
+            if (not sample_id) or (not self.get_image_path_from_id(sample_id)):
+                return False
+            return True
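The `local_file` option above lets `ImageEditAdapter` score pre-generated edits instead of running a modelscope pipeline. A minimal sketch of the JSONL layout the loader validates, assuming the default keys resolve to the literal strings `id` and `image_path` (the concrete values of `FileConstants.ID` and `FileConstants.IMAGE_PATH` are not shown in this diff); relative image paths are resolved against the JSONL file's directory:

```python
import json
import os

# Hypothetical records for a pre-generated image-edit run; each record needs
# the id key and the image key that ImageEditAdapter.load_local_file() checks.
records = [
    {"id": "edit_0001", "image_path": "images/edit_0001.png"},
    {"id": "edit_0002", "image_path": "/abs/path/edit_0002.png"},
]

local_file = "outputs/image_edit_results.jsonl"
os.makedirs(os.path.dirname(local_file), exist_ok=True)
with open(local_file, "w", encoding="utf-8") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")

# load_local_file() joins relative paths against the JSONL directory, e.g.
# outputs/images/edit_0001.png, and raises FileNotFoundError if a file is missing.
print(os.path.join(os.path.dirname(local_file), records[0]["image_path"]))
```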
--- /dev/null
+++ b/evalscope/api/benchmark/adapters/multi_choice_adapter.py
@@ -0,0 +1,86 @@
+from evalscope.api.dataset.dataset import Sample
+from evalscope.api.evaluator import Choices, Target, TaskState
+from evalscope.utils.multi_choices import (
+    FEW_SHOT_TEMPLATE,
+    MultipleChoiceTemplate,
+    format_example,
+    parse_answers,
+    parse_answers_zh,
+    prompt,
+    valid_template,
+)
+from .default_data_adapter import DefaultDataAdapter
+
+
+class MultiChoiceAdapter(DefaultDataAdapter):
+    """
+    Adapter for multi-choice benchmarks.
+    This adapter formats the input for multi-choice questions and handles few-shot examples.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.multiple_correct: bool = False
+        """Whether the benchmark allows multiple correct answers."""
+
+    def format_prompt_template(self, sample: Sample) -> str:
+        """
+        Format the basic prompt template with the sample data.
+
+        Args:
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The formatted prompt ready for model input
+        """
+        assert valid_template(self.prompt_template), 'Prompt template is not valid'
+
+        return prompt(
+            question=sample.input,
+            choices=Choices(sample.choices),
+            template=self.prompt_template,
+        )
+
+    def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+        """
+        Format the few-shot template with demonstrations and the main prompt.
+
+        Args:
+            fewshot (str): The formatted few-shot demonstration examples
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The complete formatted input with few-shot context
+        """
+
+        few_shot_prompt_template = self.few_shot_prompt_template or (FEW_SHOT_TEMPLATE + self.prompt_template)
+
+        assert valid_template(few_shot_prompt_template), 'Few-shot prompt template is not valid'
+
+        return prompt(
+            question=sample.input, choices=Choices(sample.choices), template=few_shot_prompt_template, fewshot=fewshot
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        """
+        Convert a sample to a few-shot formatted string.
+
+        Args:
+            sample (Sample): The sample object to format
+
+        Returns:
+            str: The formatted few-shot example string
+        """
+        return format_example(question=sample.input, choices=Choices(sample.choices), answer=Target(sample.target))
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        if self.prompt_template in [
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE
+        ]:
+            # For Chinese COT template, we use a different extraction method
+            answers = parse_answers_zh(task_state, multiple_correct=self.multiple_correct)
+        else:
+            answers = parse_answers(task_state, multiple_correct=self.multiple_correct)
+        return ''.join(sorted(list(answers)))
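The final `''.join(sorted(list(answers)))` step makes multi-answer extraction order-insensitive, so a prediction parsed as `{'C', 'A'}` and a target stored as `AC` compare equal. A small standalone sketch of just that normalization step (the `parse_answers*` helpers themselves live in `evalscope.utils.multi_choices` and are not reproduced here):

```python
def normalize_choices(answers: set) -> str:
    """Mirror of the adapter's final step: canonical, order-insensitive letters."""
    return ''.join(sorted(answers))

assert normalize_choices({'C', 'A'}) == 'AC'
assert normalize_choices({'B'}) == 'B'
# A single-answer benchmark (multiple_correct=False) simply yields one letter.
print(normalize_choices({'D', 'B', 'A'}))  # -> 'ABD'
```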
--- /dev/null
+++ b/evalscope/api/benchmark/adapters/ner_adapter.py
@@ -0,0 +1,212 @@
+from typing import Any, Dict, List, Set, Tuple
+
+from evalscope.api.dataset import Sample
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+from evalscope.utils.ner import (
+    DEFAULT_TAG_FIX_PATTERNS,
+    calculate_bio_metrics,
+    clean_prediction,
+    create_target_text,
+    extract_entities_from_text,
+    extract_spans_from_bio,
+    xml_to_bio_tags,
+)
+from .default_data_adapter import DefaultDataAdapter
+
+logger = get_logger()
+
+
+class NERAdapter(DefaultDataAdapter):
+    """
+    Base adapter class for Named Entity Recognition (NER) tasks.
+
+    This adapter handles converting between BIO tagging schemes and XML-style entity markup,
+    and provides evaluation metrics using seqeval.
+
+    Subclasses should define their entity types and register the benchmark.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Define mapping from BIO tags to user-friendly tag names
+        self.entity_type_map = {}
+        # Add descriptions for each entity type
+        self.entity_descriptions = {}
+
+        # These will be initialized in setup_entity_mappings
+        self.reverse_entity_map = {}
+        self.entity_list = []
+        self.entities_description = ''
+
+        # Define common error patterns to handle
+        self.tag_fix_patterns = DEFAULT_TAG_FIX_PATTERNS
+
+        check_import('seqeval', 'seqeval', raise_error=True, feature_name='NER metrics')
+        # Note: setup_entity_mappings() should be called by subclasses
+        # after they define their entity_type_map and entity_descriptions
+
+    def setup_entity_mappings(self):
+        """
+        Setup entity mappings and descriptions for prompt formatting.
+        This should be called after entity_type_map and entity_descriptions are defined.
+        """
+        # Reverse mapping for converting back from prediction to evaluation
+        self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+        # Create list of tags for prompt formatting
+        self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+        # Create description of entities for prompt
+        self.entities_description = ', '.join([
+            f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+        ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a record with tokens and NER tags into a Sample.
+        Creates both the raw text input and annotated text target.
+        """
+        tokens: List[str] = record['tokens']
+        ner_tags: List[str] = record['ner_tags']
+
+        # Create the input text by joining tokens
+        input_text = ' '.join(tokens)
+
+        # Process tokens and tags to create annotated target text
+        target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+        # Store tokens and tags in metadata for evaluation
+        metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+        return Sample(input=input_text, target=target_text, metadata=metadata)
+
+    def format_prompt_template(self, sample):
+        """
+        Format the prompt with entity types, available tags, and text to annotate.
+        """
+        return self.prompt_template.format(
+            entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+        )
+
+    def format_fewshot_template(self, fewshot, sample):
+        """
+        Format the few-shot prompt with all required parameters.
+        """
+        return self.few_shot_prompt_template.format(
+            fewshot=fewshot,
+            entities=self.entities_description,
+            entity_list=', '.join(self.entity_list),
+            text=sample.input
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        """
+        Format a sample as a few-shot example showing original and annotated text.
+        """
+        if not sample.metadata:
+            return ''
+
+        # Format few-shot examples to match the expected response format
+        return f'Input:\n{sample.input}\n\nOutput:\n{sample.target}'
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        """
+        Evaluate named entity recognition performance using seqeval.
+        """
+        from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        try:
+            # Get the original tokens and tags from the reference metadata
+            original_tokens = task_state.metadata['tokens']
+            original_tags = task_state.metadata['ner_tags']
+
+            if not original_tokens or len(original_tokens) == 0:
+                if hasattr(reference, 'metadata') and reference.metadata:
+                    original_tokens = reference.metadata['tokens']
+                    original_tags = reference.metadata['ner_tags']
+
+            # Clean and normalize the prediction
+            cleaned_prediction = clean_prediction(filtered_prediction, self.tag_fix_patterns)
+
+            # Convert XML-style prediction back to BIO tags aligned with original tokens
+            pred_bio_tags = xml_to_bio_tags(cleaned_prediction, original_tokens, self.reverse_entity_map)
+
+            # Use seqeval to calculate metrics
+            # Note: seqeval expects lists of lists (one per sequence)
+            y_true = [original_tags]
+            y_pred = [pred_bio_tags]
+
+            precision = precision_score(y_true, y_pred)
+            recall = recall_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred)
+            accuracy = accuracy_score(y_true, y_pred)
+
+            score.value = {'precision': precision, 'recall': recall, 'f1_score': f1, 'accuracy': accuracy}
+
+            # Store tags for aggregation (proper micro-averaging in aggregate_scores)
+            # This way aggregate_scores can compute metrics across all samples at once,
+            # which gives you true micro-averaged scores rather than averaged macro scores.
+            score.metadata = {'y_true': original_tags, 'y_pred': pred_bio_tags}
+        except Exception as e:
+            logger.warning(f'Error evaluating NER prediction: {str(e)}')
+            score.value = {'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0, 'accuracy': 0.0}
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate metrics across all samples using seqeval.
+        """
+        from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+
+        # Collect all predictions and references
+        y_true_all = []
+        y_pred_all = []
+
+        for ss in sample_scores:
+            # Extract the BIO tags from metadata if available
+            # You may need to store these during match_score
+            if hasattr(ss.score, 'metadata') and 'y_true' in ss.score.metadata and 'y_pred' in ss.score.metadata:
+                y_true_all.append(ss.score.metadata['y_true'])
+                y_pred_all.append(ss.score.metadata['y_pred'])
+
+        if not y_true_all:
+            # Fallback: calculate averages from individual scores
+            num_samples = len(sample_scores)
+            avg_precision = sum(ss.score.value.get('precision', 0.0) for ss in sample_scores) / num_samples
+            avg_recall = sum(ss.score.value.get('recall', 0.0) for ss in sample_scores) / num_samples
+            avg_f1 = sum(ss.score.value.get('f1_score', 0.0) for ss in sample_scores) / num_samples
+            avg_accuracy = sum(ss.score.value.get('accuracy', 0.0) for ss in sample_scores) / num_samples
+        else:
+            # Use seqeval for micro-averaged metrics across all samples
+            avg_precision = precision_score(y_true_all, y_pred_all)
+            avg_recall = recall_score(y_true_all, y_pred_all)
+            avg_f1 = f1_score(y_true_all, y_pred_all)
+            avg_accuracy = accuracy_score(y_true_all, y_pred_all)
+
+        num_samples = len(sample_scores)
+
+        agg_scores = [
+            AggScore(
+                metric_name='precision',
+                score=avg_precision,
+                num=num_samples,
+                metadata={'type': 'seqeval-micro-average'}
+            ),
+            AggScore(
+                metric_name='recall', score=avg_recall, num=num_samples, metadata={'type': 'seqeval-micro-average'}
+            ),
+            AggScore(metric_name='f1_score', score=avg_f1, num=num_samples, metadata={'type': 'seqeval-micro-average'}),
+            AggScore(
+                metric_name='accuracy', score=avg_accuracy, num=num_samples, metadata={'type': 'seqeval-accuracy'}
+            )
+        ]
+
+        return agg_scores
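The reason `aggregate_scores` pools `y_true`/`y_pred` before calling seqeval is that entity-level micro-averaging weights every entity equally, whereas averaging per-sample F1 weights every sentence equally. A minimal sketch using the same seqeval calls as the adapter, on made-up BIO sequences:

```python
from seqeval.metrics import f1_score, precision_score, recall_score

# Two toy sentences: the PER entity is found, the LOC entity is missed entirely.
y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O', 'O']]
y_pred = [['B-PER', 'I-PER', 'O'], ['O', 'O', 'O']]

# Micro-averaged over all entities: 1 of 1 predicted entities is correct,
# 1 of 2 gold entities is recovered.
print(precision_score(y_true, y_pred))  # 1.0
print(recall_score(y_true, y_pred))     # 0.5
print(f1_score(y_true, y_pred))         # ~0.67

# Averaging the per-sentence F1 values instead (1.0 and 0.0) would report 0.5,
# which is what the fallback branch above effectively does when tags are missing.
```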
--- /dev/null
+++ b/evalscope/api/benchmark/adapters/text2image_adapter.py
@@ -0,0 +1,157 @@
+import base64
+import os
+
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.messages.content import ContentImage
+from evalscope.api.metric import Score
+from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput
+from evalscope.api.registry import get_metric
+from evalscope.constants import EvalType, FileConstants
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import thread_safe
+from .default_data_adapter import DefaultDataAdapter
+
+logger = get_logger()
+
+
+class Text2ImageAdapter(DefaultDataAdapter):
+    """Text to Image Adapter for benchmarks."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.add_aggregation_name = False  # Do not add aggregation name in the report by default
+
+    def load_from_disk(self, **kwargs):
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        """Convert a record dictionary to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'prompt': record['prompt'],
+                'category': record.get('category', ''),
+                'tags': record.get('tags', []),
+                FileConstants.ID: record[FileConstants.ID],
+                FileConstants.IMAGE_PATH: record.get(FileConstants.IMAGE_PATH,
+                                                     ''),  # Optional field for existing image path
+            }
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        """
+        Hook method called during the actual inference process.
+
+        This method executes the model inference and can be overridden
+        to implement custom inference logic or model interaction patterns.
+
+        Args:
+            model (Model): The model to use for inference
+            sample (Sample): The sample to process
+
+        Returns:
+            ModelOutput: The raw output from the model
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return ModelOutput(
+                model=model.name,
+                choices=[ChatCompletionChoice.from_content('')],
+            )
+        else:
+            # Execute model inference with the processed input and any tools
+            model_output = model.generate(input=sample.input, tools=sample.tools)
+            return model_output
+
+    def _on_inference_end(
+        self, model: Model, sample: Sample, model_output: ModelOutput, output_dir: str, **kwargs
+    ) -> TaskState:
+        """
+        Hook method called after inference completes. Save generated images to output_dir.
+
+        Args:
+            model (Model): The model that performed inference
+            sample (Sample): The processed sample
+            model_output (ModelOutput): The raw model output
+            output_dir (str): The directory where the model output was saved
+
+        Returns:
+            TaskState: Complete state object for the inference task
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+        else:
+            image_id = f'{sample.metadata.get(FileConstants.ID, sample.id)}_{sample.group_id}'
+            output_path = os.path.join(output_dir, 'images', f'{image_id}.png')
+            if not os.path.exists(os.path.dirname(output_path)):
+                os.makedirs(os.path.dirname(output_path))
+            # get base64 image from model_output
+            content = model_output.message.content[0]
+
+            assert isinstance(content, ContentImage), 'Expected ContentImage in model output'
+
+            image_base64 = content.image
+            with open(output_path, 'wb') as f:
+                f.write(base64.b64decode(image_base64))
+
+            sample.metadata[FileConstants.IMAGE_PATH] = output_path
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+
+    # NOTE: thread safe is needed, since we can't batch inference here.
+    @thread_safe
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        # Get prediction and prompt from task state
+        image_path = task_state.metadata.get(FileConstants.IMAGE_PATH, original_prediction)
+        prompt = task_state.input[0].content
+        meta = task_state.metadata
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=image_path,
+            prediction=image_path,
+        )
+
+        # Calculate scores for each configured metric
+        for metric in self.metric_list:
+            try:
+                if isinstance(metric, str):
+                    metric_name = metric
+                    metric_scorer = get_metric(metric)  # Get metric implementation from registry
+                    metric_func = metric_scorer()  # Instantiate the metric scorer
+                elif isinstance(metric, dict):
+                    metric_name = list(metric.keys())[0]
+                    metric_cls = get_metric(metric_name)
+                    metric_func = metric_cls(**metric[metric_name])  # Initialize with parameters
+                metric_score = metric_func(image_path, prompt)[0]
+
+                # fine-granular metrics
+                category = meta.get('category')
+                if category:
+                    metric_name = f'{metric_name}_{category}'
+                if isinstance(metric_score, dict):
+                    for k, v in metric_score.items():
+                        score.value[f'{metric_name}_{k}'] = v.cpu().item()
+                else:
+                    score.value[metric_name] = metric_score.cpu().item()
+            except Exception as e:
+                logger.error(f'Error calculating metric {metric}: {e}')
+                score.value[metric_name] = 0
+                score.metadata[metric_name] = f'error: {str(e)}'
+
+        return score
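`_on_inference_end` expects the first content item of the model message to be a `ContentImage` carrying a base64-encoded image, which it decodes straight to `<output_dir>/images/<id>.png`. A stripped-down sketch of that save step, with placeholder bytes and hypothetical path names standing in for a real generated image:

```python
import base64
import os

# Placeholder payload; in the adapter this comes from ContentImage.image.
image_base64 = base64.b64encode(b'\x89PNG...fake image bytes...').decode('ascii')

output_dir = 'outputs/run_0'          # hypothetical output directory
image_id = 'sample_42_0'              # '<metadata id>_<group_id>' as built above
output_path = os.path.join(output_dir, 'images', f'{image_id}.png')

os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as f:
    f.write(base64.b64decode(image_base64))

# The path is then written back into sample.metadata so match_score() can hand it
# to image metrics such as those under evalscope/metrics/t2v_metrics.
print(output_path)
```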
--- /dev/null
+++ b/evalscope/api/benchmark/adapters/vision_language_adapter.py
@@ -0,0 +1,8 @@
+from .default_data_adapter import DefaultDataAdapter
+
+
+class VisionLanguageAdapter(DefaultDataAdapter):
+    """Adapter for vision-language benchmarks. e.g., image captioning, visual question answering, etc."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)