evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/backend/rag_eval/utils/embedding.py
CHANGED

@@ -1,14 +1,18 @@
 import os
 import torch
 from langchain_core.embeddings import Embeddings
+from langchain_openai.embeddings import OpenAIEmbeddings
+from mteb.encoder_interface import PromptType
 from sentence_transformers import models
 from sentence_transformers.cross_encoder import CrossEncoder
 from sentence_transformers.SentenceTransformer import SentenceTransformer
 from torch import Tensor
+from tqdm import tqdm
 from typing import Dict, List, Optional, Union

 from evalscope.backend.rag_eval.utils.tools import download_model
 from evalscope.constants import HubType
+from evalscope.utils.argument_utils import get_supported_params
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -18,16 +22,16 @@ class BaseModel(Embeddings):

     def __init__(
         self,
-        model_name_or_path: str,
+        model_name_or_path: str = '',
         max_seq_length: int = 512,
-        prompt: str =
-
+        prompt: Optional[str] = None,
+        prompts: Optional[Dict[str, str]] = None,
+        revision: Optional[str] = 'master',
         **kwargs,
     ):
         self.model_name_or_path = model_name_or_path
         self.max_seq_length = max_seq_length
         self.model_kwargs = kwargs.pop('model_kwargs', {})
-        self.model_kwargs['trust_remote_code'] = True

         self.config_kwargs = kwargs.pop('config_kwargs', {})
         self.config_kwargs['trust_remote_code'] = True
@@ -36,7 +40,9 @@ class BaseModel(Embeddings):
         self.encode_kwargs['convert_to_tensor'] = True

         self.prompt = prompt
+        self.prompts = prompts if prompts else {}
         self.revision = revision
+        self.framework = ['PyTorch']

     @property
     def mteb_model_meta(self):
@@ -44,10 +50,22 @@ class BaseModel(Embeddings):
         from mteb import ModelMeta

         return ModelMeta(
-            name=os.path.basename(self.model_name_or_path),
+            name='eval/' + os.path.basename(self.model_name_or_path),  # Ensure the name contains a slash
             revision=self.revision,
             languages=None,
             release_date=None,
+            n_parameters=None,
+            memory_usage_mb=None,
+            max_tokens=None,
+            embed_dim=None,
+            license=None,
+            open_weights=None,
+            public_training_code=None,
+            public_training_data=None,
+            similarity_fn_name=None,
+            use_instructions=None,
+            training_datasets=None,
+            framework=self.framework,
         )

     def embed_documents(self, texts: List[str]) -> List[List[float]]:
@@ -59,7 +77,7 @@ class BaseModel(Embeddings):
         Returns:
             List of embeddings.
         """
-        return self.
+        return self.encode(texts).tolist()

     def embed_query(self, text: str) -> List[float]:
         """Embed query text. Compact langchain.
@@ -70,19 +88,17 @@ class BaseModel(Embeddings):
         Returns:
             Embedding.
         """
-        return self.
+        return self.encode(text).tolist()

     def encode(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:
         """Embed text."""
         raise NotImplementedError

-    def
-    """
-
-
-
-    """Embed search docs . Compact mteb."""
-        raise NotImplementedError
+    def get_prompt(self, task_name: str) -> Optional[str]:
+        """Get prompt for the given task name."""
+        if self.prompt:
+            return self.prompt
+        return self.prompts.get(task_name, None)


 class SentenceTransformerModel(BaseModel):
@@ -90,6 +106,9 @@ class SentenceTransformerModel(BaseModel):
     def __init__(self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs):
         super().__init__(model_name_or_path, **kwargs)

+        self.framework = ['Sentence Transformers', 'PyTorch']
+
+        self.model_kwargs['trust_remote_code'] = True
         if not pooling_mode:
             self.model = SentenceTransformer(
                 self.model_name_or_path,
@@ -110,43 +129,59 @@ class SentenceTransformerModel(BaseModel):

         self.model.max_seq_length = self.max_seq_length

-
-
+        self.supported_encode_params = get_supported_params(self.model.encode)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> List[torch.Tensor]:
+        # pop unused kwargs
+        extra_params = {}
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                extra_params[key] = kwargs.pop(key)
         self.encode_kwargs.update(kwargs)

+        # set prompt if provided
+        prompt = None
+        prompt_type = extra_params.pop('prompt_type', '')
+        task_name = extra_params.pop('task_name', '')
+        if prompt_type and prompt_type == PromptType.query:
+            prompt = self.get_prompt(task_name)
+
         embeddings = self.model.encode(texts, prompt=prompt, **self.encode_kwargs)
         assert isinstance(embeddings, Tensor)
         return embeddings.cpu().detach()

-    def encode_queries(self, queries, **kwargs):
-        return self.encode(queries, prompt=self.prompt)
-
-    def encode_corpus(self, corpus, **kwargs):
-        if isinstance(corpus[0], dict):
-            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
-        else:
-            input_texts = corpus
-        return self.encode(input_texts)
-

 class CrossEncoderModel(BaseModel):

     def __init__(self, model_name_or_path: str, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
+
+        self.framework = ['Sentence Transformers', 'PyTorch']
+
         self.model = CrossEncoder(
             self.model_name_or_path,
             trust_remote_code=True,
             max_length=self.max_seq_length,
+            automodel_args=self.model_kwargs,
         )
-
-
+        self.tokenizer = self.model.tokenizer
+        # set pad token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        if ('pad_token_id' not in self.model.config) or (self.model.config.pad_token_id is None):
+            self.model.config.update({'pad_token_id': self.tokenizer.eos_token_id})
+
+        self.supported_encode_params = get_supported_params(self.model.predict)
+
+    def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                kwargs.pop(key)
         self.encode_kwargs.update(kwargs)

-        if len(sentences[0]) ==
+        if len(sentences[0]) == 2:  # Note: For mteb retrieval task
             processed_sentences = []
-            for query, docs
-                if isinstance(docs, dict):
-                    docs = docs['text']
+            for query, docs in sentences:
                 processed_sentences.append((self.prompt + query, docs))
             sentences = processed_sentences
         embeddings = self.model.predict(sentences, **self.encode_kwargs)
@@ -154,6 +189,60 @@ class CrossEncoderModel(BaseModel):
         return embeddings


+class APIEmbeddingModel(BaseModel):
+
+    def __init__(self, **kwargs):
+        self.model_name = kwargs.get('model_name')
+        self.openai_api_base = kwargs.get('api_base')
+        self.openai_api_key = kwargs.get('api_key')
+        self.dimensions = kwargs.get('dimensions')
+        self.check_embedding_ctx_length = kwargs.get('check_embedding_ctx_length', False)
+        self.framework = ['API']
+
+        self.model = OpenAIEmbeddings(
+            model=self.model_name,
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+            dimensions=self.dimensions,
+            check_embedding_ctx_length=self.check_embedding_ctx_length,
+        )
+
+        super().__init__(model_name_or_path=self.model_name, **kwargs)
+
+        self.batch_size = self.encode_kwargs.get('batch_size', 10)
+
+        self.supported_encode_params = get_supported_params(self.model.embed_documents)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+        # pop unused kwargs
+        extra_params = {}
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                extra_params[key] = kwargs.pop(key)
+        self.encode_kwargs.update(kwargs)
+
+        # set prompt if provided
+        prompt = None
+        prompt_type = extra_params.pop('prompt_type', '')
+        task_name = extra_params.pop('task_name', '')
+        if prompt_type and prompt_type == PromptType.query:
+            prompt = self.get_prompt(task_name)
+
+        if isinstance(texts, str):
+            texts = [texts]
+
+        embeddings: List[List[float]] = []
+        for i in tqdm(range(0, len(texts), self.batch_size)):
+            # set prompt if provided
+            if prompt is not None:
+                batch_texts = [prompt + text for text in texts[i:i + self.batch_size]]
+            else:
+                batch_texts = texts[i:i + self.batch_size]
+            response = self.model.embed_documents(batch_texts, chunk_size=self.batch_size)
+            embeddings.extend(response)
+        return torch.tensor(embeddings)
+
+
 class EmbeddingModel:
     """Custom embeddings"""

@@ -165,6 +254,10 @@ class EmbeddingModel:
         revision: Optional[str] = 'master',
         **kwargs,
     ):
+        if kwargs.get('model_name'):
+            # If model_name is provided, use OpenAIEmbeddings
+            return APIEmbeddingModel(**kwargs)
+
         # If model path does not exist and hub is 'modelscope', download the model
         if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
             model_name_or_path = download_model(model_name_or_path, revision)
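
The headline change in this file is the new APIEmbeddingModel, which routes embedding calls through an OpenAI-compatible endpoint instead of a local checkpoint. The sketch below is not part of the diff; it only assumes the constructor kwargs shown above, and the endpoint, key, and model name are placeholders.

```python
# A minimal sketch, assuming APIEmbeddingModel keeps the kwargs shown in the diff.
from evalscope.backend.rag_eval.utils.embedding import APIEmbeddingModel

embedder = APIEmbeddingModel(
    model_name='text-embedding-v3',        # hypothetical embedding model
    api_base='http://localhost:8000/v1',   # hypothetical OpenAI-compatible endpoint
    api_key='EMPTY',
)

# embed_documents/embed_query delegate to encode(), which batches requests
# (batch_size defaults to 10) through langchain's OpenAIEmbeddings client.
vectors = embedder.embed_documents(['hello world', 'long-context retrieval'])
print(len(vectors), len(vectors[0]))
```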
evalscope/backend/rag_eval/utils/llm.py
CHANGED

@@ -2,11 +2,10 @@ import os
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
 from langchain_openai import ChatOpenAI
-from modelscope.utils.hf_util import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional

-from evalscope.
-from evalscope.
+from evalscope.api.model import GenerateConfig, Model, get_model
+from evalscope.constants import DEFAULT_MODEL_REVISION, EvalType


 class LLM:
@@ -16,9 +15,9 @@ class LLM:
         api_base = kw.get('api_base', None)
         if api_base:
             return ChatOpenAI(
-
-
-
+                model=kw.get('model_name', ''),
+                base_url=api_base,
+                api_key=kw.get('api_key', 'EMPTY'),
             )
         else:
             return LocalLLM(**kw)
@@ -30,17 +29,19 @@ class LocalLLM(BaseLLM):
     model_name_or_path: str
     model_revision: str = DEFAULT_MODEL_REVISION
     template_type: Optional[str] = None
-    model_name: Optional[str]
-    model: Optional[
-    generation_config: Optional[Dict]
+    model_name: Optional[str] = None
+    model: Optional[Model] = None
+    generation_config: Optional[Dict] = {}

     def __init__(self, **kw):
         super().__init__(**kw)
         self.model_name = os.path.basename(self.model_name_or_path)
-
-
-
-
+
+        # Create and initialize the local model
+        self.model = get_model(
+            model=self.model_name_or_path,
+            eval_type=EvalType.CHECKPOINT,
+            config=GenerateConfig(**self.generation_config),
         )

     def _call(
@@ -51,10 +52,9 @@ class LocalLLM(BaseLLM):
         **kwargs: Any,
     ) -> str:
         """Run the LLM on the given input."""
-        infer_cfg = {'stop': stop}

-        response = self.model.
-        return response
+        response = self.model.generate(input=prompt)
+        return response.completion

     @property
     def _identifying_params(self) -> Dict[str, Any]:
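
The RAG-eval LLM wrapper now builds its API-backed path directly from keyword arguments and its local path through evalscope.api.model.get_model. Below is a hedged sketch of the API path only, using langchain's ChatOpenAI the same way the diff wires it up; the served model name and endpoint are placeholders.

```python
# A hedged sketch of the API-backed path: ChatOpenAI built from
# model_name / api_base / api_key keyword arguments.
from langchain_openai import ChatOpenAI

chat = ChatOpenAI(
    model='qwen2.5-7b-instruct',           # hypothetical served model
    base_url='http://localhost:8801/v1',   # hypothetical OpenAI-compatible endpoint
    api_key='EMPTY',
)
print(chat.invoke('Say hello in one word.').content)
```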
evalscope/backend/vlm_eval_kit/backend_manager.py
CHANGED

@@ -1,10 +1,12 @@
 import copy
+import os
 import subprocess
 from functools import partial
 from typing import Optional, Union

 from evalscope.backend.base import BackendManager
-from evalscope.utils import
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -66,8 +68,11 @@ class VLMEvalKitBackendManager(BackendManager):
             del remain_cfg['name']  # remove not used args
             del remain_cfg['type']  # remove not used args

-
-
+            norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+            model_cfg['type'] = norm_model_type
+
+            self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
+            new_model_names.append(norm_model_type)
         else:
             remain_cfg = copy.deepcopy(model_cfg)
             del remain_cfg['name']  # remove not used args
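
The added lines normalize a served model identifier into a key that is safe to register with VLMEvalKit. A small standalone illustration of that normalization (the example model name is hypothetical):

```python
import os

def norm_model_type(model_type: str) -> str:
    # Strip any org prefix, then replace ':' and '.' which are awkward in registry keys.
    return os.path.basename(model_type).replace(':', '-').replace('.', '_')

print(norm_model_type('Qwen/Qwen2.5-VL-7B-Instruct'))  # -> Qwen2_5-VL-7B-Instruct
```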
evalscope/benchmarks/__init__.py
CHANGED

@@ -2,16 +2,17 @@
 import glob
 import importlib
 import os
+import time

-from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
-from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.utils import get_logger

 logger = get_logger()

 # Using glob to find all files matching the pattern
-pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
-files = glob.glob(pattern, recursive=
+pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
+files = glob.glob(pattern, recursive=True)
+
+import_times = []

 for file_path in files:
     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
@@ -19,5 +20,16 @@ for file_path in files:
         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
         full_path = f'evalscope.benchmarks.{module_path}'
+
+        start_time = time.perf_counter()
         importlib.import_module(full_path)
-
+        end_time = time.perf_counter()
+
+        import_times.append((full_path, end_time - start_time))
+
+# Sort by import time in descending order
+import_times.sort(key=lambda x: x[1], reverse=True)
+
+# Log the sorted import times
+for module, duration in import_times:
+    logger.debug(f'Module {module} imported in {duration:.6f} seconds')
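
The adapter discovery loop now searches one extra directory level, so nested adapters such as bfcl/v3/bfcl_v3_adapter.py are found, and it records how long each benchmark module takes to import. A standalone sketch of the same pattern, not taken from the package:

```python
import glob
import importlib
import os
import time

def discover_and_time(pkg_dir: str, pkg_name: str):
    """Import every *_adapter.py under pkg_dir (recursively) and time each import."""
    pattern = os.path.join(pkg_dir, '*', '**', '*_adapter.py')
    timings = []
    for file_path in glob.glob(pattern, recursive=True):
        if os.path.basename(file_path).startswith('_'):
            continue
        rel = os.path.relpath(file_path, pkg_dir)
        module = f"{pkg_name}.{rel[:-3].replace(os.path.sep, '.')}"
        start = time.perf_counter()
        importlib.import_module(module)
        timings.append((module, time.perf_counter() - start))
    # Slowest imports first, mirroring the debug logging added in the diff.
    return sorted(timings, key=lambda x: x[1], reverse=True)
```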
File without changes
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
# flake8: noqa: E501
|
|
3
|
+
import re
|
|
4
|
+
import urllib.request
|
|
5
|
+
import zipfile
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
|
|
9
|
+
from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
|
|
10
|
+
from evalscope.api.dataset import Sample
|
|
11
|
+
from evalscope.api.evaluator import TaskState
|
|
12
|
+
from evalscope.api.messages import ChatMessageUser
|
|
13
|
+
from evalscope.api.metric import Score
|
|
14
|
+
from evalscope.api.registry import register_benchmark
|
|
15
|
+
from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# Default judge prompt template
+JUDGE_PROMPT = """Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.
+
+The question, for reference only: {question}
+The OFFICIAL ANSWER: {correct_answer}
+CANDIDATE ANSWER TO ASSESS: {response}
+
+Reply only with CORRECT or INCORRECT."""
+
+PROMPT_TEMPLATE = """
+BEGIN INPUT DOCUMENTS
+
+{documents_text}
+
+END INPUT DOCUMENTS
+
+Answer the following question using the input documents provided above.
+
+START QUESTION
+
+{question}
+
+END QUESTION
+"""
+
+# New constants for auto-download
+DOWNLOAD_URL: str = (
+    'https://modelscope.cn/datasets/evalscope/AA-LCR/resolve/master/extracted_text/AA-LCR_extracted-text.zip'
+)
+DEFAULT_CACHE_SUBDIR: str = 'aa_lcr'
+DEFAULT_ZIP_NAME: str = 'AA-LCR_extracted-text.zip'
+DEFAULT_EXTRACTED_DIR_NAME: str = 'lcr'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='aa_lcr',
+        pretty_name='AA-LCR',
+        tags=[Tags.KNOWLEDGE, Tags.REASONING, Tags.LONG_CONTEXT],
+        description='AA-LCR (Artificial Analysis Long Context Retrieval) is a benchmark for evaluating long-context '
+        'retrieval and reasoning capabilities of language models across multiple documents.',  # noqa: E501
+        dataset_id='evalscope/AA-LCR',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+        extra_params={'text_dir': None}
+    )
+)
+class AALCRAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True
+
+        # Get extra parameters
+        self.text_dir = self.extra_params.get('text_dir')
+
+    def load(self):
+        # Auto download and extract when text_dir is not provided
+        if not self.text_dir:
+            self.text_dir = self._ensure_text_dir_downloaded()
+        elif not Path(self.text_dir).exists():
+            raise ValueError(
+                'AA-LCR text_dir does not exist: '
+                f'{self.text_dir}. Please provide a valid directory or omit text_dir to auto-download.'
+            )
+
+        self.text_dir = Path(self.text_dir)
+        return super().load()
+
+    def _ensure_text_dir_downloaded(self) -> Path:
+        """Ensure AA-LCR extracted texts are available locally; download and extract if missing."""
+        cache_root = Path(DEFAULT_EVALSCOPE_CACHE_DIR) / DEFAULT_CACHE_SUBDIR
+        extracted_dir = cache_root / DEFAULT_EXTRACTED_DIR_NAME
+
+        if extracted_dir.exists():
+            logger.info(f'AA-LCR documents found: {extracted_dir}')
+            return extracted_dir
+
+        cache_root.mkdir(parents=True, exist_ok=True)
+        zip_path = cache_root / DEFAULT_ZIP_NAME
+
+        try:
+            logger.info(f'Downloading AA-LCR documents from {DOWNLOAD_URL} to {zip_path}...')
+            urllib.request.urlretrieve(DOWNLOAD_URL, zip_path)
+
+            logger.info(f'Extracting {zip_path} to {cache_root}...')
+            with zipfile.ZipFile(zip_path, 'r') as zf:
+                zf.extractall(cache_root)
+
+            if not extracted_dir.exists():
+                raise ValueError(f'Extraction succeeded but target directory not found: {extracted_dir}')
+
+            logger.info(f'AA-LCR documents ready at {extracted_dir}')
+            return extracted_dir
+        except Exception as e:
+            raise ValueError(
+                f'Failed to download or extract AA-LCR documents: {e}. '
+                'You can also manually download and set extra_params["text_dir"].'
+            ) from e
+        finally:
+            # Best-effort cleanup of the zip file
+            try:
+                if zip_path.exists():
+                    zip_path.unlink()
+            except Exception:
+                pass
+
+    def _get_context(self, record: Dict[str, Any]) -> str:
+        doc_folder = self.text_dir / record['document_category'] / record['document_set_id']
+
+        # Check if the document folder exists
+        if not doc_folder.exists() or not doc_folder.is_dir():
+            logger.warning(f'Document folder not found: {doc_folder}. Returning empty context.')
+            return ''
+
+        doc_blocks = []
+        try:
+            for file_path in doc_folder.iterdir():
+                if file_path.is_file():
+                    try:
+                        content = file_path.read_text(encoding='utf-8').strip()
+                        if content:
+                            doc_blocks.append(content)
+                    except (IOError, UnicodeDecodeError) as e:
+                        logger.warning(f'Could not read file {file_path}, skipping: {e}')
+        except OSError as e:
+            logger.warning(f'Could not access document folder {doc_folder}: {e}')
+            return f"ERROR: Could not read documents for {record['document_category']}/{record['document_set_id']}"
+
+        documents_text = '\n\n'.join(
+            f'BEGIN DOCUMENT {i + 1}:\n{doc}\nEND DOCUMENT {i + 1}' for i, doc in enumerate(doc_blocks)
+        )
+        return documents_text
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a record to a Sample with long-context prompt."""
+        context = self._get_context(record)
+        prompt = self.prompt_template.format(documents_text=context, question=record['question'])
+
+        return Sample(
+            input=[ChatMessageUser(content=prompt)],
+            target=record['answer'],
+            metadata={
+                'question': record['question'],
+                'data_source_urls': record['data_source_urls'],
+                'input_tokens': record.get('input_tokens', 0),
+            }
+        )
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        judge_prompt = JUDGE_PROMPT.format(
+            question=task_state.metadata['question'], correct_answer=reference, response=filtered_prediction
+        )
+
+        # Request judge and obtain score
+        judge_response = self.llm_judge.judge(prompt=judge_prompt)
+
+        # Parse judge response to get accuracy score
+        # Use word boundaries to avoid matching "CORRECT" within "INCORRECT"
+        is_correct = bool(re.search(r'\bCORRECT\b', judge_response, re.IGNORECASE))
+        score.value = {
+            'acc': 1.0 if is_correct else 0.0,
+        }
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id,
+        }
+        score.main_score_name = 'acc'
+        return score
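A minimal usage sketch for the new aa_lcr benchmark (not part of the diff): it assumes evalscope's documented run_task/TaskConfig entry points and assumes that per-benchmark extra_params are forwarded through dataset_args as for other adapters; exact field names may differ in this release. The model id is hypothetical.

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-72b-instruct',  # hypothetical model id served behind your API
    datasets=['aa_lcr'],
    dataset_args={
        'aa_lcr': {
            # Leave 'text_dir' as None to trigger the auto-download path above,
            # or point it at a manually extracted copy of the AA-LCR documents.
            'extra_params': {'text_dir': None},
        }
    },
    limit=5,  # small smoke test; drop this to evaluate the full test split
)
run_task(task_cfg=task_cfg)

Because the adapter enables the LLM judge, a judge model must also be configured through evalscope's judge settings for the acc metric to be produced.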
File without changes
@@ -0,0 +1,54 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+logger = get_logger()
+
+SUBSET_LIST = ['default']
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ai2d',
+        pretty_name='AI2D',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'AI2D is a benchmark dataset for researching the understanding of diagrams by AI. It contains over 5,000 diverse diagrams from science textbooks (e.g., the water cycle, food webs). Each diagram is accompanied by multiple-choice questions that test an AI\'s ability to interpret visual elements, text labels, and their relationships. The benchmark is challenging because it requires jointly understanding the layout, symbols, and text to answer questions correctly.',  # noqa: E501
+        dataset_id='lmms-lab/ai2d',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class Ai2dAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        answers_list: list[str] = record['options']
+        input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
+        content_list: list[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+
+        label_answer = chr(int(record['answer']) + ord('A'))
+
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=answers_list, target=label_answer)
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
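A small, self-contained illustration (not part of the diff) of the ai2d adapter's target construction: the dataset stores the gold answer as a 0-based index into the options list, and record_to_sample converts that index into a choice letter for multiple-choice scoring. The record below is hypothetical.

record = {
    'question': 'Which stage comes after the larva?',
    'options': ['egg', 'larva', 'pupa', 'adult'],
    'answer': '2',  # 0-based index into options, stored as a string
}
label_answer = chr(int(record['answer']) + ord('A'))
assert label_answer == 'C'  # index 2 maps to letter 'C', i.e. the third option 'pupa'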
File without changes