evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
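Among the additions above, this release introduces a `general_arena` benchmark (the new `evalscope/benchmarks/general_arena/` files shown in the hunks below), which replays the saved reviews of previously evaluated models against a chosen baseline, judges each pair with an LLM, and reports Elo-based win rates. As rough orientation only, a run might be configured along the following lines. This is a hedged sketch based on evalscope's documented `TaskConfig`/`run_task` entry points; every model name and report path here is a placeholder, not a value taken from this diff, and option names may differ by version.

from evalscope import TaskConfig, run_task

# Hypothetical configuration sketch for the new `general_arena` benchmark.
# Model names and report paths are placeholders; option names may differ by version.
task_cfg = TaskConfig(
    model='dummy',  # candidate answers come from the saved reports, not from a live model
    datasets=['general_arena'],
    dataset_args={
        'general_arena': {
            'extra_params': {
                'models': [
                    {'name': 'model_a', 'report_path': 'outputs/<run_a>/reports/model_a'},
                    {'name': 'model_b', 'report_path': 'outputs/<run_b>/reports/model_b'},
                ],
                'baseline': 'model_b',
            }
        }
    },
    judge_model_args={'model_id': '<judge-model>'},  # assumption: judge settings are passed here
)
run_task(task_cfg)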
|
@@ -0,0 +1,454 @@
|
|
|
1
|
+
# flake8: noqa: E501
|
|
2
|
+
import glob
|
|
3
|
+
import os
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from typing import Any, Dict, List
|
|
6
|
+
|
|
7
|
+
from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
|
|
8
|
+
from evalscope.api.dataset import DatasetDict, DictDataLoader, Sample
|
|
9
|
+
from evalscope.api.evaluator import TaskState
|
|
10
|
+
from evalscope.api.messages.chat_message import ChatMessageUser
|
|
11
|
+
from evalscope.api.metric import AggScore, SampleScore, Score
|
|
12
|
+
from evalscope.api.registry import register_benchmark
|
|
13
|
+
from evalscope.constants import Tags
|
|
14
|
+
from evalscope.report import Report, ReportKey
|
|
15
|
+
from evalscope.utils.logger import get_logger
|
|
16
|
+
|
|
17
|
+
logger = get_logger()
|
|
18
|
+
|
|
19
|
+
GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"." # noqa: E501
|
|
20
|
+
|
|
21
|
+
GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>".strip(
|
|
22
|
+
) # noqa: E501
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@register_benchmark(
|
|
26
|
+
BenchmarkMeta(
|
|
27
|
+
name='general_arena',
|
|
28
|
+
pretty_name='GeneralArena',
|
|
29
|
+
tags=[Tags.CUSTOM, Tags.ARENA],
|
|
30
|
+
description=
|
|
31
|
+
'GeneralArena is a custom benchmark designed to evaluate the performance of large language models in a competitive setting, '
|
|
32
|
+
'where models are pitted against each other in custom tasks to determine their relative strengths and weaknesses. You should '
|
|
33
|
+
'provide the model outputs in the format of a list of dictionaries, where each dictionary contains the model name and its report path. '
|
|
34
|
+
'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html).',
|
|
35
|
+
dataset_id='general_arena',
|
|
36
|
+
metric_list=['winrate'],
|
|
37
|
+
aggregation='elo',
|
|
38
|
+
few_shot_num=0,
|
|
39
|
+
train_split=None,
|
|
40
|
+
eval_split='test',
|
|
41
|
+
system_prompt=GRADER_SYSTEM_PROMPT,
|
|
42
|
+
prompt_template=GRADER_TEMPLATE,
|
|
43
|
+
extra_params={
|
|
44
|
+
'models': [{
|
|
45
|
+
'name': 'qwen-plus',
|
|
46
|
+
'report_path': 'outputs/20250627_172550/reports/qwen-plus'
|
|
47
|
+
}, {
|
|
48
|
+
'name': 'qwen2.5-7b',
|
|
49
|
+
'report_path': 'outputs/20250627_172817/reports/qwen2.5-7b-instruct'
|
|
50
|
+
}],
|
|
51
|
+
'baseline':
|
|
52
|
+
'qwen2.5-7b'
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
)
|
|
56
|
+
class GeneralArenaAdapter(DefaultDataAdapter):
|
|
57
|
+
|
|
58
|
+
def __init__(self, *args, **kwargs):
|
|
59
|
+
super().__init__(*args, **kwargs)
|
|
60
|
+
|
|
61
|
+
self._use_llm_judge = True
|
|
62
|
+
|
|
63
|
+
self.models = self.extra_params.get('models', [])
|
|
64
|
+
self.baseline = self.extra_params.get('baseline', None)
|
|
65
|
+
|
|
66
|
+
def load(self):
|
|
67
|
+
"""Load dataset by processing model reports."""
|
|
68
|
+
self._check_names()
|
|
69
|
+
self._check_reports()
|
|
70
|
+
self._check_datasets()
|
|
71
|
+
logger.info(f'Overall datasets: {self.overall_datasets}')
|
|
72
|
+
dataset_model_dict = self._load_common_datasets()
|
|
73
|
+
datasets = self._build_pair_wise_data(dataset_model_dict)
|
|
74
|
+
|
|
75
|
+
# Convert to DatasetDict format
|
|
76
|
+
dataset_dict = {}
|
|
77
|
+
for subset_name, samples in datasets.items():
|
|
78
|
+
dataset = DictDataLoader(
|
|
79
|
+
dict_list=samples,
|
|
80
|
+
limit=self.limit,
|
|
81
|
+
shuffle=self.shuffle,
|
|
82
|
+
repeats=self.repeats,
|
|
83
|
+
sample_fields=self.record_to_sample
|
|
84
|
+
).load()
|
|
85
|
+
dataset_dict[subset_name] = dataset
|
|
86
|
+
|
|
87
|
+
test_dataset = DatasetDict(dataset_dict)
|
|
88
|
+
return test_dataset, None
|
|
89
|
+
|
|
90
|
+
def record_to_sample(self, record: Dict[str, Any]) -> Sample:
|
|
91
|
+
"""Convert a data record to a Sample object."""
|
|
92
|
+
return Sample(
|
|
93
|
+
input=[ChatMessageUser(content=record['question'])],
|
|
94
|
+
target=record['answer_2'], # baseline answer
|
|
95
|
+
metadata={
|
|
96
|
+
'answer_1': record['answer_1'],
|
|
97
|
+
'model_1': record['model_1'],
|
|
98
|
+
'model_2': record['model_2'],
|
|
99
|
+
}
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
def _check_names(self):
|
|
103
|
+
"""Check the names of the models and baseline."""
|
|
104
|
+
# check duplicate models
|
|
105
|
+
model_names = [model['name'] for model in self.models]
|
|
106
|
+
if len(model_names) != len(set(model_names)):
|
|
107
|
+
raise ValueError(f'Duplicate model names found in the models list {model_names}.')
|
|
108
|
+
# check if models list is empty
|
|
109
|
+
if len(self.models) < 2:
|
|
110
|
+
raise ValueError('Models list must contain at least two models.')
|
|
111
|
+
# check baseline model
|
|
112
|
+
if self.baseline and self.baseline not in model_names:
|
|
113
|
+
raise ValueError(f'Baseline model {self.baseline} not found in the models list.')
|
|
114
|
+
# check if the baseline model is not set
|
|
115
|
+
if not self.baseline:
|
|
116
|
+
logger.warning('Baseline model is not set. Using the first model as the baseline.')
|
|
117
|
+
self.baseline = self.models[0]['name']
|
|
118
|
+
|
|
119
|
+
def _check_reports(self):
|
|
120
|
+
"""Check if the report paths are valid."""
|
|
121
|
+
for model in self.models:
|
|
122
|
+
report_path = model.get('report_path', None)
|
|
123
|
+
if not report_path or not os.path.exists(report_path):
|
|
124
|
+
raise ValueError(f'Report path {report_path} for model {model["name"]} does not exist.')
|
|
125
|
+
reports = []
|
|
126
|
+
for report_item in glob.glob(os.path.join(report_path, '*.json')):
|
|
127
|
+
report = Report.from_json(report_item)
|
|
128
|
+
reports.append(report)
|
|
129
|
+
model['reports'] = reports
|
|
130
|
+
|
|
131
|
+
def _check_datasets(self):
|
|
132
|
+
"""Check common datasets in the reports."""
|
|
133
|
+
overall_datasets = set()
|
|
134
|
+
for model in self.models:
|
|
135
|
+
datasets = set()
|
|
136
|
+
for report in model['reports']:
|
|
137
|
+
report_df = report.to_dataframe()
|
|
138
|
+
# get unique (dataset, subset) tuples
|
|
139
|
+
unique_datasets = set(zip(report_df[ReportKey.dataset_name], report_df[ReportKey.subset_name]))
|
|
140
|
+
datasets.update(unique_datasets)
|
|
141
|
+
model['datasets'] = datasets
|
|
142
|
+
# get overall datasets by intersecting all models' datasets
|
|
143
|
+
overall_datasets = set.intersection(*[model['datasets'] for model in self.models if 'datasets' in model])
|
|
144
|
+
self.overall_datasets = overall_datasets
|
|
145
|
+
|
|
146
|
+
def _load_common_datasets(self):
|
|
147
|
+
"""Load common datasets from the local path."""
|
|
148
|
+
from evalscope.utils import OutputsStructure
|
|
149
|
+
from evalscope.utils.io_utils import jsonl_to_list
|
|
150
|
+
|
|
151
|
+
dataset_dict = defaultdict(dict)
|
|
152
|
+
for dataset_name, subset_name in self.overall_datasets:
|
|
153
|
+
for model in self.models:
|
|
154
|
+
dataset_path = model['report_path'].replace(OutputsStructure.REPORTS_DIR, OutputsStructure.REVIEWS_DIR)
|
|
155
|
+
dataset_file_path = os.path.join(dataset_path, f'{dataset_name}_{subset_name}.jsonl')
|
|
156
|
+
if not os.path.exists(dataset_file_path):
|
|
157
|
+
raise ValueError(
|
|
158
|
+
f'Dataset {dataset_name} with subset {subset_name} not found in model {model["name"]}.'
|
|
159
|
+
)
|
|
160
|
+
dataset = jsonl_to_list(dataset_file_path)
|
|
161
|
+
# sort by index
|
|
162
|
+
dataset.sort(key=lambda x: x.get('index'))
|
|
163
|
+
dataset_dict[(dataset_name, subset_name)][model['name']] = dataset
|
|
164
|
+
|
|
165
|
+
return dataset_dict
|
|
166
|
+
|
|
167
|
+
def _build_pair_wise_data(self, dataset_dict):
|
|
168
|
+
"""Build pairwise data for the models."""
|
|
169
|
+
from evalscope.api.evaluator import ReviewResult
|
|
170
|
+
from .utils import process_review_item
|
|
171
|
+
|
|
172
|
+
pairwise_data = defaultdict(list)
|
|
173
|
+
for (dataset_name, subset_name), model_data in dataset_dict.items():
|
|
174
|
+
if len(model_data) < 2:
|
|
175
|
+
logger.warning(f'Not enough models for dataset {dataset_name} with subset {subset_name}. Skipping.')
|
|
176
|
+
continue
|
|
177
|
+
# create pairwise data for each model against the baseline
|
|
178
|
+
model_names = list(model_data.keys())
|
|
179
|
+
for name in model_names:
|
|
180
|
+
if name == self.baseline:
|
|
181
|
+
continue
|
|
182
|
+
pairs = []
|
|
183
|
+
for model_item, baseline_item in zip(model_data[name], model_data[self.baseline]):
|
|
184
|
+
# Convert to ReviewResult objects like in get_model_prediction
|
|
185
|
+
model_review = ReviewResult.model_validate(model_item)
|
|
186
|
+
baseline_review = ReviewResult.model_validate(baseline_item)
|
|
187
|
+
|
|
188
|
+
for model_choice, baseline_choice in zip(
|
|
189
|
+
process_review_item(model_review), process_review_item(baseline_review)
|
|
190
|
+
):
|
|
191
|
+
pairs.append({
|
|
192
|
+
'question': model_choice['Question'],
|
|
193
|
+
'answer_1': model_choice['Generated'],
|
|
194
|
+
'answer_2': baseline_choice['Generated'],
|
|
195
|
+
'model_1': name,
|
|
196
|
+
'model_2': self.baseline
|
|
197
|
+
})
|
|
198
|
+
pairwise_data[f'{dataset_name}&{subset_name}@{name}&{self.baseline}'] = pairs
|
|
199
|
+
|
|
200
|
+
return pairwise_data
|
|
201
|
+
|
|
202
|
+
def llm_match_score(
|
|
203
|
+
self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
|
|
204
|
+
) -> Score:
|
|
205
|
+
"""Use LLM as a judge to evaluate the predicted answer against the baseline."""
|
|
206
|
+
from .utils import get_judge_score, post_process_result
|
|
207
|
+
|
|
208
|
+
score = Score(
|
|
209
|
+
extracted_prediction=filtered_prediction,
|
|
210
|
+
prediction=original_prediction,
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
question = task_state.input_text
|
|
214
|
+
answer_1 = task_state.metadata['answer_1']
|
|
215
|
+
answer_2 = reference # baseline answer
|
|
216
|
+
model_1 = task_state.metadata['model_1']
|
|
217
|
+
model_2 = task_state.metadata['model_2']
|
|
218
|
+
|
|
219
|
+
system_template = self.system_prompt
|
|
220
|
+
prompt_template = self.prompt_template
|
|
221
|
+
|
|
222
|
+
prompt1 = prompt_template.format(question=question, answer_1=answer_1, answer_2=answer_2)
|
|
223
|
+
# reverse the order
|
|
224
|
+
prompt2 = prompt_template.format(question=question, answer_1=answer_2, answer_2=answer_1)
|
|
225
|
+
|
|
226
|
+
# get grading response
|
|
227
|
+
game1_response = self.llm_judge.judge(prompt1, system_prompt=system_template)
|
|
228
|
+
game2_response = self.llm_judge.judge(prompt2, system_prompt=system_template)
|
|
229
|
+
|
|
230
|
+
# parse grading response
|
|
231
|
+
# game1
|
|
232
|
+
res1 = post_process_result(game1_response)
|
|
233
|
+
score1 = get_judge_score(res1, reverse=False)
|
|
234
|
+
# game2
|
|
235
|
+
res2 = post_process_result(game2_response)
|
|
236
|
+
score2 = get_judge_score(res2, reverse=True)
|
|
237
|
+
|
|
238
|
+
battle_result = {
|
|
239
|
+
'score': (score1 + score2) / 2,
|
|
240
|
+
'games': [
|
|
241
|
+
{
|
|
242
|
+
'model_a': model_1,
|
|
243
|
+
'model_b': model_2,
|
|
244
|
+
'response': game1_response,
|
|
245
|
+
'judgment': res1
|
|
246
|
+
},
|
|
247
|
+
{
|
|
248
|
+
'model_a': model_2,
|
|
249
|
+
'model_b': model_1,
|
|
250
|
+
'response': game2_response,
|
|
251
|
+
'judgment': res2
|
|
252
|
+
},
|
|
253
|
+
]
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
score.value = {'score': battle_result['score']}
|
|
257
|
+
score.explanation = f'LLM judge battles: Game1: {game1_response[:100]}... Game2: {game2_response[:100]}...'
|
|
258
|
+
score.metadata = {
|
|
259
|
+
'source': 'llm_judge',
|
|
260
|
+
'judge_strategy': getattr(self, 'judge_strategy', 'default'),
|
|
261
|
+
'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown',
|
|
262
|
+
'battle_result': battle_result
|
|
263
|
+
}
|
|
264
|
+
score.main_score_name = 'score'
|
|
265
|
+
|
|
266
|
+
return score
|
|
267
|
+
|
|
268
|
+
def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
|
|
269
|
+
"""Aggregate scores to compute winrate."""
|
|
270
|
+
import numpy as np
|
|
271
|
+
import pandas as pd
|
|
272
|
+
|
|
273
|
+
from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
|
|
274
|
+
|
|
275
|
+
battles = pd.concat([get_battles_from_row(res.score.metadata['battle_result']) for res in sample_scores])
|
|
276
|
+
|
|
277
|
+
bt_model_coef = compute_mle_elo(battles, baseline_model=self.baseline)
|
|
278
|
+
|
|
279
|
+
bootstrap_model_coef = get_bootstrap_result(
|
|
280
|
+
battles, func_compute_elo=compute_mle_elo, num_round=100, baseline_model=self.baseline
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
stats = pd.DataFrame()
|
|
284
|
+
stats['results'] = None
|
|
285
|
+
stats['results'] = stats['results'].astype('object')
|
|
286
|
+
|
|
287
|
+
for i, model in enumerate(bt_model_coef.index):
|
|
288
|
+
stats.at[i, 'model'] = model
|
|
289
|
+
stats.at[i, 'score'] = bt_model_coef[model]
|
|
290
|
+
stats.at[i, 'lower'] = np.percentile(bootstrap_model_coef[model], 2.5)
|
|
291
|
+
stats.at[i, 'upper'] = np.percentile(bootstrap_model_coef[model], 97.5)
|
|
292
|
+
|
|
293
|
+
metrics_dict = {}
|
|
294
|
+
metrics_dict['winrate'] = get_win_rate_column(stats, 'score', self.baseline).to_dict()
|
|
295
|
+
metrics_dict['winrate_lower'] = get_win_rate_column(stats, 'lower', self.baseline).to_dict()
|
|
296
|
+
metrics_dict['winrate_upper'] = get_win_rate_column(stats, 'upper', self.baseline).to_dict()
|
|
297
|
+
|
|
298
|
+
agg_scores = []
|
|
299
|
+
for metric_name, models in metrics_dict.items():
|
|
300
|
+
for model_name, score_val in models.items():
|
|
301
|
+
if model_name == self.baseline:
|
|
302
|
+
continue
|
|
303
|
+
agg_scores.append(AggScore(score=score_val, metric_name=metric_name, num=len(sample_scores)))
|
|
304
|
+
|
|
305
|
+
return agg_scores
|
|
306
|
+
|
|
307
|
+
def extract_answer(self, prediction, task_state):
|
|
308
|
+
# NOTE: This is a hacky way to extract the answer from the prediction
|
|
309
|
+
return task_state.metadata['answer_1']
|
|
310
|
+
|
|
311
|
+
def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
|
|
312
|
+
"""Post-process the report to convert it to a DataFrame with winrate leaderboards."""
|
|
313
|
+
import pandas as pd
|
|
314
|
+
import tabulate
|
|
315
|
+
|
|
316
|
+
report_path = output_dir
|
|
317
|
+
leaderboard_file = os.path.join(report_path, 'leaderboard.txt')
|
|
318
|
+
|
|
319
|
+
# Ensure report directory exists
|
|
320
|
+
os.makedirs(report_path, exist_ok=True)
|
|
321
|
+
|
|
322
|
+
# Convert report to dataframe
|
|
323
|
+
df = report.to_dataframe()
|
|
324
|
+
|
|
325
|
+
# Filter for winrate-related metrics
|
|
326
|
+
winrate_df = df[df[ReportKey.metric_name].str.contains('winrate')].copy()
|
|
327
|
+
|
|
328
|
+
if winrate_df.empty:
|
|
329
|
+
logger.warning('No winrate data found in the report.')
|
|
330
|
+
return
|
|
331
|
+
|
|
332
|
+
# Get all model names from self.models
|
|
333
|
+
all_model_names = [model['name'] for model in self.models]
|
|
334
|
+
|
|
335
|
+
+        # Collect all leaderboard outputs
+        leaderboard_outputs = []
+
+        def format_leaderboard(data_df, title):
+            """Format DataFrame as leaderboard with CI."""
+            # Pivot to get winrate, winrate_lower, winrate_upper as columns
+            pivot_df = data_df.pivot_table(
+                index=[ReportKey.model_name], columns=ReportKey.metric_name, values=ReportKey.score, aggfunc='first'
+            )
+
+            # Add baseline model with 50% winrate
+            baseline_data = {'winrate': 0.5, 'winrate_lower': 0.5, 'winrate_upper': 0.5}
+
+            # Create a complete index with all models
+            complete_index = pd.Index(all_model_names, name=pivot_df.index.name)
+            pivot_df = pivot_df.reindex(complete_index)
+
+            # Fill baseline model data
+            if self.baseline in pivot_df.index:
+                for col, val in baseline_data.items():
+                    if col in pivot_df.columns:
+                        pivot_df.loc[self.baseline, col] = val
+
+            # Fill missing values with winrate score for other models
+            if 'winrate' in pivot_df.columns:
+                pivot_df['winrate_lower'] = pivot_df.get('winrate_lower', pivot_df['winrate'])
+                pivot_df['winrate_upper'] = pivot_df.get('winrate_upper', pivot_df['winrate'])
+
+            # Format for display
+            leaderboard_data = []
+            for model in pivot_df.index:
+                if pd.isna(pivot_df.loc[model, 'winrate']):
+                    continue
+
+                score_pct = pivot_df.loc[model, 'winrate'] * 100
+                lower_diff = (pivot_df.loc[model, 'winrate_lower'] - pivot_df.loc[model, 'winrate']) * 100
+                upper_diff = (pivot_df.loc[model, 'winrate_upper'] - pivot_df.loc[model, 'winrate']) * 100
+
+                leaderboard_data.append({
+                    'Model': model,
+                    'WinRate (%)': f'{score_pct:.1f}',
+                    'CI (%)': f'({lower_diff:+.1f} / {upper_diff:+.1f})'
+                })
+
+            # Sort by score descending
+            leaderboard_data.sort(key=lambda x: float(x['WinRate (%)'].replace('%', '')), reverse=True)
+
+            # Create DataFrame
+            leaderboard_df = pd.DataFrame(leaderboard_data)
+            leaderboard_df.index = range(len(leaderboard_df))
+
+            # Format as string
+            table_str = tabulate.tabulate(leaderboard_df, headers='keys', showindex=False)
+            output = f'{title}\n{table_str}\n'
+
+            logger.info(f'\n{title}\n{table_str}')
+            return output
+
+        # Parse dataset and subset information from dataset_name column
+        # Format: '{dataset_name}&{subset_name}@{name}&{self.baseline}'
+        def parse_dataset_key(dataset_key):
+            """Parse dataset key to extract dataset_name, subset_name, and model pair."""
+            parts = dataset_key.split('@')
+
+            dataset_subset = parts[0]
+            model_pair = parts[1]
+
+            dataset_name, subset_name = dataset_subset.split('&', 1)
+            model_1, model_2 = model_pair.split('&', 1)
+
+            return dataset_name, subset_name, model_1, model_2
+
+        # Add parsed columns
+        parsed_data = []
+        for _, row in winrate_df.iterrows():
+            dataset_name, subset_name, model_1, model_2 = parse_dataset_key(row[ReportKey.subset_name])
+            if dataset_name is not None:
+                parsed_data.append({
+                    'dataset_name': dataset_name,
+                    'subset_name': subset_name,
+                    ReportKey.model_name: model_1,
+                    ReportKey.metric_name: row[ReportKey.metric_name],
+                    ReportKey.score: row[ReportKey.score]
+                })
+
+        if not parsed_data:
+            logger.warning('No valid dataset keys found for parsing.')
+            return
+
+        parsed_df = pd.DataFrame(parsed_data)
+
+        # 1. Overall ranking (aggregate across all datasets and subsets)
+        overall_df = parsed_df.groupby([ReportKey.model_name,
+                                        ReportKey.metric_name])[ReportKey.score].mean().reset_index()
+        leaderboard_outputs.append(format_leaderboard(overall_df, '=== OVERALL LEADERBOARD ==='))
+
+        # 2. Dataset-level rankings
+        datasets = parsed_df['dataset_name'].unique()
+        for dataset in sorted(datasets):
+            dataset_df = parsed_df[parsed_df['dataset_name'] == dataset]
+            dataset_agg = dataset_df.groupby([ReportKey.model_name,
+                                              ReportKey.metric_name])[ReportKey.score].mean().reset_index()
+            leaderboard_outputs.append(format_leaderboard(dataset_agg, f'=== DATASET LEADERBOARD: {dataset} ==='))
+
+        # 3. Subset-level rankings
+        subsets = parsed_df[['dataset_name', 'subset_name']].drop_duplicates()
+        for _, subset_row in subsets.iterrows():
+            dataset_name = subset_row['dataset_name']
+            subset_name = subset_row['subset_name']
+            subset_df = parsed_df[(parsed_df['dataset_name'] == dataset_name)
+                                  & (parsed_df['subset_name'] == subset_name)]
+            leaderboard_outputs.append(
+                format_leaderboard(subset_df, f'=== SUBSET LEADERBOARD: {dataset_name} - {subset_name} ===')
+            )
+
+        # Write all leaderboard outputs to file
+        with open(leaderboard_file, 'w', encoding='utf-8') as f:
+            f.write('\n'.join(leaderboard_outputs))
+
+        logger.info(f'Leaderboard results saved to: {leaderboard_file}')
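The hunk above builds win-rate leaderboards (overall, per dataset, per subset) from composite subset keys of the form `{dataset_name}&{subset_name}@{model}&{baseline}`. A standalone sketch of that key convention, re-implemented here for illustration only; the key string and model names below are hypothetical:

```python
def parse_dataset_key(dataset_key: str):
    """Split '{dataset}&{subset}@{model}&{baseline}' into its four parts."""
    dataset_subset, model_pair = dataset_key.split('@', 1)
    dataset_name, subset_name = dataset_subset.split('&', 1)
    model_1, model_2 = model_pair.split('&', 1)
    return dataset_name, subset_name, model_1, model_2


# Hypothetical key: 'general_arena' dataset, 'default' subset, candidate vs. baseline.
print(parse_dataset_key('general_arena&default@my-candidate-model&gpt-4-0314'))
# -> ('general_arena', 'default', 'my-candidate-model', 'gpt-4-0314')
```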
@@ -0,0 +1,223 @@
+import inspect
+import math
+import numpy as np
+import pandas as pd
+import re
+from collections import defaultdict
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+
+from evalscope.api.evaluator import ReviewResult
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def process_review_item(review_result: ReviewResult) -> list:
+    """
+    Process a ReviewResult object to extract relevant information.
+
+    Args:
+        review_result: ReviewResult object or dict (for backward compatibility)
+
+    Returns:
+        list: List of processed review items with necessary information.
+    """
+
+    # New format using ReviewResult
+    sample_score = review_result.sample_score
+    prediction = sample_score.score.prediction
+    target = review_result.target
+    extracted_prediction = sample_score.score.extracted_prediction
+
+    raw_d = {
+        'Index': str(review_result.index),
+        'Input': review_result.input,
+        'Question': review_result.input,  # Use input as question
+        'Generated':
+        prediction if prediction != extracted_prediction else extracted_prediction or '',  # Ensure no None value
+        'Gold': target,
+        'Pred': extracted_prediction,
+        'Score': sample_score.score.model_dump(exclude_none=True),
+    }
+    return [raw_d]
+
+
+def post_process_result(completion):
+    result = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
+    if result:
+        return result[0]
+    else:
+        return None
+
+
+def get_judge_score(result, reverse=False):
+    """
+    Calculate the judge score, considering confidence weight.
+
+    Args:
+        result: Judgment result ('A=B', 'A>B', 'A>>B', 'B>A', 'B>>A')
+        reverse: Whether to reverse the score
+
+    Returns:
+        float: Weighted score
+    """
+
+    # Base score mapping - using finer-grained scores
+    if not reverse:
+        score_mapping = {
+            'A=B': 0.5,  # Tie
+            'A>B': 0.75,  # A slightly wins
+            'A>>B': 1.0,  # A significantly wins
+            'B>A': 0.25,  # B slightly wins
+            'B>>A': 0.0,  # B significantly wins
+        }
+    else:
+        score_mapping = {
+            'A=B': 0.5,  # Tie
+            'A>B': 0.25,  # A slightly wins
+            'A>>B': 0.0,  # A significantly wins
+            'B>A': 0.75,  # B slightly wins
+            'B>>A': 1.0,  # B significantly wins
+        }
+
+    base_score = score_mapping.get(result, 0.5)
+
+    return base_score
+
+
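For orientation: `post_process_result` pulls the bracketed verdict out of a judge model's free-form completion, and `get_judge_score` maps it onto [0, 1], flipped when the candidate was shown in position B. A minimal round trip, assuming the two functions above are in scope; the completion text is made up:

```python
# Hypothetical judge output; the verdict token is wrapped in double brackets.
completion = 'Assistant A is more accurate and better organized. Verdict: [[A>>B]]'

verdict = post_process_result(completion)                # -> 'A>>B'
score = get_judge_score(verdict)                         # -> 1.0 (decisive win for A)
score_swapped = get_judge_score(verdict, reverse=True)   # -> 0.0 when positions were swapped
print(verdict, score, score_swapped)
```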
+def get_battles_from_row(row, first_game_only=False, multiplier=3):
+    results = []
+
+    game = row['games'][0]
+    output = {'model_a': game['model_a'], 'model_b': game['model_b']}
+
+    weight = 1
+    if game['judgment'] == 'A=B':
+        output['winner'] = 'tie'
+    elif game['judgment'] == 'A>B':
+        output['winner'] = 'model_a'
+    elif game['judgment'] == 'A>>B':
+        output['winner'] = 'model_a'
+        weight = multiplier
+    elif game['judgment'] == 'B>A':
+        output['winner'] = 'model_b'
+    elif game['judgment'] == 'B>>A':
+        output['winner'] = 'model_b'
+        weight = multiplier
+    else:
+        weight = 0
+
+    if weight:
+        results += [output] * weight
+
+    if first_game_only:
+        return pd.DataFrame(results)
+
+    # Dont change the order of model_a and model_b
+    output = {'model_a': game['model_a'], 'model_b': game['model_b']}
+
+    # game 2
+    game = row['games'][1]
+
+    weight = 1
+    if game['judgment'] == 'A=B':
+        output['winner'] = 'tie'
+    elif game['judgment'] == 'A>B':
+        output['winner'] = 'model_b'
+    elif game['judgment'] == 'A>>B':
+        output['winner'] = 'model_b'
+        weight = multiplier
+    elif game['judgment'] == 'B>A':
+        output['winner'] = 'model_a'
+    elif game['judgment'] == 'B>>A':
+        output['winner'] = 'model_a'
+        weight = multiplier
+    else:
+        weight = 0
+
+    if weight:
+        results += [output] * weight
+
+    return pd.DataFrame(results)
+
+
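Each row is expected to carry two judged games per prompt, with the model order swapped in game 2, so the winner labels are mirrored and decisive verdicts are repeated `multiplier` times. A quick sketch with a hypothetical row (model names invented), assuming `get_battles_from_row` above is in scope:

```python
row = {
    'games': [
        # Game 1: candidate in position A, baseline in position B.
        {'model_a': 'my-candidate-model', 'model_b': 'gpt-4-0314', 'judgment': 'A>>B'},
        # Game 2: positions swapped, so 'B>A' is again a win for the candidate.
        {'model_a': 'gpt-4-0314', 'model_b': 'my-candidate-model', 'judgment': 'B>A'},
    ]
}

battles = get_battles_from_row(row, multiplier=3)
print(battles['winner'].value_counts())
# 3 weighted wins from game 1 plus 1 from game 2, all recorded as 'model_a'
# (the game-1 ordering, which the second output dict deliberately keeps).
```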
+def compute_mle_elo(df, scale=400, base=10, init_rating=1000, baseline_model='gpt4-0314'):
+    models = pd.concat([df['model_a'], df['model_b']]).unique()
+    models = pd.Series(np.arange(len(models)), index=models)
+
+    # duplicate battles
+    df = pd.concat([df, df], ignore_index=True)
+    p = len(models.index)
+    n = df.shape[0]
+
+    X = np.zeros([n, p])
+    X[np.arange(n), models[df['model_a']]] = +math.log(base)
+    X[np.arange(n), models[df['model_b']]] = -math.log(base)
+
+    # one A win => two A win
+    Y = np.zeros(n)
+    Y[df['winner'] == 'model_a'] = 1.0
+
+    # one tie => one A win + one B win
+    # find tie + tie (both bad) index
+    tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
+    tie_idx[len(tie_idx) // 2:] = False
+    Y[tie_idx] = 1.0
+
+    if len(np.unique(Y)) < 2:
+        logger.info('Warning: Only one class in the data')
+        elo_scores = pd.Series(init_rating, index=models.index)
+        if np.all(Y == 1.0):
+            elo_scores[df['model_a'].iloc[0]] += scale  # Boost the winning model
+        elif np.all(Y == 0.0):
+            elo_scores[df['model_b'].iloc[0]] += scale  # Boost the winning model
+        return elo_scores.sort_values(ascending=False)
+
+    lr = LogisticRegression(
+        fit_intercept=False, penalty=None, tol=1e-8
+    )  # May need to set a small value when not use GPT4 as judge model
+    lr.fit(X, Y)
+
+    elo_scores = scale * lr.coef_[0] + init_rating
+
+    # set anchor 1000
+    if baseline_model in models.index:
+        elo_scores += 1000 - elo_scores[models[baseline_model]]
+    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
+
+
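`compute_mle_elo` is the usual Bradley-Terry fit: each battle becomes one logistic-regression sample, the coefficients are scaled into Elo-style ratings, and the result is anchored so the baseline sits at 1000. A toy run, assuming the function above is in scope; the battle records are fabricated for illustration, so the exact numbers are meaningless:

```python
import pandas as pd

battles = pd.DataFrame([
    {'model_a': 'my-candidate-model', 'model_b': 'gpt4-0314', 'winner': 'model_a'},
    {'model_a': 'my-candidate-model', 'model_b': 'gpt4-0314', 'winner': 'tie'},
    {'model_a': 'another-model', 'model_b': 'gpt4-0314', 'winner': 'model_b'},
    {'model_a': 'my-candidate-model', 'model_b': 'another-model', 'winner': 'model_a'},
])

elo = compute_mle_elo(battles, baseline_model='gpt4-0314')
print(elo)  # pandas Series of ratings, anchored so 'gpt4-0314' sits at 1000
```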
+def get_bootstrap_result(battles, func_compute_elo, num_round, baseline_model='gpt-4-0314'):
+    rows = []
+    kwargs = {}
+    if 'baseline_model' in inspect.signature(func_compute_elo).parameters:
+        kwargs['baseline_model'] = baseline_model
+    for _ in tqdm(range(num_round), desc='bootstrap'):
+        res = func_compute_elo(battles.sample(frac=1.0, replace=True), **kwargs)
+        if res is not None:
+            rows.append(res)
+    df = pd.DataFrame(rows)
+    return df[df.median().sort_values(ascending=False).index]
+
+
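The bootstrap resamples the battle table with replacement and refits the ratings each round; percentiles of the resulting table give the confidence intervals surfaced in the leaderboards. Continuing the toy `battles` frame from the previous sketch (illustrative only):

```python
boot = get_bootstrap_result(battles, compute_mle_elo, num_round=100, baseline_model='gpt4-0314')

# 95% bootstrap interval per model (columns of `boot` are models, rows are rounds).
ci = boot.quantile([0.025, 0.5, 0.975]).T
ci.columns = ['elo_lower', 'elo_median', 'elo_upper']
print(ci)
```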
+def predict_win_rate(elo_ratings, scale=400, base=10, init_rating=1000):
+    names = sorted(list(elo_ratings.keys()))
+    wins = defaultdict(lambda: defaultdict(lambda: 0))
+    for a in names:
+        for b in names:
+            ea = 1 / (1 + base**((elo_ratings[b] - elo_ratings[a]) / scale))
+            wins[a][b] = ea
+            wins[b][a] = 1 - ea
+
+    data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}
+
+    df = pd.DataFrame(data, index=names)
+    df.index.name = 'model_a'
+    df.columns.name = 'model_b'
+    return df.T
+
+
+def get_win_rate_column(df, column, baseline='gpt4-0314'):
+    to_dict = df[['model', column]].set_index('model').to_dict()[column]
+    win_rate_table = predict_win_rate(to_dict)
+    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x, 4))
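`predict_win_rate` converts ratings into pairwise expected win rates via the Elo logistic curve, and `get_win_rate_column` reads off each model's expected win rate against the baseline column (the 0.5 fill covers the baseline's own diagonal entry). A small sketch, assuming the two functions above are in scope; ratings and model names are invented:

```python
import pandas as pd

ratings = {'gpt4-0314': 1000.0, 'my-candidate-model': 1120.0, 'another-model': 940.0}

table = predict_win_rate(ratings)
print(table['gpt4-0314'])  # expected win rate of each model against the baseline

# The same starting from a results frame with a 'model' column and a rating column.
results = pd.DataFrame({'model': list(ratings), 'score': list(ratings.values())})
print(get_win_rate_column(results, column='score', baseline='gpt4-0314'))
```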