evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/evaluator/evaluator.py
CHANGED
|
@@ -1,377 +1,393 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
"""
|
|
3
|
+
Default evaluator implementation for running benchmark evaluations.
|
|
4
|
+
|
|
5
|
+
This module provides the DefaultEvaluator class which orchestrates the entire
|
|
6
|
+
evaluation process including data loading, model inference, metric calculation,
|
|
7
|
+
and report generation.
|
|
8
|
+
"""
|
|
2
9
|
|
|
3
|
-
import json
|
|
4
10
|
import os
|
|
5
|
-
import
|
|
6
|
-
from collections import
|
|
7
|
-
from copy import deepcopy
|
|
11
|
+
import traceback
|
|
12
|
+
from collections import defaultdict
|
|
8
13
|
from tqdm import tqdm
|
|
9
|
-
from typing import
|
|
14
|
+
from typing import TYPE_CHECKING, Callable, Dict, List
|
|
10
15
|
|
|
11
|
-
from evalscope.
|
|
12
|
-
from evalscope.
|
|
13
|
-
from evalscope.
|
|
14
|
-
from evalscope.
|
|
16
|
+
from evalscope.api.dataset import Dataset, DatasetDict, Sample
|
|
17
|
+
from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
|
|
18
|
+
from evalscope.api.metric import AggScore, SampleScore
|
|
19
|
+
from evalscope.constants import HEARTBEAT_INTERVAL_SEC
|
|
15
20
|
from evalscope.report import Report, gen_table
|
|
16
|
-
from evalscope.utils import
|
|
17
|
-
from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
|
|
21
|
+
from evalscope.utils.function_utils import run_in_threads_with_progress
|
|
18
22
|
from evalscope.utils.logger import get_logger
|
|
19
23
|
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from evalscope.api.benchmark import DataAdapter
|
|
26
|
+
from evalscope.api.model import Model
|
|
27
|
+
from evalscope.config import TaskConfig
|
|
28
|
+
from evalscope.utils.io_utils import OutputsStructure
|
|
29
|
+
|
|
20
30
|
logger = get_logger()
|
|
21
31
|
|
|
22
32
|
|
|
23
|
-
class Evaluator
|
|
33
|
+
class DefaultEvaluator(Evaluator):
|
|
24
34
|
"""
|
|
25
|
-
|
|
35
|
+
Default Evaluator for running evaluations on benchmarks.
|
|
36
|
+
|
|
37
|
+
This evaluator handles the complete evaluation pipeline:
|
|
38
|
+
1. Loading datasets from benchmarks
|
|
39
|
+
2. Running model inference on samples
|
|
40
|
+
3. Calculating evaluation metrics
|
|
41
|
+
4. Generating and saving reports
|
|
42
|
+
5. Managing caching for predictions and reviews
|
|
26
43
|
|
|
27
44
|
Args:
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
model_adapter: BaseModelAdapter, the model adapter for the model.
|
|
33
|
-
outputs: OutputsStructure, the outputs dir. Default: None
|
|
34
|
-
task_cfg: TaskConfig, the overall task config. Default: None
|
|
35
|
-
**kwargs: kwargs.
|
|
45
|
+
benchmark: The data adapter for loading and processing data.
|
|
46
|
+
model: The model to be evaluated.
|
|
47
|
+
outputs: The output structure for saving evaluation results.
|
|
48
|
+
task_config: The task configuration.
|
|
36
49
|
"""
|
|
37
50
|
|
|
38
|
-
def __init__(
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
self.
|
|
47
|
-
self.
|
|
48
|
-
self.
|
|
49
|
-
self.
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
self.
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
self.
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
self.
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
self.
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
subset_list=self.data_adapter.subset_list,
|
|
68
|
-
work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
|
|
69
|
-
datasets_hub=self.dataset_hub,
|
|
70
|
-
**self.kwargs)
|
|
71
|
-
|
|
72
|
-
# Get prompts from dataset
|
|
73
|
-
prompts = self.data_adapter.gen_prompts(data_dict=dataset)
|
|
74
|
-
return prompts
|
|
75
|
-
|
|
76
|
-
def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
|
|
77
|
-
model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
|
|
78
|
-
input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
|
|
79
|
-
infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
|
|
80
|
-
return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
|
|
81
|
-
|
|
82
|
-
def _process_answer(self, answer_d, input_d, subset_name, answer_id):
|
|
83
|
-
answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
|
|
84
|
-
answer_d[AnswerKeys.ANSWER_ID] = answer_id
|
|
85
|
-
answer_d[AnswerKeys.SUBSET_NAME] = subset_name
|
|
86
|
-
answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
|
|
87
|
-
answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
|
|
88
|
-
return answer_d
|
|
89
|
-
|
|
90
|
-
def get_answers(self,
|
|
91
|
-
subset_name: str,
|
|
92
|
-
prompts_list: List[dict],
|
|
93
|
-
infer_cfg: dict = None,
|
|
94
|
-
debug: bool = False,
|
|
95
|
-
**kwargs) -> list:
|
|
96
|
-
"""
|
|
97
|
-
Get answers from model inference.
|
|
98
|
-
It is required to rewrite this method to support your own evaluator.
|
|
51
|
+
def __init__(
|
|
52
|
+
self,
|
|
53
|
+
benchmark: 'DataAdapter',
|
|
54
|
+
model: 'Model',
|
|
55
|
+
outputs: 'OutputsStructure',
|
|
56
|
+
task_config: 'TaskConfig',
|
|
57
|
+
):
|
|
58
|
+
# Store core components needed for evaluation
|
|
59
|
+
self.benchmark = benchmark
|
|
60
|
+
self.model = model
|
|
61
|
+
self.outputs = outputs
|
|
62
|
+
self.task_config = task_config
|
|
63
|
+
|
|
64
|
+
# Extract frequently used identifiers
|
|
65
|
+
self.benchmark_name = benchmark.name
|
|
66
|
+
"""Name of the benchmark being evaluated."""
|
|
67
|
+
|
|
68
|
+
self.model_name = task_config.model_id
|
|
69
|
+
"""ID of the model being evaluated."""
|
|
70
|
+
|
|
71
|
+
self.use_cache = task_config.use_cache
|
|
72
|
+
"""Whether to use cache for predictions."""
|
|
73
|
+
|
|
74
|
+
# Initialize cache manager for storing and retrieving cached results
|
|
75
|
+
self.cache_manager = CacheManager(
|
|
76
|
+
outputs=outputs,
|
|
77
|
+
model_name=self.model_name,
|
|
78
|
+
benchmark_name=self.benchmark_name,
|
|
79
|
+
)
|
|
99
80
|
|
|
100
|
-
|
|
101
|
-
subset_name: subset name for benchmark.
|
|
102
|
-
prompts_list: prompts list.
|
|
103
|
-
infer_cfg: model inference config.
|
|
104
|
-
Attributes:
|
|
105
|
-
do_sample: bool, whether to use sampling.
|
|
106
|
-
top_k: int, the number of highest probability vocabulary tokens to keep for top-k-filtering.
|
|
107
|
-
top_p: float, if set to float < 1, only the most probable tokens with probabilities to add.
|
|
108
|
-
temperature: float, the value used to module the next token probabilities.
|
|
109
|
-
num_beams: int, number of beams for beam search. 1 means no beam search.
|
|
110
|
-
max_length: int, the max length of the sequence to be generated.
|
|
111
|
-
max_new_tokens: int, the max number of new tokens to be generated.
|
|
112
|
-
repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
|
|
113
|
-
debug: whether to run in debug mode.
|
|
114
|
-
**kwargs: kwargs.
|
|
115
|
-
|
|
116
|
-
Returns: The list of answers.
|
|
81
|
+
def eval(self) -> Report:
|
|
117
82
|
"""
|
|
118
|
-
|
|
119
|
-
assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
|
|
120
|
-
assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
|
|
83
|
+
Run the complete evaluation process.
|
|
121
84
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
85
|
+
This is the main entry point that orchestrates the entire evaluation:
|
|
86
|
+
1. Load dataset from benchmark
|
|
87
|
+
2. Evaluate each subset independently
|
|
88
|
+
3. Aggregate scores across subsets
|
|
89
|
+
4. Generate final evaluation report
|
|
126
90
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
91
|
+
Returns:
|
|
92
|
+
Report: The complete evaluation report containing all metrics and results.
|
|
93
|
+
"""
|
|
94
|
+
# Load the dataset and evaluate each subset
|
|
95
|
+
logger.info(f'Start evaluating benchmark: {self.benchmark_name}')
|
|
96
|
+
dataset_dict = self.benchmark.load_dataset()
|
|
97
|
+
agg_score_dict = defaultdict(list)
|
|
98
|
+
|
|
99
|
+
# Process each subset (e.g., test, validation) independently
|
|
100
|
+
logger.info('Evaluating all subsets of the dataset...')
|
|
101
|
+
for subset, dataset in dataset_dict.items():
|
|
102
|
+
if len(dataset) == 0:
|
|
103
|
+
logger.info(f'No samples found in subset: {subset}, skipping.')
|
|
104
|
+
continue
|
|
105
|
+
logger.info(f'Evaluating subset: {subset}')
|
|
106
|
+
subset_score = self.evaluate_subset(subset, dataset)
|
|
107
|
+
agg_score_dict[subset] = subset_score
|
|
135
108
|
|
|
136
|
-
|
|
137
|
-
|
|
109
|
+
# Generate the report based on aggregated scores
|
|
110
|
+
logger.info('Generating report...')
|
|
111
|
+
report = self.get_report(agg_score_dict)
|
|
138
112
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
|
|
113
|
+
# Finalize the evaluation process
|
|
114
|
+
self.finalize()
|
|
115
|
+
logger.info(f'Benchmark {self.benchmark_name} evaluation finished.')
|
|
116
|
+
return report
|
|
144
117
|
|
|
145
|
-
|
|
146
|
-
for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
|
|
147
|
-
answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
|
|
148
|
-
answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
|
|
149
|
-
processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
|
|
150
|
-
|
|
151
|
-
if debug:
|
|
152
|
-
logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
|
|
153
|
-
logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')
|
|
154
|
-
|
|
155
|
-
answers_list.append(processed_answer)
|
|
156
|
-
dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
|
|
157
|
-
|
|
158
|
-
logger.info(f'Dump predictions to {pred_file_path}.')
|
|
159
|
-
return answers_list
|
|
160
|
-
|
|
161
|
-
def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:
|
|
162
|
-
|
|
163
|
-
if reviewer_spec is None:
|
|
164
|
-
reviewer_spec = {}
|
|
165
|
-
|
|
166
|
-
review_res = deepcopy(answer_d)
|
|
167
|
-
choices = review_res[AnswerKeys.CHOICES]
|
|
168
|
-
if len(choices) == 0:
|
|
169
|
-
review_res[ReviewKeys.REVIEWED] = False
|
|
170
|
-
review_res[ReviewKeys.REVIEW_ID] = None
|
|
171
|
-
review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
|
|
172
|
-
review_res[ReviewKeys.REVIEW_TIME] = time.time()
|
|
173
|
-
return review_res
|
|
174
|
-
|
|
175
|
-
rev_choices = []
|
|
176
|
-
for choice in choices:
|
|
177
|
-
raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
|
|
178
|
-
answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
|
|
179
|
-
answer_content = self.data_adapter.parse_pred_result(
|
|
180
|
-
result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
|
|
181
|
-
gold_content = self.data_adapter.get_gold_answer(raw_input_d)
|
|
182
|
-
|
|
183
|
-
review_result = self.data_adapter.match(gold_content, answer_content)
|
|
184
|
-
choice[ReviewKeys.REVIEW] = {
|
|
185
|
-
ReviewKeys.GOLD: gold_content,
|
|
186
|
-
ReviewKeys.PRED: answer_content,
|
|
187
|
-
ReviewKeys.RESULT: review_result
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
rev_choices.append(choice)
|
|
191
|
-
|
|
192
|
-
review_res[AnswerKeys.CHOICES] = rev_choices
|
|
193
|
-
review_res[ReviewKeys.REVIEWED] = True
|
|
194
|
-
review_res[ReviewKeys.REVIEW_ID] = review_id
|
|
195
|
-
review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
|
|
196
|
-
review_res[ReviewKeys.REVIEW_TIME] = time.time()
|
|
197
|
-
|
|
198
|
-
return review_res
|
|
199
|
-
|
|
200
|
-
def _generate_review_id(self, answer_d):
|
|
201
|
-
# Gen review_id (concat: answer_id + reviewer_spec)
|
|
202
|
-
answer_id = answer_d[AnswerKeys.ANSWER_ID]
|
|
203
|
-
reviewer_spec = {
|
|
204
|
-
'metric': [metric.name for metric in self.data_adapter.metric_list],
|
|
205
|
-
'reviewer': ['Evaluator'],
|
|
206
|
-
'revision': ['default']
|
|
207
|
-
}
|
|
208
|
-
reviewer_spec_str = json.dumps(
|
|
209
|
-
OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
|
|
210
|
-
review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
|
|
211
|
-
return review_id, reviewer_spec
|
|
212
|
-
|
|
213
|
-
def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
|
|
118
|
+
def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
|
|
214
119
|
"""
|
|
215
|
-
|
|
216
|
-
|
|
120
|
+
Evaluate a single subset of the dataset.
|
|
121
|
+
|
|
122
|
+
This method processes one subset through the complete evaluation pipeline:
|
|
123
|
+
1. Get model predictions for all samples
|
|
124
|
+
2. Calculate evaluation metrics for predictions
|
|
125
|
+
3. Aggregate individual sample scores
|
|
217
126
|
|
|
218
127
|
Args:
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
debug: whether to run in debug mode.
|
|
222
|
-
**kwargs: kwargs.
|
|
128
|
+
subset: Name of the subset being evaluated (e.g., 'test', 'validation').
|
|
129
|
+
dataset: The dataset subset containing samples to evaluate.
|
|
223
130
|
|
|
224
|
-
Returns:
|
|
131
|
+
Returns:
|
|
132
|
+
List[AggScore]: Aggregated scores for this subset.
|
|
225
133
|
"""
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
|
|
230
|
-
os.makedirs(os.path.dirname(review_file_path), exist_ok=True)
|
|
134
|
+
# Get model predictions for all samples in the subset
|
|
135
|
+
logger.info(f'Getting predictions for subset: {subset}')
|
|
136
|
+
task_states = self.get_answers(subset, dataset)
|
|
231
137
|
|
|
232
|
-
|
|
233
|
-
|
|
138
|
+
# Calculate evaluation metrics for each prediction
|
|
139
|
+
logger.info(f'Getting reviews for subset: {subset}')
|
|
140
|
+
sample_scores = self.get_reviews(subset, task_states)
|
|
234
141
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
142
|
+
# Aggregate individual sample scores into subset-level metrics
|
|
143
|
+
logger.info(f'Aggregating scores for subset: {subset}')
|
|
144
|
+
agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
|
|
145
|
+
return agg_scores
|
|
239
146
|
|
|
240
|
-
|
|
241
|
-
|
|
147
|
+
def get_answers(self, subset: str, dataset: Dataset) -> List[TaskState]:
|
|
148
|
+
"""
|
|
149
|
+
Get model predictions for all samples in the dataset subset.
|
|
242
150
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
151
|
+
This method handles:
|
|
152
|
+
1. Loading cached predictions if available and caching is enabled
|
|
153
|
+
2. Running model inference on remaining samples in parallel
|
|
154
|
+
3. Saving new predictions to cache
|
|
246
155
|
|
|
247
|
-
|
|
156
|
+
Args:
|
|
157
|
+
subset: Name of the subset being processed.
|
|
158
|
+
dataset: The dataset subset containing samples for prediction.
|
|
248
159
|
|
|
249
|
-
|
|
160
|
+
Returns:
|
|
161
|
+
List[TaskState]: Task states containing model predictions for each sample.
|
|
162
|
+
"""
|
|
163
|
+
# Initialize task state list and filter cached predictions if caching is enabled
|
|
164
|
+
if self.use_cache:
|
|
165
|
+
cached_task_state_list, dataset = self.cache_manager.filter_prediction_cache(subset, dataset)
|
|
166
|
+
else:
|
|
167
|
+
cached_task_state_list = []
|
|
168
|
+
|
|
169
|
+
# Get output directory for storing model predictions
|
|
170
|
+
model_prediction_dir = os.path.dirname(self.cache_manager.get_prediction_cache_path(subset))
|
|
171
|
+
|
|
172
|
+
# Convert dataset to list for parallel processing
|
|
173
|
+
dataset_list = list(dataset)
|
|
174
|
+
if not dataset_list:
|
|
175
|
+
return cached_task_state_list
|
|
176
|
+
|
|
177
|
+
logger.info(f'Processing {len(dataset_list)} samples, if data is large, it may take a while.')
|
|
178
|
+
|
|
179
|
+
def worker(sample: Sample) -> TaskState:
|
|
180
|
+
return self._predict_sample(sample, model_prediction_dir)
|
|
181
|
+
|
|
182
|
+
def on_result(sample: Sample, task_state: TaskState) -> None:
|
|
183
|
+
model_result = self.cache_manager.save_prediction_cache(subset, task_state, self.benchmark.save_metadata)
|
|
184
|
+
logger.debug(f'Model result: \n{model_result.pretty_print()}')
|
|
185
|
+
|
|
186
|
+
def on_error(sample: Sample, exc: Exception) -> None:
|
|
187
|
+
tb_str = traceback.format_exc()
|
|
188
|
+
logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}')
|
|
189
|
+
if self.task_config.ignore_errors:
|
|
190
|
+
logger.warning('Error ignored, continuing with next sample.')
|
|
191
|
+
return
|
|
192
|
+
raise exc
|
|
193
|
+
|
|
194
|
+
finished_task_states = run_in_threads_with_progress(
|
|
195
|
+
dataset_list,
|
|
196
|
+
worker,
|
|
197
|
+
desc=f'Predicting[{self.benchmark_name}@{subset}]: ',
|
|
198
|
+
max_workers=self.task_config.eval_batch_size,
|
|
199
|
+
heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
|
|
200
|
+
on_result=on_result,
|
|
201
|
+
on_error=on_error,
|
|
202
|
+
filter_none_results=True,
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
logger.info(f'Finished getting predictions for subset: {subset}.')
|
|
206
|
+
return cached_task_state_list + finished_task_states
|
|
207
|
+
|
|
208
|
+
def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
|
|
250
209
|
"""
|
|
251
|
-
|
|
252
|
-
It is required to rewrite this method to support your own evaluator.
|
|
210
|
+
Helper method to predict a single sample.
|
|
253
211
|
|
|
254
212
|
Args:
|
|
255
|
-
|
|
213
|
+
sample: The sample to predict.
|
|
214
|
+
model_prediction_dir: Directory for storing model predictions.
|
|
256
215
|
|
|
257
216
|
Returns:
|
|
258
|
-
The
|
|
217
|
+
TaskState: The task state containing the prediction result.
|
|
259
218
|
"""
|
|
219
|
+
logger.debug(f'\n{sample.pretty_print()}')
|
|
260
220
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
|
|
265
|
-
continue
|
|
221
|
+
# Run model inference on the current sample
|
|
222
|
+
task_state = self.benchmark.run_inference(model=self.model, sample=sample, output_dir=model_prediction_dir)
|
|
223
|
+
return task_state
|
|
266
224
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
elif len(review_d[AnswerKeys.CHOICES]) == 1:
|
|
271
|
-
review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
|
|
272
|
-
else:
|
|
273
|
-
review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
|
|
274
|
-
|
|
275
|
-
review_res_list.append(review_res)
|
|
225
|
+
+    def get_reviews(self, subset: str, task_states: List[TaskState]) -> List[SampleScore]:
+        """
+        Calculate evaluation metrics for model predictions.

-
+        This method handles:
+        1. Loading cached review results if available and caching is enabled
+        2. Computing metrics for remaining task states in parallel
+        3. Saving new review results to cache

-
+        Args:
+            subset: Name of the subset being reviewed.
+            task_states: List of task states containing model predictions.

-
+        Returns:
+            List[SampleScore]: Evaluation scores for each sample.
+        """
+        # Initialize sample score list and filter cached reviews if caching is enabled
+        if self.use_cache and not self.task_config.rerun_review:
+            cached_score_list, task_states = self.cache_manager.filter_review_cache(subset, task_states)
+        else:
+            # Init a clean sample score list
+            cached_score_list = []
+            self.cache_manager.delete_review_cache(subset)
+
+        if not task_states:
+            return cached_score_list
+
+        logger.info(f'Reviewing {len(task_states)} samples, if data is large, it may take a while.')
+
+        def worker(task_state: TaskState) -> SampleScore:
+            return self._review_task_state(task_state)
+
+        def on_result(task_state: TaskState, sample_score: SampleScore) -> None:
+            review_result = self.cache_manager.save_review_cache(
+                subset=subset,
+                task_state=task_state,
+                sample_score=sample_score,
+                save_metadata=self.benchmark.save_metadata
+            )
+            logger.debug(f'Review result: \n{review_result.pretty_print()}')
+
+        def on_error(task_state: TaskState, exc: Exception) -> None:
+            tb_str = traceback.format_exc()
+            logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}')
+            if self.task_config.ignore_errors:
+                logger.warning('Error ignored, continuing with next sample.')
+                return
+            raise exc
+
+        # Run reviews in parallel
+        reviewed_scores = run_in_threads_with_progress(
+            task_states,
+            worker,
+            desc=f'Reviewing[{self.benchmark_name}@{subset}]: ',
+            max_workers=self.task_config.judge_worker_num,
+            heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+            on_error=on_error,
+            # Do not persist interim results when batch scoring is enabled
+            on_result=None if self.benchmark.use_batch_scoring else on_result,
+            filter_none_results=False,
+        )
+
+        # Batch calculate metrics if supported by the benchmark
+        if self.benchmark.use_batch_scoring:
+            reviewed_scores = self._batch_review_task_states(
+                task_states=task_states, reviewed_scores=reviewed_scores, on_result=on_result
+            )
+
+        logger.info(f'Finished reviewing subset: {subset}. Total reviewed: {len(reviewed_scores)}')
+        return cached_score_list + reviewed_scores
+
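The cache handling at the top of `get_reviews` (and the analogous branch in the prediction path) follows a simple split: reuse whatever has already been scored and only re-process the remainder. A minimal sketch of that pattern, using a plain dict in place of the cache manager and a hypothetical `filter_review_cache` stand-in:

```python
# Sketch of the cache-filtering pattern; not the evalscope CacheManager API.
from typing import Dict, List, Tuple


def filter_review_cache(
    cache: Dict[str, float], task_state_ids: List[str]
) -> Tuple[List[float], List[str]]:
    """Split work into already-scored results and still-pending ids."""
    cached_scores = [cache[sid] for sid in task_state_ids if sid in cache]
    pending_ids = [sid for sid in task_state_ids if sid not in cache]
    return cached_scores, pending_ids


cache = {'s1': 1.0, 's3': 0.0}
cached, pending = filter_review_cache(cache, ['s1', 's2', 's3', 's4'])
print(cached)   # [1.0, 0.0]
print(pending)  # ['s2', 's4']
```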
+    def _review_task_state(self, task_state: TaskState) -> SampleScore:
         """
-
-        It is required to rewrite this method to support your own evaluator.
+        Helper method to review a single task state.

         Args:
-
-            use_table: whether to generate table for reports. Default to True.
+            task_state: The task state to review.

-        Returns:
+        Returns:
+            SampleScore: The evaluation score for the task state.
         """
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Compute evaluation metrics using the benchmark's metric calculation
+        sample_score = self.benchmark.calculate_metrics(task_state=task_state)
+        return sample_score
+
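`_review_task_state` delegates the actual scoring to the benchmark's `calculate_metrics`. What that computes is benchmark-specific; for a simple exact-match benchmark it might reduce to something like the illustrative scorer below (not evalscope code):

```python
# Illustrative per-sample scoring in the spirit of calculate_metrics for an
# exact-match benchmark; real metric logic lives in each benchmark adapter.
def exact_match_score(prediction: str, target: str) -> float:
    return 1.0 if prediction.strip().lower() == target.strip().lower() else 0.0


print(exact_match_score('Paris', ' paris '))  # 1.0
print(exact_match_score('Paris', 'London'))   # 0.0
```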
+    def _batch_review_task_states(
+        self, task_states: List[TaskState], reviewed_scores: List[SampleScore],
+        on_result: Callable[[TaskState, SampleScore], None]
+    ) -> List[SampleScore]:
+        valid_indices = [i for i, score in enumerate(reviewed_scores) if score is not None]
+        if not valid_indices:
+            return reviewed_scores
+
+        task_states = [task_states[i] for i in valid_indices]
+        reviewed_scores = [reviewed_scores[i] for i in valid_indices]
+
+        # Iterate in batches with progress bar
+        all_reviewed_scores = []
+        total = len(task_states)
+        batch_size = self.task_config.judge_worker_num
+        with tqdm(total=total, desc='Scoring (batch)', unit='sample') as pbar:
+            for start in range(0, total, batch_size):
+                # Process batch
+                end = min(start + batch_size, total)
+                batch_task_states = task_states[start:end]
+                batch_scores = reviewed_scores[start:end]
+                # Batch calculate metrics
+                updated_reviewed_scores = self.benchmark.batch_calculate_metrics(
+                    task_states=batch_task_states, sample_scores=batch_scores
+                )
+                # Append results
+                all_reviewed_scores.extend(updated_reviewed_scores)
+                # Save each result to cache
+                for task_state, sample_score in zip(batch_task_states, updated_reviewed_scores):
+                    on_result(task_state, sample_score)
+
+                pbar.update(len(batch_task_states))
+        return all_reviewed_scores
+
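The batch path above is plain fixed-size chunking driven by `judge_worker_num`, with the progress bar advanced per batch. A self-contained sketch of the same loop shape, with a placeholder `score_batch` standing in for `benchmark.batch_calculate_metrics`:

```python
# Stand-alone sketch of the chunked scoring loop: process items in fixed-size
# batches and advance a progress bar by the batch size.
from typing import List
from tqdm import tqdm


def score_batch(batch: List[str]) -> List[float]:
    return [float(len(item)) for item in batch]  # placeholder metric


items = [f'sample-{i}' for i in range(10)]
batch_size = 4
all_scores: List[float] = []
with tqdm(total=len(items), desc='Scoring (batch)', unit='sample') as pbar:
    for start in range(0, len(items), batch_size):
        batch = items[start:start + batch_size]
        all_scores.extend(score_batch(batch))
        pbar.update(len(batch))
print(len(all_scores))  # 10
```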
+    def get_report(self, agg_score_dict: Dict[str, List[AggScore]]) -> Report:
         """
-
-        It is required to rewrite this method to support your own evaluator.
+        Generate a comprehensive evaluation report from aggregated scores.

-
-
-
-
-
-        5. Generate report from review results.
+        This method handles:
+        1. Creating the evaluation report from scores
+        2. Generating and displaying a summary table
+        3. Optionally generating detailed analysis
+        4. Saving the report to file

         Args:
-
-            debug: Whether to run in debug mode. Default: False.
+            agg_score_dict: Dictionary mapping subset names to their aggregated scores.

         Returns:
-
-
-            stage == 'all': return the report_map
-            stage == 'infer': return the answers_map
-            stage == 'review': return the reviews_map
+            Report: The complete evaluation report.
         """
+        assert agg_score_dict, 'No scores to generate report from.'
+
+        # Get paths for saving the report
+        report_path = self.cache_manager.get_report_path()
+        report_file = self.cache_manager.get_report_file()
+
+        # Generate the main evaluation report using benchmark-specific logic
+        report = self.benchmark.generate_report(
+            scores=agg_score_dict, model_name=self.model_name, output_dir=report_path
+        )
+
+        # Generate and display a summary table of results
+        try:
+            report_table = gen_table(report_list=[report], add_overall_metric=self.benchmark.add_overall_metric)
+            logger.info(f'\n{self.benchmark_name} report table:'
+                        f'\n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+        # Generate detailed analysis if requested in configuration
+        if self.task_config.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report.generate_analysis(self.task_config.judge_model_args)
+            logger.info(f'Report analysis:\n{analysis}')
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')

-
-
-
-
-        stage_reviews_dict = {}
-
-        prompts = self.load_dataset()
-        for subset_name, prompts_list in prompts.items():
-            limit = kwargs.get('limit', len(prompts_list))
-            prompts_list = prompts_list[:limit]
-
-            answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug, **kwargs)
-            if self.stage == EvalStage.INFER:
-                stage_answers_dict[subset_name] = answers_list
-                continue
-
-            reviews_list: list = self.get_reviews(
-                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
-
-            metric_res = self.compute_metrics(reviews_list=reviews_list)
-            reviews_score_all[subset_name] = metric_res
-            stage_reviews_dict[subset_name] = reviews_list
-
-        if self.stage == EvalStage.INFER:
-            return stage_answers_dict
-
-        if self.stage == EvalStage.REVIEW:
-            return stage_reviews_dict
-
-        # Generate report
-        report_map = self.dump_report(reviews_score_all)
-
-        logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')
+        # Save the complete report to file
+        report.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+        return report

-
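The final step of `get_report` serializes the report with the `Report` object's `to_json`. The sketch below only illustrates the shape of that on-disk artifact using a plain dict and the standard library; the real `Report` class carries its own schema and serialization, and all field names here are placeholders.

```python
# Minimal sketch of the "dump report to file" step with a stand-in report dict.
import json
from pathlib import Path

report = {
    'model_name': 'demo-model',
    'dataset': 'demo-benchmark',
    'metrics': {'accuracy': 0.82, 'num_samples': 100},
}
report_file = Path('outputs') / 'demo_report.json'
report_file.parent.mkdir(parents=True, exist_ok=True)
report_file.write_text(json.dumps(report, indent=2, ensure_ascii=False))
print(f'Dump report to: {report_file}')
```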
+    def finalize(self, *args, **kwargs):
+        self.benchmark.finalize(*args, **kwargs)