evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/run.py
CHANGED
@@ -2,19 +2,16 @@
 """
 Run evaluation for LLMs.
 """
-import os
+import os
 from argparse import Namespace
 from datetime import datetime
 from typing import TYPE_CHECKING, List, Optional, Union

 from evalscope.config import TaskConfig, parse_task_config
 from evalscope.constants import DataCollection, EvalBackend
-from evalscope.utils import seed_everything
 from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import configure_logging, get_logger
-
-if TYPE_CHECKING:
-    from evalscope.models import LocalModel
+from evalscope.utils.model_utils import seed_everything

 logger = get_logger()

@@ -39,25 +36,40 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

     if task_cfg.eval_backend != EvalBackend.NATIVE:
-
+        result = run_non_native_backend(task_cfg, outputs)
     else:
-
+        logger.info('Running with native backend')
+        result = evaluate_model(task_cfg, outputs)
+
+    logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
+    logger.info(f'Output directory: {outputs.outputs_dir}')
+
+    return result


 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     """Set the working directory for the task."""
+    # use cache
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-
+    else:
+        task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

+    # Unify the output directory structure
     if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
         task_cfg.eval_config['time_str'] = run_time
     elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
         task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+    elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+        from evalscope.backend.rag_eval import Tools
+        if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+            task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+        elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+            task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
     return outputs


@@ -83,69 +95,82 @@ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
 def get_backend_manager_class(eval_backend: EvalBackend):
     """Get the backend manager class based on the evaluation backend."""
     if eval_backend == EvalBackend.OPEN_COMPASS:
+        logger.info('Using OpenCompassBackendManager')
         from evalscope.backend.opencompass import OpenCompassBackendManager
         return OpenCompassBackendManager
     elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+        logger.info('Using VLMEvalKitBackendManager')
         from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
         return VLMEvalKitBackendManager
     elif eval_backend == EvalBackend.RAG_EVAL:
+        logger.info('Using RAGEvalBackendManager')
         from evalscope.backend.rag_eval import RAGEvalBackendManager
         return RAGEvalBackendManager
     elif eval_backend == EvalBackend.THIRD_PARTY:
         raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')


-def evaluate_model(
+def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
-    from evalscope.
+    from evalscope.api.evaluator import Evaluator
+    from evalscope.api.model import get_model_with_task_config
+    from evalscope.api.registry import get_benchmark
+    from evalscope.evaluator import DefaultEvaluator
+    from evalscope.report import gen_table

     # Initialize evaluator
     eval_results = {}
-
-
-    for
-
+    # Initialize model
+    model = get_model_with_task_config(task_config=task_config)
+    # Initialize evaluators for each dataset
+    evaluators: List[Evaluator] = []
+    for dataset_name in task_config.datasets:
+        # Create evaluator for each dataset
+        benchmark = get_benchmark(dataset_name, task_config)
+        evaluator = DefaultEvaluator(
+            task_config=task_config,
+            model=model,
+            benchmark=benchmark,
+            outputs=outputs,
+        )
         evaluators.append(evaluator)

+        # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+        if dataset_name != DataCollection.NAME:
+            task_config.dataset_args[dataset_name] = benchmark.to_dict()
+
     # dump task_cfg to outputs.configs_dir after creating evaluators
-
-    logger.info(
+    task_config.dump_yaml(outputs.configs_dir)
+    logger.info(task_config)

+    # Run evaluation for each evaluator
     for evaluator in evaluators:
-        res_dict = evaluator.eval(
-        eval_results[
+        res_dict = evaluator.eval()
+        eval_results[evaluator.benchmark.name] = res_dict
+
+    # Make overall report
+    try:
+        report_table: str = gen_table(reports_path_list=[outputs.reports_dir], add_overall_metric=True)
+        logger.info(f'Overall report table: \n{report_table} \n')
+    except Exception:
+        logger.error('Failed to generate report table.')
+    # Clean up
+    if model is not None:
+        import gc
+
+        del model
+        del evaluators
+        gc.collect()
+
+        from evalscope.utils.import_utils import check_import
+        if check_import('torch', raise_warning=False):
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

     return eval_results


-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
-    """Create an evaluator object for the specified dataset."""
-    from evalscope.benchmarks import Benchmark, BenchmarkMeta
-    from evalscope.evaluator import Evaluator
-    from evalscope.models import initialize_model_adapter
-
-    if dataset_name == DataCollection.NAME:
-        # EvaluatorCollection is a collection of evaluators
-        from evalscope.collections import EvaluatorCollection
-        return EvaluatorCollection(task_cfg, outputs)
-
-    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
-
-    # update task_cfg.dataset_args
-    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
-
-    return Evaluator(
-        dataset_name_or_path=benchmark.dataset_id,
-        data_adapter=data_adapter,
-        model_adapter=model_adapter,
-        outputs=outputs,
-        task_cfg=task_cfg,
-    )
-
-
 def main():
     from evalscope.arguments import parse_args
     args = parse_args()
evalscope/summarizer.py
CHANGED
@@ -7,8 +7,7 @@ from typing import List, Union
 from evalscope.config import TaskConfig, parse_task_config
 from evalscope.constants import EvalBackend
 from evalscope.report import gen_table
-from evalscope.utils import csv_to_list, get_latest_folder_path
-from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
+from evalscope.utils.io_utils import OutputsStructure, csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -30,7 +29,7 @@ class Summarizer:
             with open(report_file, 'r') as f:
                 res_list.append(json.load(f))

-        report_table: str = gen_table([reports_dir])
+        report_table: str = gen_table(reports_path_list=[reports_dir])
         logger.info(f'*** Report table ***\n{report_table}')

         return res_list
@@ -81,7 +80,7 @@ class Summarizer:

             summary_file_path = summary_files[0]
             # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'} # noqa: E501
-            summary_res: List[dict] = csv_to_list(
+            summary_res: List[dict] = csv_to_list(summary_file_path)
             final_res_list.extend(summary_res)
         elif eval_backend == EvalBackend.VLM_EVAL_KIT:
             eval_config = Summarizer.parse_eval_config(candidate_task)
@@ -105,7 +104,8 @@ class Summarizer:
                     summary_res: dict = csv_to_list(summary_file_path)[0]
                 elif summary_file_path.endswith('json'):
                     summary_res: dict = json_to_dict(summary_file_path)
-
+                base_name = os.path.basename(summary_file_path)
+                file_name = os.path.splitext(base_name)[0]
                 final_res_list.append({file_name: summary_res})

         elif eval_backend == EvalBackend.THIRD_PARTY:
evalscope/third_party/longbench_write/infer.py
CHANGED
@@ -8,7 +8,7 @@ import random
 import torch
 from typing import List

-from evalscope.
+from evalscope.third_party.longbench_write.tools.openai_api import OpenaiApi
 from evalscope.third_party.longbench_write.utils import count_words
 from evalscope.utils import get_logger

@@ -0,0 +1,441 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import plotly.graph_objects as go
|
|
5
|
+
import re
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from functools import lru_cache
|
|
8
|
+
from modelscope import AutoTokenizer
|
|
9
|
+
from plotly.subplots import make_subplots
|
|
10
|
+
from tqdm.contrib.concurrent import thread_map
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
from evalscope.third_party.thinkbench.tools.llm import request_url
|
|
14
|
+
from evalscope.third_party.thinkbench.tools.utils import extract_answer
|
|
15
|
+
from evalscope.utils.io_utils import dict_to_json, dump_jsonl_data, json_to_dict, jsonl_to_list
|
|
16
|
+
|
|
17
|
+
cur_path = os.path.dirname(os.path.abspath(__file__))
|
|
18
|
+
|
|
19
|
+
class EvalThink:
|
|
20
|
+
def __init__(self, report_path, tokenizer_path, model_name, dataset_name, subsets, split_strategies='llm', judge_config=None):
|
|
21
|
+
self.report_path = report_path
|
|
22
|
+
self.reformat_template = open(os.path.join(cur_path, 'resources/reformat_template.txt'), 'r').read()
|
|
23
|
+
self.critique_template = open(os.path.join(cur_path, 'resources/critique_template.txt'), 'r').read()
|
|
24
|
+
self.switch_tokens = ['alternatively', 'but wait', 'let me reconsider', 'another way', 'another approach', 'another method', 'another angle']
|
|
25
|
+
self.subset_dict = defaultdict(lambda: defaultdict(list))
|
|
26
|
+
self.think_end_token = '</think>'
|
|
27
|
+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
|
|
28
|
+
self.model_name = model_name
|
|
29
|
+
self.dataset_name = dataset_name
|
|
30
|
+
self.subsets = subsets
|
|
31
|
+
self.metrics = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens','token_efficiency', 'thought_num', 'accuracy']
|
|
32
|
+
self.split_strategies = split_strategies # split by llm, keywords, separator
|
|
33
|
+
self.judge_config = judge_config
|
|
34
|
+
self.model_parse_file_path = os.path.join(self.report_path, 'answer_index.jsonl')
|
|
35
|
+
self.model_parse_dict = self.__init_parse_file()
|
|
36
|
+
|
|
37
|
+
def __init_parse_file(self):
|
|
38
|
+
if not os.path.exists(self.model_parse_file_path):
|
|
39
|
+
return {}
|
|
40
|
+
else:
|
|
41
|
+
list_file = jsonl_to_list(self.model_parse_file_path)
|
|
42
|
+
# convert to dict prompt as key, answer_index as value
|
|
43
|
+
return {item['prompt']: item['answer_index'] for item in list_file}
|
|
44
|
+
|
|
45
|
+
def get_think_part(self, message: dict) -> str:
|
|
46
|
+
if 'reasoning_content' in message and message['reasoning_content']:
|
|
47
|
+
return message['reasoning_content']
|
|
48
|
+
else:
|
|
49
|
+
text = message['content']
|
|
50
|
+
last_think_end = text.rfind(self.think_end_token)
|
|
51
|
+
return text[:last_think_end]
|
|
52
|
+
|
|
53
|
+
@lru_cache(maxsize=None)
|
|
54
|
+
def cal_tokens(self, text: str):
|
|
55
|
+
return len(self.tokenizer.encode(text, add_special_tokens=False))
|
|
56
|
+
|
|
57
|
+
def process_choice(self, choice, problem):
|
|
58
|
+
think_part = self.get_think_part(choice['message'])
|
|
59
|
+
answer = choice['review']['gold']
|
|
60
|
+
tokens = self.cal_tokens(think_part)
|
|
61
|
+
switch_count = sum(think_part.lower().count(token) for token in self.switch_tokens)
|
|
62
|
+
useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
|
|
63
|
+
reflection_tokens = tokens - useful_tokens
|
|
64
|
+
# score = choice['review']['result']
|
|
65
|
+
score = 0 if useful_tokens == 0 else 1
|
|
66
|
+
return tokens, switch_count, useful_tokens, reflection_tokens, score
|
|
67
|
+
|
|
68
|
+
def process_item(self, item):
|
|
69
|
+
problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
|
|
70
|
+
results = []
|
|
71
|
+
for choice in item['choices']:
|
|
72
|
+
results.append(self.process_choice(choice, problem))
|
|
73
|
+
break # only process the first choice
|
|
74
|
+
|
|
75
|
+
total_tokens, switch_counts, useful_tokens, reflection_tokens, scores = zip(*results)
|
|
76
|
+
|
|
77
|
+
avg_tokens = sum(total_tokens) / len(total_tokens)
|
|
78
|
+
avg_thought_num = sum(switch_counts) / len(switch_counts)
|
|
79
|
+
avg_token_efficiency = sum(useful_tokens) / sum(total_tokens)
|
|
80
|
+
avg_accuracy = sum(scores) / len(scores)
|
|
81
|
+
avg_useful_tokens = sum(useful_tokens) / len(useful_tokens)
|
|
82
|
+
avg_reflection_tokens = sum(reflection_tokens) / len(reflection_tokens)
|
|
83
|
+
return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens
|
|
84
|
+
|
|
85
|
+
def split_by_llm(self, response, problem) -> List[str]:
|
|
86
|
+
response = response.replace('\n', ' ') # remove newline characters
|
|
87
|
+
prompt = self.reformat_template.format(problem=problem, response=response)
|
|
88
|
+
llm_response = request_url(self.judge_config, prompt)
|
|
89
|
+
return llm_response.split('\n\n')
|
|
90
|
+
|
|
91
|
+
def split_by_keywords(self, text) -> List[str]:
|
|
92
|
+
pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, self.switch_tokens)))
|
|
93
|
+
segments = re.split(pattern, text)
|
|
94
|
+
# remove empty segments
|
|
95
|
+
segments = [segment.strip() for segment in segments if segment.strip()]
|
|
96
|
+
|
|
97
|
+
return segments if segments else [text]
|
|
98
|
+
|
|
99
|
+
def split_by_separator(self, text) -> List[str]:
|
|
100
|
+
return text.split('\n\n')
|
|
101
|
+
|
|
102
|
+
def get_answer_index(self, response: List[str], problem: str, answer: str) -> int:
|
|
103
|
+
tagged_response = ''
|
|
104
|
+
for sdx, step in enumerate(response):
|
|
105
|
+
tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
|
|
106
|
+
tagged_response = tagged_response.strip()
|
|
107
|
+
|
|
108
|
+
prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
|
|
109
|
+
if prompt in self.model_parse_dict:
|
|
110
|
+
answer_index = self.model_parse_dict[prompt]
|
|
111
|
+
else:
|
|
112
|
+
llm_response = request_url(self.judge_config, prompt)
|
|
113
|
+
if not llm_response:
|
|
114
|
+
answer_index = -1
|
|
115
|
+
else:
|
|
116
|
+
answer_index = extract_answer(llm_response)
|
|
117
|
+
|
|
118
|
+
dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
|
|
119
|
+
self.model_parse_file_path, dump_mode='append')
|
|
120
|
+
try:
|
|
121
|
+
answer_index = int(answer_index)
|
|
122
|
+
except Exception:
|
|
123
|
+
answer_index = -1
|
|
124
|
+
return answer_index
|
|
125
|
+
|
|
126
|
+
def get_first_correct(self, response: str, problem: str, answer: str) -> str:
|
|
127
|
+
if self.split_strategies == 'llm':
|
|
128
|
+
text_list = self.split_by_llm(response, problem)
|
|
129
|
+
elif self.split_strategies == 'keywords':
|
|
130
|
+
text_list = self.split_by_keywords(response)
|
|
131
|
+
else:
|
|
132
|
+
text_list = self.split_by_separator(response)
|
|
133
|
+
|
|
134
|
+
answer_index = self.get_answer_index(text_list, problem, answer)
|
|
135
|
+
|
|
136
|
+
if answer_index == -1: # no correct answer found
|
|
137
|
+
first_correct = ''
|
|
138
|
+
else:
|
|
139
|
+
first_correct = '\n\n'.join(text_list[: answer_index])
|
|
140
|
+
return first_correct
|
|
141
|
+
|
|
142
|
+
    def plot_metrics(self, results, output_dir):
        # Change layout to 2x3
        fig = make_subplots(rows=2, cols=3,
                            subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
                                            'Token Efficiency', 'Thought Num', 'Accuracy'),
                            shared_xaxes=True, x_title='Subsets',
                            vertical_spacing=0.1,  # Decrease vertical spacing between subplots
                            horizontal_spacing=0.1)  # Decrease horizontal spacing between subplots

        metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
                         'token_efficiency', 'thought_num', 'accuracy']

        for i, metric in enumerate(metrics_order, start=1):
            y_values = [results[metric][subset] for subset in self.subsets]
            # Determine row and column for 2x3 layout
            row = (i - 1) // 3 + 1
            col = (i - 1) % 3 + 1
            fig.add_trace(
                go.Scatter(x=list(range(len(self.subsets))), y=y_values,
                           mode='lines+markers',
                           name=metric.replace('_', ' ').title()),
                row=row, col=col
            )
            # Add annotations for each data point
            for j, y in enumerate(y_values):
                fig.add_annotation(
                    x=j,
                    y=y,
                    text=f'{y:.2f}',
                    showarrow=False,
                    yshift=10,
                    row=row,
                    col=col
                )

        fig.update_layout(
            height=800,  # Adjust height for 2x3 layout
            width=1200,  # Adjust width for 2x3 layout
            title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
        )

        for i in range(1, len(metrics_order) + 1):
            row = (i - 1) // 3 + 1
            col = (i - 1) % 3 + 1
            fig.update_xaxes(
                ticktext=self.subsets,
                tickvals=list(range(len(self.subsets))),
                row=row, col=col
            )
            fig.update_yaxes(title_text=metrics_order[i - 1].replace('_', ' ').title(), row=row, col=col)

        # Update y-axis ranges
        fig.update_yaxes(range=[500, 5000], row=1, col=1)  # Reasoning Tokens
        fig.update_yaxes(range=[0, 3000], row=1, col=2)  # First Correct Tokens
        fig.update_yaxes(range=[0, 3000], row=1, col=3)  # Reflection Tokens
        fig.update_yaxes(range=[0, 1], row=2, col=1)  # Token Efficiency
        fig.update_yaxes(range=[0, 13], row=2, col=2)  # Thought Num
        fig.update_yaxes(range=[0, 1], row=2, col=3)  # Accuracy

        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
        fig.write_image(output_path)
        print(f'save figure to: {output_path}')

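    # For reference, plot_metrics expects `results` keyed first by metric and then by
    # subset, e.g. (shape only; the numbers here are invented):
    #
    #     results = {
    #         'reasoning_tokens': {'Level 1': 812.0, 'Level 2': 1304.5},
    #         'accuracy': {'Level 1': 0.98, 'Level 2': 0.95},
    #         ...
    #     }
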
    def filter_df(self, df, response_len: int = 8000, count: int = 10):
        def is_valid_row(row):
            return all(self.cal_tokens(choice['message']['content']) <= response_len for choice in row['choices'])

        bools = df.apply(is_valid_row, axis=1)

        return df[bools].head(count)

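    # Toy illustration of the filter above (whitespace word count stands in for
    # cal_tokens; the data is made up): only rows whose every choice fits the budget survive.
    #
    #     df = pd.DataFrame({'choices': [
    #         [{'message': {'content': 'short answer'}}],
    #         [{'message': {'content': 'word ' * 20000}}],   # over an 8000-token budget
    #     ]})
    #     keep = df.apply(lambda r: all(len(c['message']['content'].split()) <= 8000
    #                                   for c in r['choices']), axis=1)
    #     df[keep].head(10)   # -> only the first row
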
    def evaluate(self, output_dir, max_tokens=8000, count=50, workers=128):
        for subset in self.subsets:
            review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
            review_df = pd.read_json(review_path, lines=True)

            review_df = self.filter_df(review_df, response_len=max_tokens, count=count)

            results = thread_map(
                self.process_item,
                (item for _, item in review_df.iterrows()),
                desc=f'Evaluating {subset}',
                total=len(review_df),
                max_workers=workers
            )

            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens = zip(*results)

            self.subset_dict[subset]['reasoning_tokens'] = sum(avg_tokens) / len(avg_tokens)
            self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
            self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
            self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
            self.subset_dict[subset]['first_correct_tokens'] = sum(avg_useful_tokens) / len(avg_useful_tokens)
            self.subset_dict[subset]['reflection_tokens'] = sum(avg_reflection_tokens) / len(avg_reflection_tokens)

        results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
                   for metric in self.metrics}

        self.plot_metrics(results, output_dir)

        # save results to json
        dict_to_json(results, os.path.join(self.report_path, 'think_eval_results.json'))
        return results

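    # The zip(*results) above assumes process_item yields 6-tuples in this order
    # (inferred from the assignments that follow):
    #
    #     (reasoning_tokens, thought_num, token_efficiency,
    #      accuracy, first_correct_tokens, reflection_tokens)
    #
    # e.g. two toy items (1200, 4, 0.55, 1.0, 650, 550) and (900, 3, 0.60, 0.0, 540, 360)
    # average to reasoning_tokens 1050 and accuracy 0.5 for that subset.
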
def run_task(config, output_dir='outputs', max_tokens=8000, count=50, workers=128):
    evaluator = EvalThink(**config)
    results = evaluator.evaluate(output_dir, max_tokens, count, workers)
    print(results)

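# Usage sketch (hypothetical values; the real configs are defined below): run_task
# builds EvalThink from a plain dict and runs the evaluation end to end.
#
#     example_config = dict(
#         report_path='outputs/<run_timestamp>',     # evalscope output directory
#         model_name='my-model',
#         tokenizer_path='org/my-model',
#         dataset_name='math_500',
#         subsets=['Level 1', 'Level 2'],
#         split_strategies='separator',              # or 'llm' / 'keywords'
#         judge_config=judge_config,
#     )
#     run_task(example_config, output_dir='outputs', max_tokens=8000, count=50, workers=16)
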
def combine_results(configs: List[dict], output_path: str):
    """
    Combine evaluation results from multiple model configs into one plot.
    All models' results for the same metric are shown in the same subplot for easy comparison.

    Args:
        configs: List of model config dicts containing model_name and report_path
        output_path: Path of the comparison figure to write
    """
    # Combine results from different runs
    combined_results = defaultdict(lambda: defaultdict(dict))
    for config in configs:
        model_name = config['model_name']
        report_path = config['report_path']
        # Results is a dict keyed by metric, then by subset
        results = json_to_dict(os.path.join(report_path, 'think_eval_results.json'))
        combined_results[model_name] = results

    # Create a 2x3 subplot layout, one subplot per metric
    fig = make_subplots(rows=2, cols=3,
                        subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
                                        'Token Efficiency', 'Thought Num', 'Accuracy'),
                        shared_xaxes=True, x_title='Subsets',
                        vertical_spacing=0.08,  # Reduce vertical spacing between subplots
                        horizontal_spacing=0.05)  # Reduce horizontal spacing between subplots

    metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
                     'token_efficiency', 'thought_num', 'accuracy']

    # Assign a distinct color to each model
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

    # Plot each metric in a separate subplot
    for i, metric in enumerate(metrics_order, start=1):
        row = (i - 1) // 3 + 1
        col = (i - 1) % 3 + 1

        # Get subsets from the first model (assuming all models share the same subsets)
        subsets = list(next(iter(combined_results.values()))[metric].keys())

        # Add every model's data for this metric to the same subplot
        for j, (model_name, results) in enumerate(combined_results.items()):
            y_values = [results[metric][subset] for subset in subsets]

            fig.add_trace(
                go.Scatter(x=subsets, y=y_values,
                           mode='lines+markers',
                           name=model_name,  # Just the model name; metrics are shown in subplot titles
                           line=dict(color=colors[j % len(colors)]),
                           showlegend=(i == 1)),  # Only show the legend for the first metric
                row=row, col=col
            )

            # Add value annotations
            for k, y in enumerate(y_values):
                fig.add_annotation(
                    x=subsets[k],
                    y=y,
                    text=f'{y:.2f}',
                    showarrow=False,
                    yshift=10,
                    font=dict(size=12, color=colors[j % len(colors)]),
                    row=row, col=col
                )

        # Per-metric y-axis ranges could be pinned here if needed, e.g.:
        # if metric == 'token_efficiency':
        #     fig.update_yaxes(range=[0.2, 0.7], row=row, col=col)
        # elif metric == 'accuracy':
        #     fig.update_yaxes(range=[0.8, 1], row=row, col=col)

        fig.update_yaxes(title_text=metric.replace('_', ' ').title(), row=row, col=col)

    # Update layout
    fig.update_layout(
        height=1000,  # Increase height
        width=1500,  # Increase width
        title_text='Model Comparison Across Evaluation Metrics on MATH-500',
        title=dict(font=dict(size=22)),  # Larger title font
        font=dict(size=14),  # Larger overall font
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=1.02,
            xanchor='right',
            x=1,
            font=dict(size=14)  # Larger legend font
        )
    )

    # Save plot (ensure the target directory exists)
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    fig.write_image(output_path)
    print(f'Model comparison plot saved to {output_path}')

    return combined_results

judge_config = dict(
    api_key='EMPTY',
    base_url='http://0.0.0.0:8801/v1',
    model_name='Qwen2.5-72B-Instruct',
)

distill_qwen_config = dict(
    report_path='../eval-scope/outputs/20250218_180219',
    model_name='DeepSeek-R1-Distill-Qwen-7B',
    tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

math_qwen_config = dict(
    report_path='../eval-scope/outputs/20250219_202358',
    model_name='Qwen2.5-Math-7B-Instruct',
    tokenizer_path='Qwen/Qwen2.5-Math-7B-Instruct',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

r1_config = dict(
    report_path='../eval-scope/outputs/20250307_000404',
    model_name='deepseek-r1',
    tokenizer_path='deepseek-ai/DeepSeek-R1',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

qwq_preview_config = dict(
    report_path='../eval-scope/outputs/20250221_105911',
    model_name='qwq-32b-preview',
    tokenizer_path='Qwen/QwQ-32B-Preview',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

qwq_config = dict(
    report_path='../eval-scope/outputs/20250306_181550',
    model_name='QwQ-32B',
    tokenizer_path='Qwen/QwQ-32B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

distill_qwen_32b = dict(
    report_path='../eval-scope/outputs/20250306_235951',
    model_name='deepseek-r1-distill-qwen-32b',
    tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

qwen3_32b_think = dict(
    report_path='../eval-scope/outputs/20250428_151817',
    model_name='Qwen3-32B',
    tokenizer_path='Qwen/Qwen3-32B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

if __name__ == '__main__':
    # run_task(distill_qwen_config, count=80)
    # run_task(math_qwen_config)
    # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
    # run_task(r1_config, max_tokens=20000, count=200, workers=128)
    # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
    run_task(qwen3_32b_think, max_tokens=20000, count=200, workers=128)
    # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)

    # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
    # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
    # combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
    combine_results([qwq_config, r1_config, distill_qwen_32b, qwen3_32b_think], output_path='outputs/model_comparison_metrics_4models.png')