evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/api/benchmark/benchmark.py (new file)
@@ -0,0 +1,404 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import contextlib
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from evalscope.api.dataset import DatasetDict, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.filter import FilterEnsemble, build_filter_ensemble
+from evalscope.api.metric import AggScore, SampleScore
+from evalscope.api.mixin import LLMJudgeMixin, SandboxMixin
+from evalscope.api.model import Model
+from evalscope.report import Report
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import BenchmarkMeta
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class DataAdapter(LLMJudgeMixin, SandboxMixin, ABC):
+    """
+    Data Adapter for the benchmark.
+    """
+
+    def __init__(self, benchmark_meta: 'BenchmarkMeta', task_config: Optional['TaskConfig'] = None):
+        self._benchmark_meta = benchmark_meta
+        self._task_config = task_config
+        super().__init__(task_config=task_config)
+
+        self.reformat_subset = False
+        """Whether to reformat the subset data with subset key"""
+
+        self.split_as_subset = False
+        """Whether to use the split name as the dataset subsets"""
+
+        self.shuffle_choices = False
+        """Whether to shuffle the choices in the dataset"""
+
+        self.use_batch_scoring = False
+        """Whether to use batch scoring for metrics that support it, need to be enabled in the benchmark as well"""
+
+        self.save_metadata = True
+        """Whether to save metadata in the review result"""
+
+        self.add_aggregation_name = True
+        """Whether to add aggregation name in the report"""
+
+        self.add_overall_metric = True
+        """Whether to add overall metric in the report"""
+
+        self.category_map = {}
+        """Category map for the benchmark"""
+
+        self.current_subset_name = ''
+        """Subset name when loading datasets"""
+
+        # dataset
+        self.test_dataset: Optional[DatasetDict] = None
+        """Dataset to be evaluated"""
+
+        self.fewshot_dataset: Optional[DatasetDict] = None
+        """Dataset for few-shot evaluation"""
+
+        # filters
+        self._filter_ensemble: Optional[OrderedDict] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the benchmark metadata to a dictionary."""
+        return self._benchmark_meta.to_string_dict()
+
+    @abstractmethod
+    def load_dataset(self) -> DatasetDict:
+        pass
+
+    @abstractmethod
+    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+        pass
+
+    @abstractmethod
+    def calculate_metrics(self, task_state: TaskState) -> SampleScore:
+        pass
+
+    @abstractmethod
+    def batch_calculate_metrics(self, task_states: List[TaskState],
+                                sample_scores: List[SampleScore]) -> List[SampleScore]:
+        """Batch calculate metrics for a list of task states. Need to update sample_scores in place."""
+        pass
+
+    @abstractmethod
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        pass
+
+    @abstractmethod
+    def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str, output_dir: str, **kwargs) -> Report:
+        """
+        Generate a report based on the evaluation results.
+        """
+        pass
+
+    @abstractmethod
+    def finalize(self, *args, **kwargs) -> None:
+        """Finalize the evaluation process."""
+        pass
+
+    @property
+    def name(self) -> str:
+        """
+        Return the unique name of the benchmark.
+        """
+        return self._benchmark_meta.name
+
+    @property
+    def dataset_id(self) -> str:
+        """
+        Return the dataset ID or path to the benchmark.
+        """
+        return self._benchmark_meta.dataset_id
+
+    @property
+    def output_types(self) -> Optional[List[str]]:
+        """
+        Return the output types of the benchmark.
+        """
+        return self._benchmark_meta.output_types
+
+    @property
+    def limit(self) -> Optional[Union[int, float]]:
+        """
+        Return the limit for the benchmark.
+        """
+        return self._task_config.limit
+
+    @property
+    def repeats(self) -> int:
+        """
+        Return the number of repeats for each sample in the benchmark.
+        """
+        return self._task_config.repeats
+
+    @property
+    def dataset_hub(self) -> str:
+        """
+        Return the dataset hub type for the benchmark.
+        """
+        return self._task_config.dataset_hub
+
+    @dataset_hub.setter
+    def dataset_hub(self, value: str):
+        """
+        Set the dataset hub type for the benchmark.
+        """
+        self._task_config.dataset_hub = value
+
+    @property
+    def eval_type(self) -> str:
+        """
+        Return the evaluation type for the benchmark.
+        """
+        return self._task_config.eval_type
+
+    @property
+    def subset_list(self) -> List[str]:
+        """
+        Return the subset list of the benchmark.
+        """
+        return self._benchmark_meta.subset_list
+
+    @subset_list.setter
+    def subset_list(self, value: List[str]):
+        """
+        Set the subset list of the benchmark.
+        """
+        self._benchmark_meta.subset_list = value
+
+    @property
+    def metric_list(self) -> List[Union[str, Dict[str, Any]]]:
+        """
+        Return the metric list of the benchmark.
+        """
+        return self._benchmark_meta.metric_list
+
+    @property
+    def default_subset(self) -> str:
+        """
+        Return the default subset of the benchmark.
+        """
+        return self._benchmark_meta.default_subset
+
+    @default_subset.setter
+    def default_subset(self, value: str):
+        """
+        Set the default subset of the benchmark.
+        """
+        self._benchmark_meta.default_subset = value
+
+    @property
+    def few_shot_num(self) -> int:
+        """
+        Return the few shot number of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_num
+
+    @few_shot_num.setter
+    def few_shot_num(self, value: int):
+        """
+        Set the few shot number of the benchmark.
+        """
+        self._benchmark_meta.few_shot_num = value
+
+    @property
+    def few_shot_random(self) -> bool:
+        """
+        Return whether few shot is random for the benchmark.
+        """
+        return self._benchmark_meta.few_shot_random
+
+    @property
+    def train_split(self) -> Optional[str]:
+        """
+        Return the train split of the benchmark.
+        """
+        return self._benchmark_meta.train_split
+
+    @train_split.setter
+    def train_split(self, value: str):
+        """
+        Set the train split of the benchmark.
+        """
+        self._benchmark_meta.train_split = value
+
+    @property
+    def eval_split(self) -> Optional[str]:
+        """
+        Return the eval split of the benchmark.
+        """
+        return self._benchmark_meta.eval_split
+
+    @eval_split.setter
+    def eval_split(self, value: str):
+        """
+        Set the eval split of the benchmark.
+        """
+        self._benchmark_meta.eval_split = value
+
+    @property
+    def prompt_template(self) -> Optional[str]:
+        """
+        Return the prompt template of the benchmark.
+        """
+        return self._benchmark_meta.prompt_template
+
+    @prompt_template.setter
+    def prompt_template(self, value: str):
+        """
+        Set the prompt template of the benchmark.
+        """
+        self._benchmark_meta.prompt_template = value
+
+    @property
+    def system_prompt(self) -> Optional[str]:
+        """
+        Return the system prompt of the benchmark.
+        """
+        return self._benchmark_meta.system_prompt
+
+    @property
+    def query_template(self) -> Optional[str]:
+        """
+        Return the query template of the benchmark.
+        """
+        return self._benchmark_meta.query_template
+
+    @property
+    def few_shot_prompt_template(self) -> Optional[str]:
+        """
+        Return the few-shot prompt template of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_prompt_template
+
+    @property
+    def pretty_name(self) -> Optional[str]:
+        """
+        Return the pretty name of the benchmark.
+        """
+        return self._benchmark_meta.pretty_name
+
+    @property
+    def description(self) -> Optional[str]:
+        """
+        Return the description of the benchmark.
+        """
+        return self._benchmark_meta.description
+
+    @property
+    def tags(self) -> Optional[List[str]]:
+        """
+        Return the tags of the benchmark.
+        """
+        return self._benchmark_meta.tags
+
+    @property
+    def filters(self) -> Optional[OrderedDict]:
+        """
+        Return the filters of the benchmark.
+        """
+        return self._benchmark_meta.filters
+
+    @property
+    def filter_ensemble(self) -> Optional[FilterEnsemble]:
+        """
+        Return the filter ensemble of the benchmark.
+        """
+        if self._filter_ensemble is None:
+            if self.filters:
+                self._filter_ensemble = build_filter_ensemble(filters=self.filters)
+        return self._filter_ensemble
+
+    @property
+    def aggregation(self) -> str:
+        """
+        Return the aggregation function for the metrics.
+        """
+        return self._benchmark_meta.aggregation
+
+    @property
+    def extra_params(self) -> Optional[Dict]:
+        """
+        Return the extra parameters of the benchmark.
+        """
+        return self._benchmark_meta.extra_params
+
+    @property
+    def seed(self) -> Optional[int]:
+        """
+        Return the seed for the benchmark.
+        """
+        return self._task_config.seed
+
+    @property
+    def shuffle(self) -> bool:
+        """
+        Return whether to shuffle the dataset before evaluation.
+        """
+        return self._benchmark_meta.shuffle
+
+    @shuffle.setter
+    def shuffle(self, value: bool):
+        """
+        Set whether to shuffle the dataset before evaluation.
+        """
+        self._benchmark_meta.shuffle = value
+
+    @property
+    def shuffle_choices(self) -> bool:
+        """
+        Return whether to shuffle the choices in multiple-choice datasets.
+        """
+        return self._benchmark_meta.shuffle_choices
+
+    @shuffle_choices.setter
+    def shuffle_choices(self, value: bool):
+        """
+        Set whether to shuffle the choices in multiple-choice datasets.
+        """
+        self._benchmark_meta.shuffle_choices = value
+
+    @property
+    def review_timeout(self) -> Optional[float]:
+        """
+        Return the timeout for the review process.
+        """
+        return self._benchmark_meta.review_timeout
+
+    @review_timeout.setter
+    def review_timeout(self, value: float):
+        """
+        Set the timeout for the review process.
+        """
+        self._benchmark_meta.review_timeout = value
+
+    @contextlib.contextmanager
+    def _temporary_attribute(self, attr_name: str, new_value):
+        """
+        Set a temporary value for an attribute and restore the original value after the context block.
+
+        Args:
+            attr_name: The name of the attribute to temporarily set.
+            new_value: The new value to set for the attribute.
+        """
+        had_attr = hasattr(self, attr_name)
+        original_value = getattr(self, attr_name, None) if had_attr else None
+
+        setattr(self, attr_name, new_value)
+        try:
+            yield
+        finally:
+            if had_attr:
+                setattr(self, attr_name, original_value)
+            else:
+                delattr(self, attr_name)

evalscope/api/benchmark/meta.py (new file)
@@ -0,0 +1,124 @@
+import copy
+from collections import OrderedDict
+from dataclasses import asdict, dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union
+
+from evalscope.constants import OutputType
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import DataAdapter
+
+
+@dataclass
+class BenchmarkMeta:
+    """Metadata for a benchmark, including dataset and model configurations."""
+
+    name: str
+    """ Unique name of the benchmark."""
+
+    dataset_id: str
+    """ Dataset id on modelscope or path to local dataset."""
+
+    data_adapter: Optional[Type['DataAdapter']] = None
+    """ Data adapter class for the benchmark."""
+
+    output_types: List[str] = field(default_factory=lambda: [OutputType.GENERATION])
+    """ List of output types supported by the benchmark."""
+
+    subset_list: List[str] = field(default_factory=lambda: ['default'])
+    """ List of subsets available for the benchmark."""
+
+    default_subset: str = 'default'
+    """ Default subset to use for the benchmark."""
+
+    few_shot_num: int = 0
+    """ Number of few-shot examples to use."""
+
+    few_shot_random: bool = False
+    """ Whether to use random few-shot examples."""
+
+    train_split: Optional[str] = None
+    """ Training split to use for the benchmark."""
+
+    eval_split: Optional[str] = None
+    """ Evaluation split to use for the benchmark."""
+
+    prompt_template: Optional[str] = None
+    """ Prompt template to use for the benchmark."""
+
+    few_shot_prompt_template: Optional[str] = None
+    """ Few-shot prompt template to use for the benchmark."""
+
+    system_prompt: Optional[str] = None
+    """ System prompt to use for the benchmark."""
+
+    query_template: Optional[str] = None
+    """ Query template to use for the benchmark."""
+
+    pretty_name: Optional[str] = None
+    """ Human-readable name for the benchmark."""
+
+    description: Optional[str] = None
+    """ Description of the benchmark."""
+
+    tags: List[str] = field(default_factory=list)
+    """ Tags associated with the benchmark."""
+
+    filters: Optional[OrderedDict] = None
+    """ Filters to apply to the dataset on model output."""
+
+    metric_list: List[Union[str, Dict[str, Any]]] = field(default_factory=list)
+    """ List of metrics to evaluate the benchmark."""
+
+    aggregation: str = 'mean'
+    """ Aggregation function for the metrics. Default is 'mean'. Can be 'mean', 'pass@<k>' or a custom function name."""
+
+    shuffle: bool = False
+    """Whether to shuffle the dataset before evaluation."""
+
+    shuffle_choices: bool = False
+    """Whether to shuffle the choices in multiple-choice datasets."""
+
+    review_timeout: Optional[float] = None
+    """Timeout for review in seconds."""
+
+    extra_params: Dict = field(default_factory=dict)
+    """Additional parameters for the benchmark."""
+
+    def __post_init__(self):
+        """Validate fields after initialization."""
+        if self.few_shot_num < 0:
+            raise ValueError('few_shot_num must be >= 0')
+
+    def _update(self, args: dict):
+        """Update instance with provided arguments, maintaining backward compatibility."""
+        args = copy.deepcopy(args)
+
+        if args.get('local_path'):
+            self.dataset_id = args['local_path']
+            del args['local_path']
+
+        if args.get('filters'):
+            if self.filters is None:
+                self.filters = OrderedDict()
+            new_filters = OrderedDict(args['filters'])
+            # insert filters at the beginning
+            self.filters = OrderedDict(list(new_filters.items()) + list(self.filters.items()))
+            del args['filters']
+        # Update fields with validation
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)  # Validate few_shot_num if it's being updated
+                if key == 'few_shot_num' and value < 0:
+                    raise ValueError('few_shot_num must be >= 0')
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary, maintaining backward compatibility."""
+        return asdict(self)
+
+    def to_string_dict(self) -> dict:
+        """Convert to string dictionary, excluding data_adapter."""
+        cur_dict = copy.deepcopy(asdict(self))
+        if 'data_adapter' in cur_dict:
+            del cur_dict['data_adapter']
+        return cur_dict
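
For orientation, here is a minimal sketch (not part of the package diff) of what the abstract DataAdapter interface in benchmark.py above asks a concrete adapter to provide. The class name and the elided bodies are hypothetical; the concrete benchmarks listed earlier presumably build on these hooks via the adapter base classes under evalscope/api/benchmark/adapters/.

# Illustrative sketch only -- not shipped in the wheel. Names and bodies are hypothetical.
from typing import Dict, List

from evalscope.api.benchmark import DataAdapter
from evalscope.api.dataset import DatasetDict, Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.metric import AggScore, SampleScore
from evalscope.api.model import Model
from evalscope.report import Report


class MyBenchmarkAdapter(DataAdapter):

    def load_dataset(self) -> DatasetDict:
        ...  # build a DatasetDict from self.dataset_id / self.subset_list

    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
        ...  # run the model on one sample and wrap the result in a TaskState

    def calculate_metrics(self, task_state: TaskState) -> SampleScore:
        ...  # score a single completed TaskState

    def batch_calculate_metrics(self, task_states: List[TaskState],
                                sample_scores: List[SampleScore]) -> List[SampleScore]:
        ...  # only used when use_batch_scoring is enabled; update sample_scores in place

    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
        ...  # e.g. mean or pass@k per subset, per the 'aggregation' setting

    def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str,
                        output_dir: str, **kwargs) -> Report:
        ...  # turn aggregated scores into a Report

    def finalize(self, *args, **kwargs) -> None:
        ...  # cleanup after evaluation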
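
A similarly illustrative usage sketch for the BenchmarkMeta dataclass in meta.py; only the field names come from the diff, the benchmark name, dataset id, and metric values are made up for the example.

# Hypothetical values throughout.
from evalscope.api.benchmark import BenchmarkMeta

meta = BenchmarkMeta(
    name='my_mcq',                       # unique benchmark name
    dataset_id='my-org/my-mcq-dataset',  # ModelScope id or local path
    subset_list=['default'],
    few_shot_num=5,                      # __post_init__ rejects negative values
    eval_split='test',
    metric_list=['acc'],
    aggregation='mean',
)

# _update() merges overrides: the legacy 'local_path' key is mapped onto dataset_id,
# new filters are prepended, and other known fields are set directly.
meta._update({'local_path': '/data/my_mcq', 'few_shot_num': 0})
assert meta.dataset_id == '/data/my_mcq' and meta.few_shot_num == 0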