evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
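The additions above center on the new `evalscope.api` framework (benchmark adapters, dataset loaders, metrics, model APIs, registry) plus a large batch of new benchmark adapters, while the legacy `benchmarks/data_adapter.py`, `models/*_adapter.py`, and `registry/` modules are removed. As a quick orientation only, the sketch below drives one of the registered benchmarks through the 1.x Python entry point; the model id, dataset choice, and `limit` are placeholders, and the `TaskConfig` field names should be checked against the 1.2.0 documentation rather than taken from this diff.

```python
# Minimal sketch, assuming evalscope's documented TaskConfig/run_task entry point;
# the model id, dataset, and limit below are illustrative placeholders.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder ModelScope model id
    datasets=['gsm8k'],                  # any benchmark name registered by the adapters above
    limit=5,                             # small smoke-test subset
)
run_task(task_cfg=task_cfg)
```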
evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py (new file)

@@ -0,0 +1,229 @@

```python
import json
import os
import traceback
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, List

from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
from evalscope.api.dataset import Sample
from evalscope.api.dataset.dataset import DatasetDict
from evalscope.api.dataset.loader import DictDataLoader
from evalscope.api.evaluator import TaskState
from evalscope.api.messages.chat_message import ChatMessageUser
from evalscope.api.metric import Score
from evalscope.api.model import Model, ModelOutput
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.report import Report
from evalscope.utils.function_utils import thread_safe
from evalscope.utils.import_utils import check_import
from evalscope.utils.logger import get_logger
from .utils import (
    ALL_SCORING_CATEGORIES,
    compute_aggregate_subsets,
    compute_entry_result,
    load_bfcl_data,
    process_test_entries,
    run_prereq_inference,
)

logger = get_logger()


@register_benchmark(
    BenchmarkMeta(
        name='bfcl_v4',
        pretty_name='BFCL-v4',
        tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
        description='With function-calling being the building blocks of Agents, '
        'the Berkeley Function-Calling Leaderboard (BFCL) V4 presents a holistic agentic '
        'evaluation for LLMs. BFCL V4 Agentic includes web search, memory, and format sensitivity. '
        'Together, the ability to web search, read and write from memory, and the ability to invoke '
        'functions in different languages present the building blocks for the exciting and extremely '
        'challenging avenues that power agentic LLMs today from deep-research, to agents for coding and law. '
        'Need to run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v4.html)',
        dataset_id='https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard',
        subset_list=ALL_SCORING_CATEGORIES,
        metric_list=['acc'],
        eval_split='train',
        extra_params={
            'underscore_to_dot': True,
            'is_fc_model': True,
            'SERPAPI_API_KEY': None,
        }
    )
)
class BFCLV4Adapter(AgentAdapter):
    """
    BFCL adapter using the new data processing framework.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)

        self.add_overall_metric = False
        self.add_aggregation_name = False

        self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
        self.is_fc_model = self.extra_params.get('is_fc_model', True)
        # Set SERPAPI_API_KEY in environment variables if provided
        serpapi_api_key = self.extra_params.get('SERPAPI_API_KEY', None)
        if serpapi_api_key:
            os.environ['SERPAPI_API_KEY'] = serpapi_api_key
        self.model_result_dir = Path(self._task_config.work_dir) if self._task_config else Path('./bfcl_model_results')
        self.handler = None
        self.prereq_entries = []
        self.prereq_finished = False

    def load(self):
        """Load and process the BFCL dataset."""
        from bfcl_eval.utils import parse_test_category_argument
        datasets = {}
        all_test_categories = parse_test_category_argument(self.subset_list)

        test_entries_by_cat, ground_truth_by_cat = load_bfcl_data(all_test_categories)

        for category in all_test_categories:
            test_entries = test_entries_by_cat.get(category, [])
            ground_truth_entries = ground_truth_by_cat.get(category, [])

            if not test_entries:
                continue

            datasets[category] = self._create_dataset_for_category(category, test_entries, ground_truth_entries)

        test_dataset = DatasetDict(datasets)
        return test_dataset, None

    def _create_dataset_for_category(
        self, category: str, test_entries: List[Dict], ground_truth_entries: List[Dict]
    ) -> DatasetDict:
        """Create a dataset for a single category by merging test and ground truth data."""
        processed_entries, prereq_entries = process_test_entries(
            category=category,
            test_entries=test_entries,
            ground_truth_entries=ground_truth_entries,
            model_result_dir=self.model_result_dir,
        )
        # collect prereq entries for later prereq inference
        self.prereq_entries.extend(prereq_entries)

        return DictDataLoader(
            dict_list=processed_entries,
            limit=self.limit,
            repeats=self.repeats,
            sample_fields=self.record_to_sample,
            shuffle=self.shuffle,
        ).load()

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """Convert a data record to a Sample object."""
        return Sample(
            input=[ChatMessageUser(content=json.dumps(record['question']))],
            target=json.dumps(record['ground_truth']),  # Will use the record for evaluation
            metadata=record  # Store the full record for evaluation
        )

    @thread_safe
    def _init_handler(self):
        if self.handler is not None:
            return  # Handler already initialized

        from bfcl_eval.model_handler.api_inference.openai_completion import OpenAICompletionsHandler

        # Set env variables for OpenAI API
        os.environ['OPENAI_API_KEY'] = self._task_config.api_key
        os.environ['OPENAI_BASE_URL'] = self._task_config.api_url

        self.handler = OpenAICompletionsHandler(
            model_name=self._task_config.model,
            temperature=self._task_config.generation_config.temperature,
            registry_name=self._task_config.model_id,
            is_fc_model=self.is_fc_model,
        )

        self._prereq_inference()

    def _prereq_inference(self):
        if self.prereq_finished:
            return
        # MOVED: delegate prereq processing to utils
        run_prereq_inference(
            handler=self.handler,
            prereq_entries=self.prereq_entries,
            model_result_dir=self.model_result_dir,
            batch_size=self._task_config.eval_batch_size,
            logger=logger,
        )
        self.prereq_finished = True

    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
        try:
            self._init_handler()

            result, _ = self.handler.inference(
                deepcopy(sample.metadata), include_input_log=False, exclude_state_log=False
            )

            output = ModelOutput.from_content(
                model=model.name,
                content=json.dumps(result),
            )
        except Exception as e:
            # This is usually the case when the model getting stuck on one particular test case.
            # For example, timeout error or FC model returning invalid JSON response.
            # Since temperature is already set to 0.001, retrying the same test case will not help.
            # So we continue the generation process and record the error message as the model response
            logger.error(f'Error during inference for sample ID {sample.metadata.get("id")}: {e}')
            logger.error(traceback.format_exc())

            output = ModelOutput.from_content(
                model=model.name,
                content=json.dumps({
                    'error': str(e),
                    'error_message': traceback.format_exc(),
                }),
            )
        return output

    def match_score(
        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
    ) -> Score:
        self._init_handler()

        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )
        model_result = json.loads(filtered_prediction)
        prompt = task_state.metadata

        entry_result = compute_entry_result(
            handler=self.handler,
            model_result=model_result,
            prompt_entry=prompt,
            underscore_to_dot=self.underscore_to_dot,
        )

        valid = 1 if entry_result['valid'] else 0
        score.value = {'acc': valid}
        score.metadata = {
            'valid': bool(entry_result.get('valid')),
            'error': str(entry_result.get('error')),
            'error_message': str(entry_result.get('error_message')),
            'error_type': str(entry_result.get('error_type')),
        }
        return score

    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
        """
        Finalize the report generation process. Calculate the overall score.
        """

        # noqa: E501
        # MOVED: delegate aggregation logic to utils
        compute_aggregate_subsets(report)
```
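The adapter above reads its runtime options from `extra_params` (`underscore_to_dot`, `is_fc_model`, `SERPAPI_API_KEY`) and refuses to run unless `bfcl-eval==2025.10.27.1` is installed. A configuration sketch follows; the outer `dataset_args`-style nesting is an assumption based on evalscope's usual conventions, while the inner keys mirror exactly what `BFCLV4Adapter` reads.

```python
# Hypothetical wiring: the outer 'bfcl_v4' / 'extra_params' nesting is an assumption;
# the inner keys mirror the extra_params consumed by BFCLV4Adapter above.
bfcl_v4_dataset_args = {
    'bfcl_v4': {
        'extra_params': {
            'underscore_to_dot': True,                # default shown in BenchmarkMeta
            'is_fc_model': True,                      # default shown in BenchmarkMeta
            'SERPAPI_API_KEY': '<your-serpapi-key>',  # exported to os.environ for web_search_* subsets
        },
    },
}
```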
@@ -0,0 +1,410 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import traceback
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from copy import deepcopy
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
from typing import Any, Dict, List, Tuple
|
|
9
|
+
|
|
10
|
+
from evalscope.report import (
|
|
11
|
+
Category,
|
|
12
|
+
Report,
|
|
13
|
+
Subset,
|
|
14
|
+
percentage_weighted_average_from_subsets,
|
|
15
|
+
unweighted_average_from_subsets,
|
|
16
|
+
weighted_average_from_subsets,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# ----------------------------
|
|
20
|
+
# Public constants (extracted)
|
|
21
|
+
# ----------------------------
|
|
22
|
+
|
|
23
|
+
ALL_AVAILABLE_MEMORY_BACKENDS: List[str] = [
|
|
24
|
+
'kv',
|
|
25
|
+
'vector',
|
|
26
|
+
'rec_sum',
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
NON_LIVE_CATEGORY: List[str] = [
|
|
30
|
+
'simple_python',
|
|
31
|
+
'simple_java',
|
|
32
|
+
'simple_javascript',
|
|
33
|
+
'multiple',
|
|
34
|
+
'parallel',
|
|
35
|
+
'parallel_multiple',
|
|
36
|
+
'irrelevance',
|
|
37
|
+
]
|
|
38
|
+
LIVE_CATEGORY: List[str] = [
|
|
39
|
+
'live_simple',
|
|
40
|
+
'live_multiple',
|
|
41
|
+
'live_parallel',
|
|
42
|
+
'live_parallel_multiple',
|
|
43
|
+
'live_irrelevance',
|
|
44
|
+
'live_relevance',
|
|
45
|
+
]
|
|
46
|
+
MULTI_TURN_CATEGORY: List[str] = [
|
|
47
|
+
'multi_turn_base',
|
|
48
|
+
'multi_turn_miss_func',
|
|
49
|
+
'multi_turn_miss_param',
|
|
50
|
+
'multi_turn_long_context',
|
|
51
|
+
]
|
|
52
|
+
WEB_SEARCH_CATEGORY: List[str] = [
|
|
53
|
+
'web_search_base',
|
|
54
|
+
'web_search_no_snippet',
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
MEMORY_CATEGORY: List[str] = [f'memory_{backend}' for backend in ALL_AVAILABLE_MEMORY_BACKENDS]
|
|
58
|
+
MEMORY_SCENARIO_NAME = [
|
|
59
|
+
'student',
|
|
60
|
+
'customer',
|
|
61
|
+
'finance',
|
|
62
|
+
'healthcare',
|
|
63
|
+
'notetaker',
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
SINGLE_TURN_CATEGORY: List[str] = NON_LIVE_CATEGORY + LIVE_CATEGORY
|
|
67
|
+
AGENTIC_CATEGORY: List[str] = MEMORY_CATEGORY + WEB_SEARCH_CATEGORY
|
|
68
|
+
|
|
69
|
+
ALL_SCORING_CATEGORIES: List[str] = SINGLE_TURN_CATEGORY + MULTI_TURN_CATEGORY + AGENTIC_CATEGORY
|
|
70
|
+
|
|
71
|
+
# Dummy models used only to infer underscore_to_dot behavior
|
|
72
|
+
DUMMY_MODEL_UNDERSCORE_TO_DOT = 'gpt-4o-2024-11-20-FC'
|
|
73
|
+
DUMMY_MODEL_NO_UNDERSCORE_TO_DOT = 'meta-llama/Llama-3.3-70B-Instruct-FC'
|
|
74
|
+
|
|
75
|
+
# ----------------------------
|
|
76
|
+
# Data preparation helpers
|
|
77
|
+
# ----------------------------
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def load_bfcl_data(categories: List[str]) -> Tuple[Dict[str, List[Dict]], Dict[str, List[Dict]]]:
|
|
81
|
+
"""
|
|
82
|
+
Load test entries and ground truth data from bfcl_eval for given categories.
|
|
83
|
+
"""
|
|
84
|
+
from bfcl_eval.utils import is_relevance_or_irrelevance, load_dataset_entry, load_ground_truth_entry
|
|
85
|
+
|
|
86
|
+
test_entries_by_cat: Dict[str, List[Dict]] = defaultdict(list)
|
|
87
|
+
ground_truth_by_cat: Dict[str, List[Dict]] = defaultdict(list)
|
|
88
|
+
|
|
89
|
+
for category in categories:
|
|
90
|
+
test_entries_by_cat[category] = load_dataset_entry(
|
|
91
|
+
category, include_prereq=True, include_language_specific_hint=False
|
|
92
|
+
)
|
|
93
|
+
if not is_relevance_or_irrelevance(category):
|
|
94
|
+
ground_truth_by_cat[category] = load_ground_truth_entry(category)
|
|
95
|
+
|
|
96
|
+
return test_entries_by_cat, ground_truth_by_cat
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def prepare_ground_truth_map(category: str, ground_truth_entries: List[Dict]) -> Dict[str, Dict]:
|
|
100
|
+
"""
|
|
101
|
+
Map ground truth entries to IDs with category-specific adjustments.
|
|
102
|
+
"""
|
|
103
|
+
from bfcl_eval.utils import is_memory, is_web_search
|
|
104
|
+
|
|
105
|
+
if not ground_truth_entries:
|
|
106
|
+
return {}
|
|
107
|
+
|
|
108
|
+
if is_memory(category):
|
|
109
|
+
return {entry['id'].replace('memory', category): entry for entry in ground_truth_entries}
|
|
110
|
+
if is_web_search(category):
|
|
111
|
+
return {entry['id'].replace('web_search', category): entry for entry in ground_truth_entries}
|
|
112
|
+
return {entry['id']: entry for entry in ground_truth_entries}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def process_test_entries(
|
|
116
|
+
category: str,
|
|
117
|
+
test_entries: List[Dict[str, Any]],
|
|
118
|
+
ground_truth_entries: List[Dict[str, Any]],
|
|
119
|
+
model_result_dir: Path,
|
|
120
|
+
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
|
121
|
+
"""
|
|
122
|
+
Clean and enrich test entries, return processed entries and prereq entries.
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
processed_entries: entries ready to be mapped to Samples
|
|
126
|
+
prereq_entries: entries requiring prereq inference (memory snapshots)
|
|
127
|
+
"""
|
|
128
|
+
from bfcl_eval.utils import (
|
|
129
|
+
clean_up_memory_prereq_entries,
|
|
130
|
+
is_memory_prereq,
|
|
131
|
+
populate_initial_settings_for_memory_test_cases,
|
|
132
|
+
populate_initial_settings_for_web_search_test_cases,
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
ground_truth_map = prepare_ground_truth_map(category, ground_truth_entries)
|
|
136
|
+
|
|
137
|
+
test_entries = clean_up_memory_prereq_entries(test_entries)
|
|
138
|
+
test_entries = populate_initial_settings_for_web_search_test_cases(test_entries)
|
|
139
|
+
test_entries = populate_initial_settings_for_memory_test_cases(test_entries, model_result_dir=model_result_dir)
|
|
140
|
+
|
|
141
|
+
prereq_entries = [entry for entry in test_entries if is_memory_prereq(entry['id'])]
|
|
142
|
+
main_entries = [entry for entry in test_entries if not is_memory_prereq(entry['id'])]
|
|
143
|
+
|
|
144
|
+
processed_entries: List[Dict[str, Any]] = []
|
|
145
|
+
for entry in main_entries:
|
|
146
|
+
entry_id = entry['id']
|
|
147
|
+
entry['category'] = category
|
|
148
|
+
entry['ground_truth'] = ground_truth_map.get(entry_id, {}).get('ground_truth', {})
|
|
149
|
+
processed_entries.append(entry)
|
|
150
|
+
|
|
151
|
+
return processed_entries, prereq_entries
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def run_prereq_inference(
|
|
155
|
+
handler: Any,
|
|
156
|
+
prereq_entries: List[Dict[str, Any]],
|
|
157
|
+
model_result_dir: Path,
|
|
158
|
+
batch_size: int,
|
|
159
|
+
logger: Any,
|
|
160
|
+
) -> None:
|
|
161
|
+
"""
|
|
162
|
+
Run prerequisite inferences for memory snapshot creation if results are missing.
|
|
163
|
+
Optimized to run different (backend, scenario) groups in parallel while preserving in-group order.
|
|
164
|
+
"""
|
|
165
|
+
import re
|
|
166
|
+
from bfcl_eval.utils import get_directory_structure_by_id
|
|
167
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
168
|
+
|
|
169
|
+
if not prereq_entries:
|
|
170
|
+
return
|
|
171
|
+
|
|
172
|
+
def _parse_backend_scenario_idx(entry_id: str) -> Tuple[str, str, int]:
|
|
173
|
+
"""
|
|
174
|
+
Extract backend, scenario, and scenario index from an entry id.
|
|
175
|
+
Expected format:
|
|
176
|
+
memory_{backend}_prereq_{total_index}-{scenario}-{scenario_index}
|
|
177
|
+
Returns ('unknown', 'unknown', 0) on failure.
|
|
178
|
+
"""
|
|
179
|
+
backend = 'unknown'
|
|
180
|
+
scenario = 'unknown'
|
|
181
|
+
idx = 0
|
|
182
|
+
|
|
183
|
+
m_backend = re.search(r'^memory_(?P<backend>.+?)_prereq_', entry_id)
|
|
184
|
+
if m_backend:
|
|
185
|
+
backend = m_backend.group('backend')
|
|
186
|
+
|
|
187
|
+
m_tail = re.search(r'-(?P<scenario>[a-zA-Z_]+)-(?P<idx>\d+)$', entry_id)
|
|
188
|
+
if m_tail:
|
|
189
|
+
scenario = m_tail.group('scenario')
|
|
190
|
+
idx = int(m_tail.group('idx'))
|
|
191
|
+
|
|
192
|
+
return backend, scenario, idx
|
|
193
|
+
|
|
194
|
+
# Group entries by (backend, scenario)
|
|
195
|
+
groups: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
|
|
196
|
+
for entry in prereq_entries:
|
|
197
|
+
backend, scenario, idx = _parse_backend_scenario_idx(entry['id'])
|
|
198
|
+
entry['_group_backend'] = backend
|
|
199
|
+
entry['_group_scenario'] = scenario
|
|
200
|
+
entry['_scenario_idx'] = idx
|
|
201
|
+
groups.setdefault((backend, scenario), []).append(entry)
|
|
202
|
+
|
|
203
|
+
# Sort entries within each group by scenario index to keep order
|
|
204
|
+
for group_entries in groups.values():
|
|
205
|
+
group_entries.sort(key=lambda e: e.get('_scenario_idx', 0))
|
|
206
|
+
|
|
207
|
+
# Worker to process a single (backend, scenario) group sequentially
|
|
208
|
+
def _process_group_entries(group_entries: List[Dict[str, Any]], progress: Any) -> None:
|
|
209
|
+
for entry in group_entries:
|
|
210
|
+
try:
|
|
211
|
+
memory_snapshot_folder = (
|
|
212
|
+
model_result_dir / get_directory_structure_by_id(entry['id']) / 'memory_snapshot'
|
|
213
|
+
/ 'prereq_checkpoints'
|
|
214
|
+
)
|
|
215
|
+
existing_filenames = {f.name for f in memory_snapshot_folder.rglob('*.json')}
|
|
216
|
+
if (entry['id'] + '.json') in existing_filenames:
|
|
217
|
+
logger.info(f'Skipping prereq inference for entry ID {entry["id"]} as result already exists.')
|
|
218
|
+
else:
|
|
219
|
+
handler.inference(deepcopy(entry), include_input_log=False, exclude_state_log=False)
|
|
220
|
+
except Exception as e:
|
|
221
|
+
logger.error(f'Error during prereq inference for entry ID {entry.get("id")}: {e}')
|
|
222
|
+
logger.error(traceback.format_exc())
|
|
223
|
+
finally:
|
|
224
|
+
# tqdm is thread-safe; each worker updates shared progress bar
|
|
225
|
+
progress.update(1)
|
|
226
|
+
|
|
227
|
+
# Run each (backend, scenario) group in parallel; preserve in-group order
|
|
228
|
+
total = len(prereq_entries)
|
|
229
|
+
with tqdm(total=total, desc='Running prereq inferences for memory snapshots...') as progress:
|
|
230
|
+
max_workers = min(batch_size, len(groups))
|
|
231
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
232
|
+
futures = [
|
|
233
|
+
executor.submit(_process_group_entries, group_entries, progress) for group_entries in groups.values()
|
|
234
|
+
]
|
|
235
|
+
for _ in as_completed(futures):
|
|
236
|
+
# Errors are logged within workers
|
|
237
|
+
pass
|
|
238
|
+
|
|
239
|
+
# Cleanup temp keys
|
|
240
|
+
for group_entries in groups.values():
|
|
241
|
+
for entry in group_entries:
|
|
242
|
+
entry.pop('_group_backend', None)
|
|
243
|
+
entry.pop('_group_scenario', None)
|
|
244
|
+
entry.pop('_scenario_idx', None)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
+# ----------------------------
+# Scoring helpers
+# ----------------------------
+
+
+def compute_entry_result(
+    handler: Any,
+    model_result: Any,
+    prompt_entry: Dict[str, Any],
+    underscore_to_dot: bool,
+) -> Dict[str, Any]:
+    """
+    Compute evaluation result for a single entry across BFCL categories.
+    """
+    from bfcl_eval.constants.enums import Language, ReturnFormat
+    from bfcl_eval.eval_checker.eval_runner import (
+        _evaluate_single_agentic_entry,
+        _evaluate_single_ast_entry,
+        _evaluate_single_multi_turn_entry,
+        _evaluate_single_relevance_entry,
+    )
+    from bfcl_eval.utils import is_agentic, is_java, is_js, is_multi_turn, is_relevance_or_irrelevance
+
+    test_category = prompt_entry['category']
+    index = prompt_entry['id']
+    ground_truth = prompt_entry.get('ground_truth', {})
+
+    model_name = (DUMMY_MODEL_UNDERSCORE_TO_DOT if underscore_to_dot else DUMMY_MODEL_NO_UNDERSCORE_TO_DOT)
+
+    if is_relevance_or_irrelevance(test_category):
+        return _evaluate_single_relevance_entry(
+            handler=handler,
+            index=index,
+            model_result_item=model_result,
+            prompt_entry=prompt_entry,
+            model_name=model_name,
+            test_category=test_category,
+        )
+
+    elif is_multi_turn(test_category):
+        return _evaluate_single_multi_turn_entry(
+            handler=handler,
+            test_entry_id=index,
+            model_result_list=model_result,
+            ground_truth_list=ground_truth,
+            prompt_entry=prompt_entry,
+            model_name=model_name,
+            test_category=test_category,
+        )
+
+    elif is_agentic(test_category):
+        return _evaluate_single_agentic_entry(
+            handler=handler,
+            index=index,
+            model_result_list=model_result,
+            possible_answer_item=ground_truth,
+            prompt_entry=prompt_entry,
+            model_name=model_name,
+            test_category=test_category,
+        )
+    else:
+        # AST categories (python/java/js)
+        if is_java(test_category):
+            language = Language.JAVA
+            return_format = ReturnFormat.JAVA
+        elif is_js(test_category):
+            language = Language.JAVASCRIPT
+            return_format = ReturnFormat.JAVASCRIPT
+        else:
+            language = Language.PYTHON
+            return_format = ReturnFormat.PYTHON
+
+        return _evaluate_single_ast_entry(
+            handler=handler,
+            index=index,
+            model_result_item=model_result,
+            possible_answer_item=ground_truth,
+            prompt_entry=prompt_entry,
+            model_name=model_name,
+            test_category=test_category,
+            language=language,
+            return_format=return_format,
+        )
+
+
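compute_entry_result is a pure dispatcher: each entry is routed to exactly one bfcl_eval checker by its category, with the AST checker as the fallback. The sketch below (editorial, not part of the diff) reproduces only that routing order with stub predicates; none of these names are the bfcl_eval API and the category strings are illustrative:

def route_category(category: str) -> str:
    # Stub predicates; the real checks live in bfcl_eval.utils.
    def is_relevance_or_irrelevance(cat: str) -> bool:
        return 'relevance' in cat

    def is_multi_turn(cat: str) -> bool:
        return cat.startswith('multi_turn')

    def is_agentic(cat: str) -> bool:
        return cat.startswith(('web_search', 'memory'))

    if is_relevance_or_irrelevance(category):
        return 'relevance'
    elif is_multi_turn(category):
        return 'multi_turn'
    elif is_agentic(category):
        return 'agentic'
    else:
        return 'ast'  # python/java/js AST categories


print(route_category('multi_turn_miss_func'))  # multi_turn
print(route_category('simple_java'))           # ast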
+# ----------------------------
+# Report aggregation helpers
+# ----------------------------
+
+
+def compute_aggregate_subsets(report: Report) -> None:
+    """
+    Compute aggregated subsets and overall score for BFCL report.
+    Modifies the report in-place.
+    """
+    for metric in report.metrics:
+        # Collect all subsets in a dictionary for easy access
+        subset_dict: Dict[str, Subset] = {}
+        for category in metric.categories:
+            for subset in category.subsets:
+                subset_dict[subset.name] = subset
+
+        # Step 1: simple_ast
+        simple_subsets = ['simple_python', 'simple_java', 'simple_javascript']
+        simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+        subset_dict['simple_ast'] = simple_ast
+
+        # Step 2.1: non_live (simple_ast, multiple, parallel, parallel_multiple)
+        non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+        non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+        subset_dict['non_live'] = non_live
+
+        # Step 2.2: live (weighted)
+        live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+        live = weighted_average_from_subsets(live_subsets, subset_dict)
+        subset_dict['live'] = live
+
+        # Step 2.3: hallucination (unweighted)
+        hallucination_subsets = ['live_irrelevance', 'irrelevance']
+        hallucination = unweighted_average_from_subsets(hallucination_subsets, subset_dict)
+        subset_dict['hallucination'] = hallucination
+
+        # Step 2.4: multi_turn (unweighted)
+        multi_turn_subsets = [
+            'multi_turn_base',
+            'multi_turn_miss_func',
+            'multi_turn_miss_param',
+            'multi_turn_long_context',
+        ]
+        multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+        subset_dict['multi_turn'] = multi_turn
+
+        # Step 2.5: web_search (unweighted)
+        web_search_subsets = ['web_search_base', 'web_search_no_snippet']
+        web_search = unweighted_average_from_subsets(web_search_subsets, subset_dict)
+        subset_dict['web_search'] = web_search
+
+        # Step 2.6: memory (unweighted)
+        memory_subsets = ['memory_kv', 'memory_vector', 'memory_rec_sum']
+        memory = unweighted_average_from_subsets(memory_subsets, subset_dict)
+        subset_dict['memory'] = memory
+
+        # Step 2.7: agentic (unweighted)
+        agentic_subsets = ['web_search', 'memory']
+        agentic = unweighted_average_from_subsets(agentic_subsets, subset_dict)
+        subset_dict['agentic'] = agentic
+
+        # Step 4: overall (percentage weighted average)
+        overall_subsets = ['agentic', 'multi_turn', 'non_live', 'live', 'hallucination']
+        overall = percentage_weighted_average_from_subsets(overall_subsets, subset_dict, weights=[40, 30, 10, 10, 10])
+        subset_dict['overall'] = overall
+
+        # Add computed scores to the category
+        computed_subset_names = ['agentic', 'multi_turn', 'non_live', 'live', 'hallucination', 'overall']
+
+        # Add the computed scores as new subsets in the metric
+        dummy_subsets: List[Subset] = []
+        for subset_name in computed_subset_names:
+            if subset_name in subset_dict and subset_dict[subset_name].num > 0:
+                subset = subset_dict[subset_name]
+                subset.name = subset_name.upper()
+                dummy_subsets.append(subset)
+        dummy_category = Category(name='-', subsets=dummy_subsets)
+        metric.categories.append(dummy_category)
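The overall score above is a percentage-weighted combination of agentic, multi_turn, non_live, live and hallucination with weights 40/30/10/10/10. A small worked example with made-up subset scores (editorial sketch, assuming percentage_weighted_average_from_subsets reduces to a plain weights-normalized mean of the subset scores):

def percentage_weighted_average(scores, weights):
    # Weighted mean; the weights here sum to 100, so this divides by 100.
    return sum(s * w for s, w in zip(scores, weights)) / sum(weights)


# Hypothetical scores for agentic, multi_turn, non_live, live, hallucination.
scores = [0.5, 0.25, 0.75, 0.5, 0.25]
weights = [40, 30, 10, 10, 10]
print(percentage_weighted_average(scores, weights))  # 0.425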
File without changes

@@ -0,0 +1,36 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+DESCRIPTION = (
+    'BiomixQA is a curated biomedical question-answering dataset. '
+    'BiomixQA has been utilized to validate the Knowledge Graph based '
+    'Retrieval-Augmented Generation (KG-RAG) framework across different LLMs.'
+) # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='biomix_qa',
+        pretty_name='BioMixQA',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.MEDICAL],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/biomix-qa',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+    )
+)
+class BioMixQAAdapter(MultiChoiceAdapter):
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
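record_to_sample above is a direct field mapping from the raw dataset record to an evalscope Sample. A hypothetical record (field values invented; only the question/choices/answer keys come from the adapter) and the Sample it produces, assuming Sample exposes its constructor fields as attributes:

from evalscope.api.dataset import Sample

record = {
    'question': 'Deficiency of which vitamin causes scurvy?',
    'choices': ['Vitamin A', 'Vitamin B12', 'Vitamin C', 'Vitamin D'],
    'answer': 'Vitamin C',
}

sample = Sample(
    input=record['question'],
    choices=record['choices'],
    target=record['answer'],
    metadata={},
)
print(sample.input)   # Deficiency of which vitamin causes scurvy?
print(sample.target)  # Vitamin C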
File without changes