evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py
@@ -0,0 +1,370 @@
import json
import re
import traceback
from typing import Any, Dict, List

from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.messages.chat_message import ChatMessageUser
from evalscope.api.metric import Score
from evalscope.api.model import Model, ModelOutput
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
from evalscope.utils.import_utils import check_import
from evalscope.utils.logger import get_logger

logger = get_logger()

SUBJECT_MAPPING = {
    'simple': 'AST_NON_LIVE',
    'multiple': 'AST_NON_LIVE',
    'parallel': 'AST_NON_LIVE',
    'parallel_multiple': 'AST_NON_LIVE',
    'java': 'AST_NON_LIVE',
    'javascript': 'AST_NON_LIVE',
    'live_simple': 'AST_LIVE',
    'live_multiple': 'AST_LIVE',
    'live_parallel': 'AST_LIVE',
    'live_parallel_multiple': 'AST_LIVE',
    'irrelevance': 'RELEVANCE',
    'live_relevance': 'RELEVANCE',
    'live_irrelevance': 'RELEVANCE',
    'multi_turn_base': 'MULTI_TURN',
    'multi_turn_miss_func': 'MULTI_TURN',
    'multi_turn_miss_param': 'MULTI_TURN',
    'multi_turn_long_context': 'MULTI_TURN'
}

BFCL_V3_TO_V4_SUBJECT_MAPPING = {
    'simple': 'simple_python',
    'java': 'simple_java',
    'javascript': 'simple_javascript',
}


@register_benchmark(
    BenchmarkMeta(
        name='bfcl_v3',
        pretty_name='BFCL-v3',
        tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
        description='Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive '
        'and executable function call evaluation** '
        'dedicated to assessing Large Language Models\' (LLMs) ability to invoke '
        'functions. Unlike previous evaluations, '
        'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
        'Need to run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v3.html)',
        dataset_id='AI-ModelScope/bfcl_v3',
        subset_list=list(SUBJECT_MAPPING.keys()),
        metric_list=['acc'],
        eval_split='train',
        extra_params={
            'underscore_to_dot': True,
            'is_fc_model': True,
        }
    )
)
class BFCLV3Adapter(AgentAdapter):
    """
    BFCL adapter using the new data processing framework.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)

        self.category_map = SUBJECT_MAPPING
        self.reformat_subset = True
        self.add_overall_metric = False
        self.add_aggregation_name = False

        self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
        self.is_fc_model = self.extra_params.get('is_fc_model', True)

    def preprocess_row(self, row: dict):
        """
        Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
        """
        row['should_execute_tool_calls'] = True if row['multi_turn'] else False
        row['functions'] = json.loads(row['functions'])
        row['tools'] = json.loads(row['tools'])
        row['turns'] = json.loads(row['turns'])
        row['missing_functions'] = json.loads(row['missed_functions'])
        row['ground_truth'] = json.loads(row.get('ground_truth', '{}'))
        row['initial_config'] = json.loads(row['initial_config'])
        row['is_fc_model'] = self.is_fc_model

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """Convert a data record to a Sample object."""
        self.preprocess_row(record)

        # If the model is a function calling model, we need to remove the system prompt
        if self.is_fc_model:
            turns = record['turns']
            new_turns = []
            for turn_idx, messages in enumerate(turns):
                current_messages = messages.copy()
                if len(current_messages) > 0 and current_messages[0]['role'] == 'system':
                    current_messages = current_messages[1:]
                new_turns.append(current_messages)
            record['turns'] = new_turns

        return Sample(
            input=[ChatMessageUser(content=json.dumps(record['turns']))],
            target=json.dumps(record['ground_truth']),  # Will use the record for evaluation
            subset_key=record['subset'],
            metadata=record  # Store the full record for evaluation
        )

    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
        from .generation import predict
        return predict(model, sample)

    def match_score(
        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
    ) -> Score:
        from bfcl_eval.eval_checker.ast_eval.ast_checker import ast_checker
        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import multi_turn_checker
        from bfcl_eval.model_handler.utils import (
            convert_to_function_call,
            default_decode_ast_prompting,
            default_decode_execute_prompting,
        )
        from bfcl_eval.utils import is_empty_output

        from .utils import convert_format_language, convert_language

        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )

        try:
            # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
            if self.underscore_to_dot:
                dummy_model = 'gpt-4o-2024-11-20-FC'
            else:
                dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'

            row = task_state.metadata
            test_category = BFCL_V3_TO_V4_SUBJECT_MAPPING.get(row['test_category'], row['test_category'])

            if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
                error = None
                try:
                    if self.is_fc_model:
                        decoded_tool_calls = []
                        for tool_call in row['generation'][0]:
                            name = list(tool_call.keys())[0]
                            params = tool_call[name]
                            decoded_tool_calls.append({name: params})
                    else:
                        decoded_tool_calls = default_decode_ast_prompting(
                            row['generation'][0][0], convert_format_language(row['language'])
                        )

                    # successful decode means valid function call was present
                    contains_func_call = True
                    if is_empty_output(decoded_tool_calls):
                        # Empty output is not considered as a valid function call
                        contains_func_call = False
                        error = 'Empty decoded output.'
                except Exception:
                    contains_func_call = False
                    error = f'Failed to decode with traceback: {traceback.format_exc()}'
                finally:
                    valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
                    score_result = {'valid': valid, 'error_message': error}

            elif row['multi_turn']:
                # each step might give a list of tool calls and each turn is multi-step
                # and multi-turn has generations of all the turns
                # hence in a multi-turn setting,
                # multi_turn_decoded_generations is a list of list of list of strings
                multi_turn_decoded_generations: list[list[list[str]]] = []
                for single_turn_generations in row['generation']:
                    single_turn_decoded_generations: list[list[str]] = []
                    for generation in single_turn_generations:
                        try:
                            if self.is_fc_model:
                                tool_calls = convert_to_function_call(generation)
                            else:
                                tool_calls = default_decode_execute_prompting(generation)

                            single_turn_decoded_generations.append(tool_calls)
                        except Exception:
                            single_turn_decoded_generations.append([generation])

                    multi_turn_decoded_generations.append(single_turn_decoded_generations)

                try:
                    raw_score_result = multi_turn_checker(
                        multi_turn_decoded_generations,
                        row['ground_truth'],
                        row,
                        test_category,
                        dummy_model,
                    )
                except Exception:
                    raw_score_result = {
                        'valid': False,
                        'error_type': 'multi_turn:checker_failed',
                        'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
                    }

                score_result = {
                    'valid': float(raw_score_result['valid']),
                    'error_message': raw_score_result.get('error_message', ''),
                    'error_type': raw_score_result.get('error_type', ''),
                }
            else:
                try:
                    if self.is_fc_model:
                        decoded_tool_calls = []
                        for tool_call in row['generation'][0]:
                            name = list(tool_call.keys())[0]
                            params = tool_call[name]
                            decoded_tool_calls.append({name: params})
                    else:
                        decoded_tool_calls = default_decode_ast_prompting(
                            row['generation'][0][0], convert_format_language(row['language'])
                        )

                    score_result = ast_checker(
                        row['functions'],
                        decoded_tool_calls,
                        row['ground_truth'],
                        convert_language(row['language']),
                        test_category,
                        dummy_model,
                    )
                except Exception:
                    score_result = {
                        'valid': False,
                        'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
                        'error_type': 'ast_decoder:decoder_failed',
                    }

            score.value = {
                'acc': float(score_result['valid']),
            }
            score.explanation = score_result.get('error_message', 'Evaluation completed')
            score.metadata = {
                'raw_score_result': score_result,
                'test_category': test_category,
                'underscore_to_dot': self.underscore_to_dot,
                'is_fc_model': self.is_fc_model
            }
            score.main_score_name = 'acc'

        except Exception:
            logger.error(f'Evaluation failed for sample: {task_state.sample_id}\n{traceback.format_exc()}')
            score.value = {'acc': 0.0}
            score.explanation = 'Evaluation failed with an unexpected error.'
            score.metadata = {'error': traceback.format_exc()}
            score.main_score_name = 'acc'
        return score

    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
        """
        Finalize the report generation process. Calculate the overall score.

        Track the number of each category.
        - step1: simple, java, javascript unweighted average as simple_ast
        - step2.1: simple_ast, multiple, parallel, parallel_multiple unweighted average as ast_non_live
        - step2.2: live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average as ast_live
        - step2.3: irrelevance as hallucination_non_live
        - step2.4: live_irrelevance, live_relevance weighted average as hallucination_live
        - step2.5: multi_turn_base as multi_turn_base
        - step2.6: multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average as multi_turn_augmented
        - step3.1: ast_non_live, hallucination_non_live unweighted average as non_live
        - step3.2: ast_live, hallucination_live weighted average as live
        - step3.3: multi_turn_base, multi_turn_augmented unweighted average as multi_turn
        - step4: non_live, live, multi_turn unweighted average as overall

        Args:
            report (Report): The generated evaluation report.
            output_dir (str): The directory to save the report.

        Returns:
            None
        """  # noqa: E501
        for metric in report.metrics:
            # Collect all subsets in a dictionary for easy access
            subset_dict: Dict[str, Subset] = {}
            for category in metric.categories:
                for subset in category.subsets:
                    subset_dict[subset.name] = subset

            # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
            simple_subsets = ['simple', 'java', 'javascript']
            simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
            subset_dict['simple_ast'] = simple_ast

            # Step 2.1: Calculate ast_non_live
            # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
            ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
            ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
            subset_dict['ast_non_live'] = ast_non_live

            # Step 2.2: Calculate ast_live
            # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
            live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
            ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
            subset_dict['ast_live'] = ast_live

            # Step 2.3: hallucination_non_live (irrelevance)
            if 'irrelevance' in subset_dict:
                subset_dict['hallucination_non_live'] = subset_dict['irrelevance']
            else:
                subset_dict['hallucination_non_live'] = Subset(name='hallucination_non_live', score=0, num=0)

            # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
            hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
            hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
            subset_dict['hallucination_live'] = hallucination_live

            # Step 2.5: multi_turn_base
            if 'multi_turn_base' not in subset_dict:
                subset_dict['multi_turn_base'] = Subset(name='multi_turn_base', score=0, num=0)

            # Step 2.6: Calculate multi_turn_augmented
            # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
            multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
            multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
            subset_dict['multi_turn_augmented'] = multi_turn_augmented

            # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
            non_live_subsets = ['ast_non_live', 'hallucination_non_live']
            non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
            subset_dict['non_live'] = non_live

            # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
            live_agg_subsets = ['ast_live', 'hallucination_live']
            live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
            subset_dict['live'] = live

            # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
            multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
            multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
            subset_dict['multi_turn'] = multi_turn

            # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
            overall_subsets = ['non_live', 'live', 'multi_turn']
            overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
            subset_dict['overall'] = overall

            # Add computed scores to the category
            computed_subset_names = ['non_live', 'live', 'multi_turn', 'overall']

            # Add the computed scores as new subsets in the metric
            dummy_subsets = []
            for subset_name in computed_subset_names:
                if subset_name in subset_dict:
                    subset = subset_dict[subset_name]
                    subset.name = subset_name.upper()
                    dummy_subsets.append(subset)
            dummy_category = Category(name='-', subsets=dummy_subsets)
            metric.categories.append(dummy_category)
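
The adapter above registers the `bfcl_v3` benchmark with `extra_params` defaults for `underscore_to_dot` and `is_fc_model`. As a rough sketch of how it could be invoked, the snippet below uses evalscope's `TaskConfig`/`run_task` entry points; the model name, endpoint URL, and API key are placeholders, and exact field names may vary between evalscope versions.

```python
# Minimal sketch (not part of this diff): running bfcl_v3 through evalscope's task API.
# Model name, URL, and key are placeholders; extra_params mirrors the adapter defaults above.
# Requires `pip install bfcl-eval==2025.10.27.1` as noted in the benchmark description.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='my-function-calling-model',   # placeholder model identifier
    api_url='http://localhost:8801/v1',  # placeholder OpenAI-compatible endpoint
    api_key='EMPTY',
    eval_type='service',                 # evaluate a served API rather than a local checkpoint
    datasets=['bfcl_v3'],
    dataset_args={
        'bfcl_v3': {
            'extra_params': {
                'underscore_to_dot': True,  # matches the adapter's default
                'is_fc_model': True,        # model emits native tool calls
            }
        }
    },
)

run_task(task_cfg=task_cfg)
```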
@@ -0,0 +1,222 @@
+import json
+import time
+from typing import Any
+
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import dict_to_chat_message
+from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput, ModelUsage
+from evalscope.api.tool.tool_info import ToolInfo
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def predict(model: Model, sample: Sample) -> ModelOutput:
+    """Main prediction function for BFCL using the new API framework."""
+    # Extract the row data from sample metadata
+    row = sample.metadata
+    is_fc_model = row.get('is_fc_model', False)
+
+    if is_fc_model:
+        response, model_usage = generate_turn_with_tools(model, row)
+    else:
+        response, model_usage = generate_turn(model, row)
+
+    sample.metadata['generation'] = response
+    # wrap response with openai types
+    return ModelOutput(
+        model=model.name,
+        choices=[ChatCompletionChoice.from_content(json.dumps(response, ensure_ascii=False, indent=2))],
+        model_usage=model_usage,
+        time=time.time()
+    )
+
+
+def generate_turn(model: Model, row: dict[str, Any]):
+    from bfcl_eval.constants.default_prompts import (
+        DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+        MAXIMUM_STEP_LIMIT,
+    )
+    from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+    from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+    all_model_responses = []
+    current_messages = []
+    turns = row['turns']
+    model_usage = ModelUsage()
+
+    for turn_idx, messages in enumerate(turns):
+        n_steps = 0
+        current_responses = []
+        current_messages += messages.copy()
+
+        if str(turn_idx) in row['missing_functions']:
+            assert len(messages) == 0, 'Holdout turn should not have user message.'
+            new_turn = [{
+                'role':
+                'user',
+                'content':
+                DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                    functions=row['missing_functions'][str(turn_idx)]
+                ),
+            }]
+            current_messages += new_turn
+
+        while True:
+            # Create a sample for the current messages
+            from evalscope.api.messages.chat_message import dict_to_chat_message
+            chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+
+            # Get model response using generate method
+            model_output = model.generate(chat_messages)
+
+            # Handle the response based on the model output structure
+            message = model_output.message
+            if model_output.usage is not None:
+                model_usage += model_output.usage
+
+            current_messages.append(message)
+            if isinstance(message, str):
+                result = message
+            else:
+                result = message.text
+
+            logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+            current_responses.append(result)
+
+            execute_tools = row.get('should_execute_tool_calls', False)
+            if execute_tools:
+                try:
+                    tool_calls = default_decode_execute_prompting(result)
+                except Exception:
+                    tool_calls = None
+
+                if tool_calls is None:
+                    break
+
+                tool_outputs, _ = execute_multi_turn_func_call(
+                    tool_calls,
+                    initial_config=row['initial_config'],
+                    involved_classes=row['involved_classes'],
+                    model_name='evaluator_loop',
+                    test_entry_id=row['id'],
+                    long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                    is_evaL_run=False,
+                )
+                # Append tool outputs to the current messages
+                tool_results = []
+                for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                    tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                current_messages.append({
+                    'role': 'user',
+                    'content': repr(tool_results),
+                })
+            else:
+                break
+
+            n_steps += 1
+            if n_steps > MAXIMUM_STEP_LIMIT:
+                logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                break
+
+        all_model_responses.append(current_responses)
+
+    return all_model_responses, model_usage
+
+
+def generate_turn_with_tools(model: Model, row: dict[str, Any]):
+    from bfcl_eval.constants.default_prompts import DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC, MAXIMUM_STEP_LIMIT
+    from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+    from bfcl_eval.model_handler.utils import convert_to_function_call
+
+    all_model_responses = []
+    current_messages = []
+    turns = row['turns']
+    model_usage = ModelUsage()
+
+    for turn_idx, messages in enumerate(turns):
+        n_steps = 0
+        current_responses = []
+        current_messages += messages.copy()
+        tools = row['tools']
+
+        if str(turn_idx) in row['missing_functions']:
+            assert len(messages) == 0, 'Holdout turn should not have user message.'
+            # inject new functions on the fly
+            new_tools = row['missing_functions'][str(turn_idx)]
+            for new_tool in new_tools:
+                cur_tool = new_tool[0]
+                cur_tool['parameters']['type'] = 'object'
+                tools.append({
+                    'type': 'function',
+                    'function': cur_tool,
+                })
+            new_turn = [{
+                'role': 'user',
+                'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+            }]
+            current_messages += new_turn
+
+        while True:
+            # Create a sample for the current messages with tools
+            chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+            current_sample = Sample(
+                input=chat_messages,
+                target='',
+                tools=[ToolInfo.model_validate(tool['function']) for tool in tools],
+            )
+
+            # Get model response
+            model_output = model.generate(current_sample.input, tools=current_sample.tools)
+
+            # Handle the response based on the model output structure
+            message = model_output.message
+            if model_output.usage is not None:
+                model_usage += model_output.usage
+
+            current_messages.append(message)
+            if isinstance(message, str):
+                model_responses = [message]
+                tool_call_strs = None
+            elif message.tool_calls:
+                model_responses = [{tc.function.name: tc.function.arguments} for tc in message.tool_calls]
+                try:
+                    tool_call_strs = convert_to_function_call(model_responses)
+                except Exception as e:
+                    logger.error(f'Error converting tool calls to function call strings: {e}')
+                    tool_call_strs = None
+            else:
+                model_responses = [message.text]
+                tool_call_strs = None
+
+            current_responses.extend(model_responses)
+
+            execute_tools = row.get('should_execute_tool_calls', False)
+            if execute_tools and tool_call_strs is not None:
+                tool_outputs, _ = execute_multi_turn_func_call(
+                    tool_call_strs,
+                    initial_config=row['initial_config'],
+                    involved_classes=row['involved_classes'],
+                    model_name='evaluator_loop',
+                    test_entry_id=row['id'],
+                    long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                    is_evaL_run=False,
+                )
+
+                for tc, tool_output in zip(message.tool_calls, tool_outputs, strict=False):
+                    current_messages.append({
+                        'role': 'tool',
+                        'tool_call_id': tc.id,
+                        'content': json.dumps({'response': tool_output}),
+                    })
+            else:
+                break
+
+            n_steps += 1
+            if n_steps > MAXIMUM_STEP_LIMIT:
+                logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                break
+
+        all_model_responses.append(current_responses)
+
+    return all_model_responses, model_usage
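Usage sketch (not part of the diff): the predict() entry point above expects a Sample whose metadata carries the BFCL fields read by generate_turn / generate_turn_with_tools. The adapter builds these samples from the BFCL dataset; the literal values below are illustrative assumptions only.

# Illustrative only -- the field values are made up; the keys mirror what the
# functions above actually read from sample.metadata.
from evalscope.api.dataset import Sample

sample = Sample(
    input='',   # for BFCL the conversation itself lives in metadata['turns']
    target='',
    metadata={
        'id': 'multi_turn_base_0',            # hypothetical test entry id
        'is_fc_model': True,                  # route to generate_turn_with_tools
        'turns': [[{'role': 'user', 'content': 'Book a flight to Paris.'}]],
        'missing_functions': {},              # no held-out tools in this sketch
        'tools': [],                          # OpenAI-style tool specs
        'should_execute_tool_calls': False,
        'initial_config': {},
        'involved_classes': [],
        'test_category': 'multi_turn_base',
    },
)
# output = predict(model, sample)             # `model`: any evalscope Model instance
# per_turn = sample.metadata['generation']    # raw per-turn responses; the same data
#                                             # is JSON-dumped into output.choices[0]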
@@ -0,0 +1,23 @@
+def convert_language(language: str) -> str:
+    """Convert language names from BFCL v3 to BFCL v4 naming conventions."""
+    from bfcl_eval.constants.enums import Language
+    mapping = {
+        'python': Language.PYTHON,
+        'java': Language.JAVA,
+        'javascript': Language.JAVASCRIPT,
+    }
+    return mapping[language.lower()]
+
+
+def convert_format_language(format_language: str) -> str:
+    """Convert format language names from BFCL v3 to BFCL v4 naming conventions."""
+    from bfcl_eval.constants.enums import ReturnFormat
+    mapping = {
+        'python': ReturnFormat.PYTHON,
+        'java': ReturnFormat.JAVA,
+        'javascript': ReturnFormat.JAVASCRIPT,
+        'json': ReturnFormat.JSON,
+        'verbose_xml': ReturnFormat.VERBOSE_XML,
+        'concise_xml': ReturnFormat.CONCISE_XML,
+    }
+    return mapping[format_language.lower()]
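Usage note for the two converters above: both are plain dict lookups over the lower-cased input, so unmapped names raise KeyError, and despite the -> str annotations they return enum members from bfcl_eval.constants.enums. The expected values in the comments below are assumptions about those enums, not verified output.

# Assumes bfcl_eval is installed and the converters above are in scope.
lang = convert_language('Python')        # expected: Language.PYTHON
fmt = convert_format_language('JSON')    # expected: ReturnFormat.JSON

try:
    convert_format_language('yaml')
except KeyError:
    print('yaml is not a supported return format in this mapping')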
File without changes