evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,86 @@
+# flake8: noqa
+import re
+from collections.abc import Sequence
+
+
+def safe_regex_search(pattern, text, flags=0):
+    try:
+        return re.search(pattern, text, flags)
+    except Exception as e:
+        print(f'Regex match error: {str(e)}')
+        return None
+
+
+def extract_option_labels(text, options='ABCDEFGHIJ'):
+    if not isinstance(text, str) or not isinstance(options, str):
+        return 'error'
+
+    text = text.rstrip()
+    last_line = text.split('\n')[-1]
+
+    option_str = ''.join([chr(65 + i) for i in range(len(options))]) if options else 'ABCDEFGHIJ'
+
+    patterns = [
+        # e.g. "The final answer to this question is: A."
+        # "The best option is $\boxed{B}:"
+        # "The correct answer is (C)."
+        f'[Tt]he\s+(?:\w+\s+)?(?:answer|option)(?:\w+\s+)?\s+is?:?\s*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+
+        # e.g. "ANSWER: A"
+        # "Answer: $\boxed{B}."
+        # "ANSWER: (C):"
+        f'(?i:Answer)[\*\s]*:\s*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+
+        # e.g. "A"
+        # "$\boxed{B}$"
+        # "(C)."
+        # "[D]:"
+        f'^[^\w\r\n]*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+    ]
+
+    for pattern in patterns:
+        match = safe_regex_search(pattern, last_line, re.IGNORECASE)
+        if match:
+            return match.group(1)
+
+    for pattern in patterns:
+        match = safe_regex_search(pattern, text, re.IGNORECASE)
+        if match:
+            return match.group(1)
+
+    return None
+
+
+def extract_option_content(text, options_content=None):
+    if not isinstance(text, str) or not isinstance(options_content, Sequence):
+        return 'error'
+
+    escaped_options_content = [re.escape(option_content) for option_content in options_content]
+    escaped_options_content_str = '|'.join(escaped_options_content)
+
+    text = text.rstrip()
+    last_line = text.split('\n')[-1]
+
+    patterns = [
+        f'[Tt]he\s+(?:\w+\s+)?(?:answer|option)(?:\w+\s+)?\s+is:?\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+        f'(?i:Answer)\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+        f'^[^\w\r\n]*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+    ]
+
+    for pattern in patterns:
+        match = safe_regex_search(pattern, last_line)
+        if match:
+            if match.group(1) in escaped_options_content:
+                return options_content[escaped_options_content.index(match.group(1))]
+            else:
+                return match.group(1)
+
+    for pattern in patterns:
+        match = safe_regex_search(pattern, text)
+        if match:
+            if match.group(1) in escaped_options_content:
+                return options_content[escaped_options_content.index(match.group(1))]
+            else:
+                return match.group(1)
+
+    return None
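The two helpers in the hunk above extract a multiple-choice answer from free-form model output, checking the last line first and then the whole text. A minimal usage sketch, assuming both functions are in scope (the example responses are invented for illustration):

    # Illustrative calls against the functions defined in the hunk above.
    print(extract_option_labels('Reasoning...\nANSWER: A'))            # -> 'A'
    print(extract_option_labels('The answer is B.', options='ABCD'))   # -> 'B'
    print(extract_option_content('The correct answer is Paris.',
                                 options_content=['Paris', 'London', 'Rome']))  # -> 'Paris'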
File without changes
File without changes
@@ -0,0 +1,158 @@
+import json
+import sys
+import tau2.utils.llm_utils as tau_llm_utils
+from tau2.data_model.message import AssistantMessage, Message, ToolCall
+from tau2.data_model.tasks import Task
+from tau2.environment.tool import Tool
+from tau2.run import run_task
+from tau2.utils.llm_utils import to_litellm_messages
+from typing import Any, Callable, Dict, List, Optional, Sequence
+
+from evalscope.api.dataset.dataset import Sample
+from evalscope.api.messages.chat_message import dict_to_chat_message
+from evalscope.api.model import GenerateConfig, get_model
+from evalscope.api.model.model import Model
+from evalscope.api.model.model_output import ChatCompletionChoice, ModelOutput
+from evalscope.api.tool.tool_info import ToolInfo
+from evalscope.constants import EvalType
+from evalscope.models.utils.openai import openai_chat_choices
+from evalscope.utils.function_utils import run_once
+
+MODEL_DICT: Dict[str, Model] = {
+    'agent': None,
+    'user': None,
+}
+
+_MODEL_PATCHED: bool = False
+_ORIGINAL_TAU2_GENERATE: Optional[Callable[..., Any]] = None
+
+
+def _patch_tau2_generate(new_generate: Callable[..., Any]) -> None:
+    """Fan-out monkey patch for Tau2 when consumers did `from ... import generate`."""
+    global _MODEL_PATCHED, _ORIGINAL_TAU2_GENERATE
+    if _MODEL_PATCHED:
+        return
+
+    original = getattr(tau_llm_utils, 'generate', None)
+    if original is None:
+        raise RuntimeError('tau2.utils.llm_utils.generate not found')
+
+    # Replace on the source module first
+    if original is not new_generate:
+        tau_llm_utils.generate = new_generate
+
+    # Fan-out to all tau2 submodules that may hold a direct reference
+    for mod_name, mod in list(sys.modules.items()):
+        if not (isinstance(mod_name, str) and mod_name.startswith('tau2')):
+            continue
+        mod_obj = sys.modules.get(mod_name)
+        if mod_obj is None:
+            continue
+        try:
+            # Common direct binding: `generate` at module top-level
+            if getattr(mod_obj, 'generate', None) is original:
+                setattr(mod_obj, 'generate', new_generate)
+            # Replace any other aliases that equal the original function
+            for attr, val in list(vars(mod_obj).items()):
+                if val is original:
+                    setattr(mod_obj, attr, new_generate)
+        except Exception:
+            # Best-effort: ignore modules that disallow setattr or have weird loaders
+            pass
+
+    _ORIGINAL_TAU2_GENERATE = original
+    _MODEL_PATCHED = True
+
+
+@run_once
+def build_model(agent_model, adapter_instance):
+
+    user_server = get_model(
+        model=adapter_instance.user_model,
+        eval_type=EvalType.SERVICE,
+        base_url=adapter_instance.api_base,
+        api_key=adapter_instance.api_key,
+        config=GenerateConfig(**adapter_instance.generation_config)
+    )
+    MODEL_DICT['user'] = user_server
+    MODEL_DICT['agent'] = agent_model
+    # Patch Tau2 generate function for `from ... import generate` consumers
+    _patch_tau2_generate(patched_generate)
+
+
+def patched_generate(
+    model: str,
+    messages: List[Message],
+    tools: Optional[List[Tool]] = None,
+    tool_choice: Optional[Any] = None,
+    **kwargs: Any,
+) -> AssistantMessage:
+    """
+    Generate a response via an OpenAI-compatible /chat/completions call.
+
+    - Reads EVALSCOPE_API_KEY and EVALSCOPE_BASE_URL from environment.
+    - Uses OpenAI chat format for messages/tools/tool_choice.
+    - Returns Tau2 AssistantMessage with optional tool_calls and usage.
+    """
+    global MODEL_DICT
+
+    oa_model = MODEL_DICT.get(model)
+    assert oa_model is not None, f'Model {model} not found in MODEL_DICT'
+
+    oa_messages = to_litellm_messages(messages)
+    tools = [tool.openai_schema for tool in tools] if tools else None
+    if tools and tool_choice is None:
+        tool_choice = 'auto'
+
+    # Perform request
+    completion = oa_model.generate(
+        input=[dict_to_chat_message(msg) for msg in oa_messages],
+        tools=[ToolInfo.model_validate(tool['function']) for tool in tools] if tools else None,
+        tool_choice=tool_choice,
+    )
+
+    oa_choices = openai_chat_choices(completion.choices, include_reasoning=False)
+    choice = oa_choices[0]
+    msg = choice.message
+
+    tool_calls = msg.tool_calls or []
+    tool_calls = [
+        ToolCall(
+            id=tool_call.id,
+            name=tool_call.function.name,
+            arguments=json.loads(tool_call.function.arguments),
+        ) for tool_call in tool_calls
+    ]
+    tool_calls = tool_calls or None
+    usage = completion.usage.model_dump(exclude_none=True)
+
+    return AssistantMessage(
+        role='assistant',
+        content=msg.content,
+        tool_calls=tool_calls,
+        cost=None,
+        usage=usage,
+        raw_data=completion.model_dump(),
+    )
+
+
+def predict(model: Model, sample: Sample, adapter_instance) -> ModelOutput:
+
+    build_model(agent_model=model, adapter_instance=adapter_instance)
+
+    domain = sample.subset_key
+    task = Task.model_validate(sample.metadata)
+    res = run_task(
+        domain=domain,
+        task=task,
+        agent='llm_agent_gt',
+        user='user_simulator',
+        llm_agent='agent',
+        llm_user='user',
+    )
+
+    sample.metadata['task_result'] = res.reward_info.model_dump()
+    return ModelOutput(
+        model=model.name,
+        choices=[ChatCompletionChoice.from_content(res.model_dump_json(indent=2))],
+    )
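The `_patch_tau2_generate` helper above has to rebind more than `tau2.utils.llm_utils.generate`: any tau2 module that did `from ... import generate` holds its own reference, so the patch fans out over `sys.modules`. A stripped-down sketch of that rebinding pattern, with generic names not tied to tau2 or evalscope:

    import sys
    import types

    def fan_out_patch(pkg_prefix, original, replacement):
        """Rebind every module-level alias of `original` in modules under `pkg_prefix`."""
        for mod_name, mod in list(sys.modules.items()):
            if not (isinstance(mod, types.ModuleType) and mod_name.startswith(pkg_prefix)):
                continue
            for attr, val in list(vars(mod).items()):
                if val is original:  # identity check: only true aliases are touched
                    setattr(mod, attr, replacement)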
@@ -0,0 +1,146 @@
+import os
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import DictDataLoader
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='tau2_bench',
+        pretty_name='τ²-bench',
+        tags=[Tags.FUNCTION_CALLING, Tags.REASONING, Tags.AGENT],
+        description='τ²-bench (Tau Squared Bench) is an extension and enhancement of the original '
+        'τ-bench (Tau Bench), which is a benchmark designed to evaluate conversational AI agents '
+        'that interact with users through domain-specific API tools and guidelines. '
+        'Please install it with `pip install git+https://github.com/sierra-research/tau2-bench@v0.2.0` '
+        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/tau2_bench.html)',  # noqa: E501
+        dataset_id='evalscope/tau2-bench-data',
+        subset_list=['airline', 'retail', 'telecom'],
+        aggregation='mean_and_pass_hat_k',
+        eval_split='test',
+        extra_params={
+            'user_model': 'qwen-plus',
+            'api_key': 'EMPTY',
+            'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'generation_config': {
+                'temperature': 0.0,
+                'max_tokens': 4096,
+            }
+        }
+    )
+)
+class Tau2BenchAdapter(AgentAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        check_import(
+            'tau2',
+            package='git+https://github.com/sierra-research/tau2-bench@v0.2.0',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
+
+        # setup user model args
+        self.user_model = self.extra_params.get('user_model', 'qwen-plus')
+        self.api_key = self.extra_params.get('api_key', 'EMPTY')
+        self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
+
+    def load(self):
+        # Load dataset
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            dataset_path = dataset_snapshot_download(dataset_name_or_path)
+
+        # Set Tau2 data dir
+        os.environ['TAU2_DATA_DIR'] = dataset_path
+
+        # Load data for each domain
+        from tau2.agent.llm_agent import LLMGTAgent
+        from tau2.registry import registry
+
+        data_dict = defaultdict(dict)
+        for domain_name in self.subset_list:
+            logger.info(f'Loading Tau2-Bench environment: {domain_name}')
+            # Get tasks
+            task_loader = registry.get_tasks_loader(domain_name)
+            tasks = task_loader()
+            tasks = [task for task in tasks if LLMGTAgent.check_valid_task(task)]
+            tasks = [task.model_dump(exclude_unset=True) for task in tasks]
+
+            # load dataset
+            dataset = DictDataLoader(
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
+            ).load()
+
+            data_dict[domain_name] = dataset
+
+        test_dataset = DatasetDict(data_dict)
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['description']['purpose'] or '')],
+            target='',  # Will use the record for evaluation
+            subset_key=record['user_scenario']['instructions']['domain'],
+            metadata=record  # Store the full record for evaluation
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        from .generation import predict
+        return predict(model, sample, adapter_instance=self)
+
+    def match_score(self, original_prediction: str, filtered_prediction: str, reference: str, task_state) -> Score:
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        try:
+            # Parse the prediction to get the reward
+            task_result = task_state.metadata['task_result']
+            reward = task_result['reward']
+
+            score.value = {
+                'acc': float(reward),
+            }
+            score.explanation = f'Task completed with reward: {reward}'
+            score.metadata = {
+                'task_result': task_result,
+            }
+            score.main_score_name = 'acc'
+
+        except Exception as e:
+            score.value = {'acc': 0.0}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+            score.main_score_name = 'acc'
+
+        return score
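For reference, an adapter registered this way is normally driven through evalscope's task entry point. The sketch below is hedged: endpoints and keys are placeholders, and the exact `dataset_args` nesting should be checked against the Usage Example link in the benchmark description; the `extra_params` values simply mirror the defaults declared in the hunk above.

    from evalscope import TaskConfig, run_task

    # Hedged sketch: placeholder model/endpoint/key values.
    task_cfg = TaskConfig(
        model='my-agent-model',                # placeholder agent model name
        api_url='https://example.com/v1',      # placeholder OpenAI-compatible endpoint
        api_key='EMPTY',
        eval_type='service',
        datasets=['tau2_bench'],
        dataset_args={
            'tau2_bench': {
                'extra_params': {
                    'user_model': 'qwen-plus',
                    'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                    'api_key': 'EMPTY',
                    'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
                }
            }
        },
    )
    run_task(task_cfg)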
File without changes
@@ -0,0 +1,147 @@
+from typing import Any, Dict, List, Optional
+
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import dict_to_chat_message
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.model.model_output import ChatCompletionChoice
+from evalscope.api.tool import ToolInfo
+from evalscope.models.utils.openai import openai_chat_choices
+from evalscope.utils.function_utils import run_once
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@run_once
+def _patch_agent_solve(model: Model):
+    """Patch ToolCallingAgent.solve method to use custom model configuration"""
+    from tau_bench.agents.tool_calling_agent import ToolCallingAgent, message_to_action
+    from tau_bench.envs.base import Env
+    from tau_bench.types import RESPOND_ACTION_NAME, Action, SolveResult
+
+    def patched_solve(
+        self,
+        env: Env,
+        task_index: Optional[int] = None,
+        max_num_steps: int = 30,
+    ) -> SolveResult:
+        env_reset_res = env.reset(task_index=task_index)
+        obs = env_reset_res.observation
+        info = env_reset_res.info.model_dump()
+        reward = 0.0
+        messages: List[Dict[str, Any]] = [
+            {
+                'role': 'system',
+                'content': self.wiki
+            },
+            {
+                'role': 'user',
+                'content': obs
+            },
+        ]
+
+        for step_index in range(max_num_steps):
+            res = model.generate(
+                input=[dict_to_chat_message(msg) for msg in messages],
+                tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
+            )
+            oai_res = openai_chat_choices(res.choices, include_reasoning=False)
+
+            next_message = oai_res[0].message.model_dump(exclude_none=True)
+
+            action = message_to_action(next_message)
+
+            env_response = env.step(action)
+            reward = env_response.reward
+            info = {**info, **env_response.info.model_dump()}
+
+            if action.name != RESPOND_ACTION_NAME:
+                next_message['tool_calls'] = next_message['tool_calls'][:1]
+                messages.extend([
+                    next_message,
+                    {
+                        'role': 'tool',
+                        'tool_call_id': next_message['tool_calls'][0]['id'],
+                        'name': next_message['tool_calls'][0]['function']['name'],
+                        'content': env_response.observation,
+                    },
+                ])
+            else:
+                messages.extend([
+                    next_message,
+                    {
+                        'role': 'user',
+                        'content': env_response.observation
+                    },
+                ])
+            logger.debug(f'Task: {task_index} Step: {step_index} finished')
+
+            if env_response.done:
+                break
+
+        return SolveResult(
+            reward=reward,
+            info=info,
+            messages=messages,
+            total_cost=0,
+        )
+
+    ToolCallingAgent.solve = patched_solve
+
+    return 'ToolCallingAgent.solve patched successfully'
+
+
+def predict(model: Model, sample: Sample) -> ModelOutput:
+    """
+    Generate predictions for tau_bench tasks using the model.
+
+    Args:
+        model: The model to use for prediction
+        sample: The sample containing task metadata
+
+    Returns:
+        ModelOutput containing the prediction results
+    """
+    from tau_bench.agents.tool_calling_agent import ToolCallingAgent
+    from tau_bench.envs import get_env
+
+    _patch_agent_solve(model)
+    try:
+        # Extract task information from sample metadata
+        task_data = sample.metadata
+        env_name = task_data['env_name']
+        task_index = task_data['task_index']
+
+        # Direct call to tau_bench_server adapter's solve method
+        # This method can be implemented to solve specific tasks in the TauBench environment
+        isolated_env = get_env(
+            env_name=env_name,
+            user_strategy='llm',
+            user_model='dummy',  # Use dummy model to prevent errors
+            user_provider='openai',  # Use dummy provider to prevent errors
+            task_split='test',
+            task_index=task_index,
+        )
+        agent = ToolCallingAgent(
+            tools_info=isolated_env.tools_info,
+            wiki=isolated_env.wiki,
+            model='dummy',  # Use dummy model to prevent errors
+            provider='dummy',  # Use dummy provider to prevent errors
+            temperature=0,  # dummy temperature to prevent errors
+        )
+
+        res = agent.solve(env=isolated_env, task_index=task_index)
+
+        sample.metadata['task_result'] = res.model_dump(exclude_none=True)
+        return ModelOutput(
+            model=model.name,
+            choices=[ChatCompletionChoice.from_content(res.model_dump_json(indent=2))],
+        )
+
+    except Exception as e:
+        logger.error(f'Error in tau_bench prediction: {str(e)}')
+        sample.metadata['task_result'] = {'reward': 0, 'error': str(e)}
+        return ModelOutput(
+            model=model.name,
+            choices=[ChatCompletionChoice.from_content('')],
+        )
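Both generation modules guard their monkey patches with the `run_once` decorator from `evalscope.utils.function_utils`, so patching happens a single time even though `predict` runs once per sample. An illustrative re-implementation of that call-once pattern (a sketch of the idea, not evalscope's actual code):

    import functools

    def run_once(func):
        """Illustrative sketch: run `func` on the first call, then reuse its result."""
        cache = {}

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if 'result' not in cache:
                cache['result'] = func(*args, **kwargs)
            return cache['result']

        return wrapper

    @run_once
    def apply_patch():
        print('patching...')  # executes only on the first call
        return 'patched'

    apply_patch()  # prints 'patching...'
    apply_patch()  # cached; no second print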