evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/api/model/model_output.py
@@ -0,0 +1,285 @@
import uuid
from pydantic import BaseModel, Field, JsonValue, model_validator
from typing import Any, Dict, List, Literal, Optional, Type, Union

from evalscope.api.messages import ChatMessageAssistant, Content
from evalscope.api.tool import ToolCall, ToolFunction


class ModelUsage(BaseModel):
    """Token usage for completion."""

    input_tokens: int = Field(default=0)
    """Total input tokens used."""

    output_tokens: int = Field(default=0)
    """Total output tokens used."""

    total_tokens: int = Field(default=0)
    """Total tokens used."""

    input_tokens_cache_write: Optional[int] = Field(default=None)
    """Number of tokens written to the cache."""

    input_tokens_cache_read: Optional[int] = Field(default=None)
    """Number of tokens retrieved from the cache."""

    reasoning_tokens: Optional[int] = Field(default=None)
    """Number of tokens used for reasoning."""

    def __add__(self, other: 'ModelUsage') -> 'ModelUsage':

        def optional_sum(a: Optional[int], b: Optional[int]) -> Optional[int]:
            if a is not None and b is not None:
                return a + b
            if a is not None:
                return a
            if b is not None:
                return b
            return None

        return ModelUsage(
            input_tokens=self.input_tokens + other.input_tokens,
            output_tokens=self.output_tokens + other.output_tokens,
            total_tokens=self.total_tokens + other.total_tokens,
            input_tokens_cache_write=optional_sum(self.input_tokens_cache_write, other.input_tokens_cache_write),
            input_tokens_cache_read=optional_sum(self.input_tokens_cache_read, other.input_tokens_cache_read),
            reasoning_tokens=optional_sum(self.reasoning_tokens, other.reasoning_tokens),
        )


StopReason = Literal[
    'stop',
    'max_tokens',
    'model_length',
    'tool_calls',
    'content_filter',
    'unknown',
]
"""Reason that the model stopped or failed to generate."""


class TopLogprob(BaseModel):
    """List of the most likely tokens and their log probability, at this token position."""

    token: str
    """The top-kth token represented as a string."""

    logprob: float
    """The log probability value of the model for the top-kth token."""

    bytes: Optional[List[int]] = Field(default=None)
    """The top-kth token represented as a byte array (a list of integers)."""


class Logprob(BaseModel):
    """Log probability for a token."""

    token: str
    """The predicted token represented as a string."""

    logprob: float
    """The log probability value of the model for the predicted token."""

    bytes: Optional[List[int]] = Field(default=None)
    """The predicted token represented as a byte array (a list of integers)."""

    top_logprobs: Optional[List[TopLogprob]] = Field(default=None)
    """If the `top_logprobs` argument is greater than 0, this will contain an ordered list of the top K most likely tokens and their log probabilities."""  # noqa: E501


class Logprobs(BaseModel):
    """Log probability information for a completion choice."""

    content: List[Logprob]
    """a (num_generated_tokens,) length list containing the individual log probabilities for each generated token."""


class ChatCompletionChoice(BaseModel):
    """Choice generated for completion."""

    message: ChatMessageAssistant
    """Assistant message."""

    stop_reason: StopReason = Field(default='unknown')
    """Reason that the model stopped generating."""

    logprobs: Optional[Logprobs] = Field(default=None)
    """Logprobs."""

    @model_validator(mode='before')
    @classmethod
    def migrate_stop_reason(cls: Type['ChatCompletionChoice'], values: Dict[str, Any]) -> Dict[str, Any]:
        if 'stop_reason' in values:
            stop_reason = values['stop_reason']
            if stop_reason == 'length':
                values['stop_reason'] = 'max_tokens'

        return values

    @classmethod
    def from_content(cls, content: Union[str, List[Content]]) -> 'ChatCompletionChoice':
        """Create a ChatCompletionChoice from content string."""
        return cls(
            message=ChatMessageAssistant(content=content),
            stop_reason='stop',
        )


class ModelOutput(BaseModel):
    """Output from model generation."""

    model: str = Field(default_factory=str)
    """Model used for generation."""

    choices: List[ChatCompletionChoice] = Field(default=[])
    """Completion choices."""

    usage: Optional[ModelUsage] = Field(default=None)
    """Model token usage"""

    time: Optional[float] = Field(default=None)
    """Time elapsed (in seconds) for call to generate."""

    metadata: Optional[Dict[str, Any]] = Field(default=None)
    """Additional metadata associated with model output."""

    error: Optional[str] = Field(default=None)
    """Error message in the case of content moderation refusals."""

    @property
    def empty(self) -> bool:
        return len(self.choices) == 0

    @property
    def stop_reason(self) -> StopReason:
        """First message stop reason."""
        return self.choices[0].stop_reason

    @property
    def message(self) -> ChatMessageAssistant:
        """First message choice."""
        return self.choices[0].message

    @property
    def completion(self) -> str:
        """Text of first message choice text."""
        if len(self.choices) > 0:
            return self.choices[0].message.text
        else:
            return '\n'.join(choice.message.text for choice in self.choices)

    @completion.setter
    def completion(self, completion: str) -> None:
        """Set the text of the first message choice.

        Args:
            completion (str): Text for first message.
        """
        if len(self.choices) > 0:
            self.choices[0].message.text = completion
        else:
            self.choices.append(
                ChatCompletionChoice(
                    message=ChatMessageAssistant(content=completion, model=self.model),
                    stop_reason='stop',
                )
            )

    @property
    def completions(self) -> List[str]:
        """List of all message choices text."""
        return [choice.message.text for choice in self.choices]

    @staticmethod
    def from_content(
        model: str,
        content: Union[str, List[Content]],
        stop_reason: StopReason = 'stop',
        error: Optional[str] = None,
    ) -> 'ModelOutput':
        """Create ModelOutput from simple text content.

        Args:
            model: Model name.
            content: Text content from generation.
            stop_reason: Stop reason for generation.
            error: Error message.
        """
        return ModelOutput(
            model=model,
            choices=[
                ChatCompletionChoice(
                    message=ChatMessageAssistant(content=content, model=model, source='generate'),
                    stop_reason=stop_reason,
                )
            ],
            error=error,
        )

    @staticmethod
    def for_tool_call(
        model: str,
        tool_name: str,
        tool_arguments: Dict[str, Any],
        internal: Optional[JsonValue] = None,
        tool_call_id: Optional[str] = None,
        content: Optional[str] = None,
    ) -> 'ModelOutput':
        """
        Returns a ModelOutput for requesting a tool call.

        Args:
            model: model name
            tool_name: The name of the tool.
            internal: The model's internal info for the tool (if any).
            tool_arguments: The arguments passed to the tool.
            tool_call_id: Optional ID for the tool call. Defaults to a random UUID.
            content: Optional content to include in the message. Defaults to "tool call for tool {tool_name}".

        Returns:
            A ModelOutput corresponding to the tool call
        """
        if content is None:
            content = f'tool call for tool {tool_name}'

        if tool_call_id is None:
            tool_call_id = f'for_tool_call_{uuid.uuid4()}'

        return ModelOutput(
            model=model,
            choices=[
                ChatCompletionChoice(
                    message=ChatMessageAssistant(
                        content=content,
                        model=model,
                        source='generate',
                        tool_calls=[
                            ToolCall(
                                id=tool_call_id,
                                internal=internal,
                                function=ToolFunction(
                                    name=tool_name,
                                    arguments=tool_arguments,
                                )
                            )
                        ],
                    ),
                    stop_reason='tool_calls',
                )
            ],
        )


def as_stop_reason(reason: Optional[str]) -> StopReason:
    """Encode common reason strings into standard StopReason."""
    if reason in ['stop', 'eos']:
        return 'stop'
    elif reason == 'length':
        return 'max_tokens'
    elif reason in ['tool_calls', 'function_call']:
        return 'tool_calls'
    elif reason in ['content_filter', 'model_length', 'max_tokens']:
        return reason
    else:
        return 'unknown'
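Usage sketch (not part of the diff): the helper names below come from the hunk above; the import path is assumed from the new file location evalscope/api/model/model_output.py.

# Illustrative only -- module path assumed from the file location above.
from evalscope.api.model.model_output import ModelOutput, ModelUsage, as_stop_reason

# Build an output from plain text; `completion` and `stop_reason` read the first choice.
output = ModelOutput.from_content(model='demo-model', content='Hello, world!')
assert output.completion == 'Hello, world!'
assert output.stop_reason == 'stop'

# Build an output that requests a tool call instead of text.
tool_output = ModelOutput.for_tool_call(
    model='demo-model',
    tool_name='search',
    tool_arguments={'query': 'evalscope'},
)
assert tool_output.stop_reason == 'tool_calls'
assert tool_output.message.tool_calls[0].function.name == 'search'

# ModelUsage instances can be summed; optional counters are combined when present.
usage = ModelUsage(input_tokens=10, output_tokens=5, total_tokens=15) \
    + ModelUsage(input_tokens=3, output_tokens=2, total_tokens=5)
assert usage.total_tokens == 20

# Provider-specific finish reasons are normalized to StopReason values.
assert as_stop_reason('length') == 'max_tokens'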
evalscope/api/registry.py
@@ -0,0 +1,182 @@
import copy
from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union

if TYPE_CHECKING:
    from evalscope.api.benchmark import BenchmarkMeta, DataAdapter
    from evalscope.api.filter import Filter
    from evalscope.api.metric import Aggregator, Metric
    from evalscope.api.model.model import ModelAPI
    from evalscope.config import TaskConfig

# BEGIN: Registry for benchmarks
# Registry for benchmarks, allowing dynamic registration and retrieval of benchmark metadata and data adapters.
BENCHMARK_REGISTRY: Dict[str, 'BenchmarkMeta'] = {}


def register_benchmark(metadata: 'BenchmarkMeta'):
    """Register a benchmark with its metadata."""

    def register_wrapper(data_adapter: Type['DataAdapter']):
        if metadata.name in BENCHMARK_REGISTRY:
            raise ValueError(f'Benchmark {metadata.name} already registered')
        metadata.data_adapter = data_adapter
        BENCHMARK_REGISTRY[metadata.name] = metadata
        return data_adapter

    return register_wrapper


def get_benchmark(name: str, config: Optional['TaskConfig'] = None) -> 'DataAdapter':
    """
    Retrieve a registered benchmark by name.

    Args:
        name (str): The name of the benchmark.
        config (Optional['TaskConfig']): The task configuration.
        dataset_args (Optional[dict]): The dataset-specific arguments.

    """
    # copy to avoid modifying the original metadata
    metadata = copy.deepcopy(BENCHMARK_REGISTRY.get(name))
    if not metadata:
        raise ValueError(f'Benchmark {name} not found, available benchmarks: {list(sorted(BENCHMARK_REGISTRY.keys()))}')

    # Update metadata with dataset-specific configuration
    if config is not None:
        metadata._update(config.dataset_args.get(name, {}))
    # Return the data adapter initialized with the benchmark metadata
    data_adapter_cls = metadata.data_adapter
    return data_adapter_cls(benchmark_meta=metadata, task_config=config)


# END: Registry for benchmarks

# BEGIN: Registry for model APIs
# Registry for model APIs, allowing dynamic registration and retrieval of model API classes.
MODEL_APIS: Dict[str, Type['ModelAPI']] = {}


def register_model_api(name: str):
    """
    Decorator to register a model API class with a given name.

    :param name: The name of the model API.
    """

    def decorator(api_class: Type['ModelAPI']):
        if name in MODEL_APIS:
            raise ValueError(f"Model API '{name}' is already registered.")
        MODEL_APIS[name] = api_class
        return api_class

    return decorator


def get_model_api(name: str) -> Type['ModelAPI']:
    """
    Retrieve a registered model API class by name.

    :param name: The name of the model API.
    :return: The model API class.
    """
    if name not in MODEL_APIS:
        raise ValueError(f"Model API '{name}' is not registered. Available model APIs: {list(MODEL_APIS.keys())}")

    wrapped = MODEL_APIS[name]
    if not isinstance(wrapped, type):
        return wrapped()
    else:
        return wrapped


# END: Registry for model APIs

# BEGIN: Registry for metrics
METRIC_REGISTRY: Dict[str, Type['Metric']] = {}


def register_metric(name: str):

    def decorate(fn):
        if name in METRIC_REGISTRY:
            raise ValueError(f"Metric named '{name}' conflicts with existing registered metric!")

        METRIC_REGISTRY[name] = fn
        return fn

    return decorate


def get_metric(name: str) -> Type['Metric']:
    if name in METRIC_REGISTRY:
        return METRIC_REGISTRY[name]
    else:
        raise ValueError(
            f"Metric '{name}' not found in the registry. Available metrics: {list(METRIC_REGISTRY.keys())}"
        )


# END: Registry for metrics

# BEGIN: Registry for filters

FILTER_REGISTRY: Dict[str, Type['Filter']] = {}


def register_filter(name):

    def decorate(cls):
        if name in FILTER_REGISTRY:
            raise ValueError(f'Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}')
        FILTER_REGISTRY[name] = cls
        return cls

    return decorate


def get_filter(filter_name: str) -> Type['Filter']:
    if filter_name not in FILTER_REGISTRY:
        raise KeyError(
            f"Filter '{filter_name}' not found in the registry. Available filters: {list(FILTER_REGISTRY.keys())}"
        )
    return FILTER_REGISTRY[filter_name]


# END: Registry for filters

# BEGIN: Registry for aggregation functions
AGGREGATION_REGISTRY: Dict[str, Type['Aggregator']] = {}


def register_aggregation(name: str):
    """
    Decorator to register an aggregation function with a given name.

    :param name: The name of the aggregation function.
    """

    def decorator(aggregation_fn: 'Aggregator'):
        if name in AGGREGATION_REGISTRY:
            raise ValueError(f"Aggregation function '{name}' is already registered.")
        AGGREGATION_REGISTRY[name] = aggregation_fn
        return aggregation_fn

    return decorator


def get_aggregation(name: str) -> Type['Aggregator']:
    """
    Retrieve a registered aggregation function by name.

    :param name: The name of the aggregation function.
    :return: The aggregation function.
    """
    if name not in AGGREGATION_REGISTRY:
        raise ValueError(
            f"Aggregation function '{name}' is not registered. "
            f'Available aggregations: {list(AGGREGATION_REGISTRY.keys())}'
        )
    return AGGREGATION_REGISTRY[name]


# END: Registry for aggregation functions
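Usage sketch (not part of the diff): how the decorator-based registries above are meant to be used. The import path follows the new file location evalscope/api/registry.py; the decorated classes are hypothetical placeholders, not real evalscope Metric/Filter implementations.

# Illustrative only -- placeholder classes stand in for real Metric/Filter implementations.
from evalscope.api.registry import get_filter, get_metric, register_filter, register_metric

@register_metric('exact_match_demo')
class ExactMatchDemo:  # a real metric would implement evalscope's Metric interface
    pass

@register_filter('lowercase_demo')
class LowercaseDemo:  # a real filter would implement evalscope's Filter interface
    pass

assert get_metric('exact_match_demo') is ExactMatchDemo
assert get_filter('lowercase_demo') is LowercaseDemo

# Unknown names fail loudly with the list of registered entries:
#   get_metric('missing')  -> ValueError
#   get_filter('missing')  -> KeyError
# Registering the same name twice also raises ValueError.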
evalscope/api/tool/tool_call.py
@@ -0,0 +1,101 @@
import json
from pydantic import BaseModel, Field, JsonValue, field_validator
from typing import Any, Callable, Dict, List, Literal, Optional, Union


class ToolFunction(BaseModel):
    """Indicate that a specific tool function should be called."""

    name: str
    """The name of the tool function to call."""

    arguments: Dict[str, Any]
    """The arguments of the tool function to call"""

    @field_validator('arguments', mode='before')
    @classmethod
    def parse_arguments(cls, v):
        if isinstance(v, str):
            try:
                v = json.loads(v)
            except Exception as e:
                raise ValueError(f'arguments field string is not valid JSON: {e}')
        if not isinstance(v, dict):
            raise ValueError('arguments must be a dict or a JSON string representing a dict')
        return v


class ToolCallContent(BaseModel):
    """Content to include in tool call view."""

    title: Optional[str] = Field(default=None)
    """Optional (plain text) title for tool call content."""

    format: Literal['text', 'markdown']
    """Format (text or markdown)."""

    content: str
    """Text or markdown content."""


class ToolCallView(BaseModel):
    """Custom view of a tool call.

    Both `context` and `call` are optional. If `call` is not specified
    then the view will default to a syntax highlighted Python function call.
    """

    context: Optional[ToolCallContent] = Field(default=None)
    """Context for the tool call (i.e. current tool state)."""

    call: Optional[ToolCallContent] = Field(default=None)
    """Custom representation of tool call."""


class ToolCall(BaseModel):
    id: str
    """Unique identifier for tool call."""

    function: ToolFunction
    """Function to call."""

    internal: Optional[JsonValue] = Field(default=None)
    """Model provider specific payload - typically used to aid transformation back to model types."""

    parse_error: Optional[str] = Field(default=None)
    """Error which occurred parsing tool call."""

    view: Optional[ToolCallContent] = Field(default=None)
    """Custom view of tool call input."""

    type: Optional[str] = Field(default=None)
    """Tool call type (deprecated)."""


class ToolCallError(BaseModel):
    """Error raised by a tool call."""

    type: Literal[
        'parsing',
        'timeout',
        'unicode_decode',
        'permission',
        'file_not_found',
        'is_a_directory',
        'limit',
        'approval',
        'unknown',
    ]
    """Error type."""

    message: str
    """Error message."""


ToolChoice = Union[Literal['auto', 'any', 'none'], ToolFunction]
"""Specify which tool to call.

"auto" means the model decides; "any" means use at least one tool,
"none" means never call a tool; ToolFunction instructs the model
to call a specific function.
"""
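Usage sketch (not part of the diff): the `arguments` validator on ToolFunction accepts either a dict or a JSON string, as the hunk above implements. The import path mirrors the one already used by model_output.py (`from evalscope.api.tool import ...`).

# Illustrative only.
from evalscope.api.tool import ToolCall, ToolFunction

# A JSON string is parsed into a dict by the field validator.
fn = ToolFunction(name='search', arguments='{"query": "evalscope", "top_k": 3}')
assert fn.arguments == {'query': 'evalscope', 'top_k': 3}

# `id` and `function` are the only required ToolCall fields.
call = ToolCall(id='call_1', function=fn)
assert call.function.name == 'search'

# A string that is not a JSON object is rejected by the validator
# (pydantic surfaces the ValueError as a ValidationError):
#   ToolFunction(name='search', arguments='not json')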