evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/models/modelscope.py (new file)
@@ -0,0 +1,455 @@
+from __future__ import annotations
+
+import copy
+import functools
+import json
+import time
+import torch  # type: ignore
+from concurrent.futures import Future
+from dataclasses import dataclass
+from logging import getLogger
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+from queue import Empty, Queue
+from threading import Thread
+from torch import Tensor  # type: ignore
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+from typing_extensions import override
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ModelScopeAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        device_map = collect_model_arg('device_map')
+        torch_dtype = collect_model_arg('precision')
+        tokenizer_path = collect_model_arg('tokenizer_path')
+        self.chat_template = collect_model_arg('chat_template')
+        self.tokenizer_call_args = collect_model_arg('tokenizer_call_args')
+        self.enable_thinking = collect_model_arg('enable_thinking')
+        if self.tokenizer_call_args is None:
+            self.tokenizer_call_args = {}
+
+        # device
+        self.device = device_map or get_device()
+
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+
+        # model
+        model_name_or_path = model_path or model_name
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path,
+            device_map=self.device,
+            token=self.api_key,
+            torch_dtype=self.torch_dtype,
+            trust_remote_code=True,
+            **model_args
+        )
+
+        # tokenizer
+        tokenizer_name_or_path = tokenizer_path or model_name_or_path
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True)
+        # LLMs generally don't have a pad token and we need one for batching
+        if self.tokenizer.pad_token is None:
+            if self.tokenizer.eos_token is not None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            else:
+                # add a pad token
+                self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        # set padding side to left for LLMs
+        self.tokenizer.padding_side = 'left'
+        # set chat template if provided
+        if self.chat_template:
+            self.tokenizer.chat_template = self.chat_template
+            logger.info(f'Using custom chat template: {self.chat_template}')
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # create chat
+        chat = self.ms_chat(input, tools)
+
+        assert isinstance(self.tokenizer_call_args, dict)
+        # prepare tokenizer
+        tokenizer = functools.partial(
+            self.tokenizer,
+            return_tensors='pt',
+            padding=True,
+            **self.tokenizer_call_args,
+        )
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.do_sample is not None:
+            kwargs['do_sample'] = config.do_sample
+        if config.n is not None:
+            if config.n > 1:
+                assert config.do_sample, 'n > 1 requires do_sample=True in GenerateConfig'
+            kwargs['num_return_sequences'] = config.n
+        if config.max_tokens is not None:
+            kwargs['max_new_tokens'] = config.max_tokens
+        if config.temperature is not None:
+            kwargs['temperature'] = config.temperature
+        if config.top_p is not None:
+            kwargs['top_p'] = config.top_p
+        if config.top_k is not None:
+            kwargs['top_k'] = config.top_k
+        if config.logprobs is not None:
+            kwargs['output_logits'] = config.logprobs
+        if 'return_dict_in_generate' in kwargs:
+            assert kwargs['return_dict_in_generate']
+        if config.stop_seqs is not None:
+            from transformers.generation import StopStringCriteria  # type: ignore
+
+            stopping_criteria = [StopStringCriteria(self.tokenizer, config.stop_seqs)]
+            kwargs['stopping_criteria'] = stopping_criteria
+
+        kwargs['return_dict_in_generate'] = True
+        generator = functools.partial(self.model.generate, **kwargs)
+
+        # prepare decoder
+        decoder = functools.partial(
+            self.tokenizer.batch_decode,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+
+        # generate
+        responses = batched_generate(
+            GenerateInput(
+                input=chat,
+                device=self.model.device,
+                tokenizer=tokenizer,
+                generator=generator,
+                decoder=decoder,
+                batch_size=config.batch_size or self.max_connections(),
+            )
+        )
+
+        choices: List[ChatCompletionChoice] = []
+        for response in responses:
+            # gather logprobs
+            final_logprobs = None
+            if config.logprobs is not None:
+                final_logprobs = extract_logprobs(
+                    response=response,
+                    top=config.top_logprobs,
+                    tokenizer=self.tokenizer,
+                )
+
+            # construct choice
+            # TODO: Handle tool calls
+            choice = ChatCompletionChoice(
+                message=ChatMessageAssistant(content=response.output, model=self.model_name, source='generate'),
+                logprobs=(Logprobs(content=final_logprobs) if final_logprobs is not None else None),
+            )
+            choices.append(choice)
+
+        # return output
+        return ModelOutput(
+            model=self.model_name,
+            choices=choices,
+            usage=ModelUsage(
+                input_tokens=response.input_tokens,
+                output_tokens=response.output_tokens,
+                total_tokens=response.total_tokens,
+            ),
+            time=response.time,
+        )
+
+    @override
+    def max_tokens(self) -> Optional[int]:
+        """Default is 2048, bump it up to a value suitable for evals."""
+        return 2048
+
+    @override
+    def max_connections(self) -> int:
+        """Effectively the batch size."""
+        return 8
+
+    def ms_chat(self, messages: List[ChatMessage], tools: List[ToolInfo]) -> str:
+        # convert to ms format
+        tools_list = []
+        ms_messages = copy.deepcopy(messages)
+        if len(tools) > 0:
+            tools_list = [json.loads(tool.model_dump_json(exclude_none=True, indent=2)) for tool in tools]
+
+        ms_messages = message_content_to_string(ms_messages)
+        # apply chat template
+        if self.tokenizer.chat_template is not None:
+            chat = self.tokenizer.apply_chat_template(
+                ms_messages,
+                add_generation_prompt=True,
+                tokenize=False,
+                tools=tools_list if len(tools_list) > 0 else None,
+                enable_thinking=self.enable_thinking,  # not all models use this, check if it is supported
+            )
+        else:
+            chat = ''
+            for message in ms_messages:
+                chat += f'{message.role}: {message.content}\n'
+        # return
+        return cast(str, chat)
+
+
+def message_content_to_string(messages: List[ChatMessage]) -> List[ChatMessage]:
+    """Convert list of content in `ChatMessageAssistant`, `ChatMessageUser` or `ChatMessageSystem` to a string."""
+    for message in messages:
+        if isinstance(message.content, list):
+            is_multimodal = any(
+                isinstance(item, (ContentAudio, ContentImage, ContentVideo)) for item in message.content
+            )
+            if is_multimodal:
+                raise NotImplementedError(
+                    'Transformer model does not support multimodal content, please provide text inputs only.'
+                )
+            message.content = message.text
+    return messages
+
+
+# return value from generate as a result of specifying return_dict_in_generate
+class ModelGenerateOutput:
+    sequences: Tensor
+    logits: tuple[Tensor]
+
+
+class Tokenizer(Protocol):
+
+    def __call__(self, input: List[str]) -> Dict[Literal['input_ids', 'attention_mask'], Tensor]:
+        ...
+
+
+class Generator(Protocol):
+
+    def __call__(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
+        ...
+
+
+class Decoder(Protocol):
+
+    def __call__(self, sequences: Tensor) -> list[str]:
+        ...
+
+
+@dataclass
+class GenerateInput:
+    input: str
+    device: str
+    tokenizer: Tokenizer
+    generator: Generator
+    decoder: Decoder
+    batch_size: int
+
+
+@dataclass
+class GenerateOutput:
+    output: str
+    input_tokens: int
+    output_tokens: int
+    total_tokens: int
+    logprobs: Optional[torch.Tensor]
+    time: float
+
+
+@dataclass
+class _QueueItem:
+    input: GenerateInput
+    future: Future[GenerateOutput]
+
+
+batch_thread: Optional[Thread] = None
+
+batch_queue: 'Queue[_QueueItem]' = Queue()
+
+
+def batched_generate(input: GenerateInput) -> List[GenerateOutput]:
+    # start the background thread if necessary
+    global batch_thread
+    if batch_thread is None:
+        batch_thread = Thread(target=process_batches, daemon=True)
+        batch_thread.start()
+
+    # enqueue the job
+    future = Future[GenerateOutput]()
+    batch_queue.put(_QueueItem(input=input, future=future))
+
+    return future.result()
+
+
+def process_batches() -> None:
+    while True:
+        # drain the queue (wait until no new messages have shown up for 2 seconds)
+        inputs: List[Tuple[GenerateInput, Future[GenerateOutput]]] = []
+        while True:
+            try:
+                input = batch_queue.get(timeout=2)
+                inputs.append((input.input, input.future))
+                if len(inputs) == input.input.batch_size:
+                    # max batch size reached
+                    break
+            except Empty:
+                # we have exhausted the queue
+                break
+
+        # see if we have any work to do
+        if len(inputs) == 0:
+            continue
+
+        try:
+            # capture the generator and decoder functions
+            start_time = time.monotonic()
+            first_input = inputs[0][0]
+            device = first_input.device
+            tokenizer = first_input.tokenizer
+            generator = first_input.generator
+            decoder = first_input.decoder
+            num_return_sequences = generator.keywords.get('num_return_sequences', 1)
+
+            # tokenize and move to device
+            tokenized_inputs = tokenizer([item[0].input for item in inputs])
+            input_ids = tokenized_inputs['input_ids']
+            attention_mask = tokenized_inputs['attention_mask']
+            input_ids = input_ids.to(device)
+            attention_mask = attention_mask.to(device)
+
+            # generate
+            with torch.inference_mode():
+                generation_outputs = cast(
+                    ModelGenerateOutput,
+                    generator(input_ids=input_ids, attention_mask=attention_mask),
+                )
+                generate_ids = generation_outputs.sequences
+                logits = generation_outputs.logits
+
+            # get logprobs from logits
+            logprobs = None
+            if logits is not None:
+                stacked_logits = torch.stack(logits).transpose(0, 1)
+                logprobs = torch.nn.functional.log_softmax(stacked_logits, dim=-1)
+
+            # decode
+            generated_tokens = generate_ids[:, input_ids.size(dim=1):]
+            if logprobs is not None:
+                assert logprobs.shape[1] == generated_tokens.shape[1]
+            outputs = decoder(sequences=generated_tokens)
+
+            # call back futures
+            total_time = time.monotonic() - start_time
+            for input_index in range(len(inputs)):
+                choices: List[GenerateOutput] = []
+                # handle input
+                future = inputs[input_index][1]
+                input_tokens = input_ids[input_index].shape[-1]
+                # handle choices
+                for choice_index in range(num_return_sequences):
+                    output_index = input_index * num_return_sequences + choice_index
+                    # handle out of
+                    output = outputs[output_index]
+                    output_tokens = generate_ids[output_index].shape[-1] - input_tokens
+                    logprobs_tensor = logprobs[output_index] if logprobs is not None else None
+                    # create the output
+                    choices.append(
+                        GenerateOutput(
+                            output=output,
+                            input_tokens=input_tokens,
+                            output_tokens=output_tokens,
+                            total_tokens=input_tokens + output_tokens,
+                            logprobs=logprobs_tensor,
+                            time=total_time,
+                        )
+                    )
+
+                # asyncio futures are not thread safe, so we need to pass the event loop
+                # down to this point, so we can mark the future as done in a thread safe manner.
+                # see: https://docs.python.org/3/library/asyncio-dev.html#concurrency-and-multithreading
+                future.set_result(choices)
+
+        except Exception as ex:
+            for inp in inputs:
+                future = inp[1]
+                future.set_exception(ex)
+
+
+def extract_logprobs(
+    response: GenerateOutput,
+    top: Optional[int],
+    tokenizer,
+) -> List[Logprob]:
+    assert response.logprobs is not None
+    k = top or 1
+    topk_values, topk_inds = response.logprobs.topk(k=k, dim=-1)
+    final_logprobs = []
+    for toks, vals in zip(topk_inds, topk_values):
+        top_logprobs: List[TopLogprob] = []
+        for tok, val in zip(toks, vals):
+            # TODO: you get byte artifacts converting single ids to tokens like this...
+            # but `tokenizer.decode` strips spaces. There must be a better way to do this.
+            token_str = tokenizer.convert_ids_to_tokens(tok.item())
+            top_logprobs.append(TopLogprob(
+                token=token_str,
+                logprob=val,
+                bytes=list(map(ord, token_str)),
+            ))
+        final_logprobs.append(
+            Logprob(
+                token=top_logprobs[0].token,
+                logprob=top_logprobs[0].logprob,
+                bytes=top_logprobs[0].bytes,
+                top_logprobs=top_logprobs,
+            )
+        )
+    return final_logprobs
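
For orientation, a minimal sketch of how the ModelScope-backed API above might be driven directly. The model id, the `precision` value, the `tool_choice` literal, and the assumption that `ChatMessageUser` is exported from `evalscope.api.messages` are illustrative only; they are not part of the diff.

# Hypothetical usage sketch (not part of the diff): load a local model through
# ModelScopeAPI and run one generation. Constructor keywords mirror collect_model_arg().
from evalscope.api.messages import ChatMessageUser  # assumed export
from evalscope.api.model import GenerateConfig
from evalscope.models.modelscope import ModelScopeAPI

api = ModelScopeAPI(
    model_name='Qwen/Qwen2.5-0.5B-Instruct',   # illustrative model id
    config=GenerateConfig(max_tokens=128),
    precision='bfloat16',                      # mapped to torch.bfloat16 via DTYPE_MAP above
)
output = api.generate(
    input=[ChatMessageUser(content='Say hello in one sentence.')],
    tools=[],
    tool_choice='none',                        # assumed ToolChoice value
    config=GenerateConfig(max_tokens=128),
)
print(output.choices[0].message.content)
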
evalscope/models/openai_compatible.py (new file)
@@ -0,0 +1,144 @@
+import os
+from openai import APIStatusError, BadRequestError, OpenAI, PermissionDeniedError, UnprocessableEntityError
+from openai._types import NOT_GIVEN
+from openai.types.chat import ChatCompletion
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from evalscope.api.messages import ChatMessage
+from evalscope.api.model import ChatCompletionChoice, GenerateConfig, ModelAPI, ModelOutput
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils import get_logger
+from evalscope.utils.argument_utils import get_supported_params
+from .utils.openai import (
+    chat_choices_from_openai,
+    collect_stream_response,
+    model_output_from_openai,
+    openai_chat_messages,
+    openai_chat_tool_choice,
+    openai_chat_tools,
+    openai_completion_params,
+    openai_handle_bad_request,
+)
+
+logger = get_logger()
+
+
+class OpenAICompatibleAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ) -> None:
+
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # use service prefix to lookup api_key
+        self.api_key = api_key or os.environ.get('EVALSCOPE_API_KEY', None)
+        assert self.api_key, f'API key for {model_name} not found'
+
+        # use service prefix to lookup base_url
+        self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
+        assert self.base_url, f'Base URL for {model_name} not found'
+
+        # remove trailing slash from base_url
+        self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
+        # create http client
+        self.client = OpenAI(
+            api_key=self.api_key,
+            base_url=self.base_url,
+            **model_args,
+        )
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+        # setup request and response for ModelCall
+        request: Dict[str, Any] = {}
+        response: Dict[str, Any] = {}
+
+        tools, tool_choice, config = self.resolve_tools(tools, tool_choice, config)
+
+        # get completion params (slice off service from model name)
+        completion_params = self.completion_params(
+            config=config,
+            tools=len(tools) > 0,
+        )
+
+        request = dict(
+            messages=openai_chat_messages(input),
+            tools=openai_chat_tools(tools) if len(tools) > 0 else NOT_GIVEN,
+            tool_choice=openai_chat_tool_choice(tool_choice) if len(tools) > 0 else NOT_GIVEN,
+            **completion_params,
+        )
+
+        self.validate_request_params(request)
+
+        try:
+            # generate completion and save response for model call
+            completion = self.client.chat.completions.create(**request)
+            # handle streaming response
+            if not isinstance(completion, ChatCompletion):
+                completion = collect_stream_response(completion)
+            response = completion.model_dump()
+            self.on_response(response)
+
+            # return output and call
+            choices = self.chat_choices_from_completion(completion, tools)
+            return model_output_from_openai(completion, choices)
+
+        except (BadRequestError, UnprocessableEntityError, PermissionDeniedError) as ex:
+            return self.handle_bad_request(ex)
+
+    def resolve_tools(self, tools: List[ToolInfo], tool_choice: ToolChoice,
+                      config: GenerateConfig) -> Tuple[List[ToolInfo], ToolChoice, GenerateConfig]:
+        """Provides an opportunity for concrete classes to customize tool resolution."""
+        return tools, tool_choice, config
+
+    def completion_params(self, config: GenerateConfig, tools: bool) -> Dict[str, Any]:
+        return openai_completion_params(
+            model=self.model_name,
+            config=config,
+            tools=tools,
+        )
+
+    def validate_request_params(self, params: Dict[str, Any]):
+        """Hook for subclasses to do custom request parameter validation."""
+        # Cache supported params to avoid repeated calls to inspect.signature.
+        if not hasattr(self, '_valid_params'):
+            self._valid_params = get_supported_params(self.client.chat.completions.create)
+
+        # Move unsupported parameters to extra_body.
+        extra_body = params.get('extra_body', {})
+        for key in list(params.keys()):
+            if key not in self._valid_params:
+                extra_body[key] = params.pop(key)
+
+        if extra_body:
+            params['extra_body'] = extra_body
+
+    def on_response(self, response: Dict[str, Any]) -> None:
+        """Hook for subclasses to do custom response handling."""
+        pass
+
+    def chat_choices_from_completion(self, completion: ChatCompletion,
+                                     tools: List[ToolInfo]) -> List[ChatCompletionChoice]:
+        """Hook for subclasses to do custom chat choice processing."""
+        return chat_choices_from_openai(completion, tools)
+
+    def handle_bad_request(self, ex: APIStatusError) -> Union[ModelOutput, Exception]:
+        """Hook for subclasses to do bad request handling"""
+        return openai_handle_bad_request(self.model_name, ex)