evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/api/model/generate_config.py (new file)

@@ -0,0 +1,161 @@
# flake8: noqa: E501
from copy import deepcopy
from pydantic import BaseModel, Field, model_validator
from typing import Any, Dict, List, Literal, Optional, Union

from evalscope.utils.json_schema import JSONSchema


class ResponseSchema(BaseModel):
    """Schema for model response when using Structured Output."""

    name: str
    """The name of the response schema. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64."""

    json_schema: JSONSchema
    """The schema for the response format, described as a JSON Schema object."""

    description: Optional[str] = Field(default=None)
    """A description of what the response format is for, used by the model to determine how to respond in the format."""

    strict: Optional[bool] = Field(default=None)
    """Whether to enable strict schema adherence when generating the output. If set to true, the model will always follow the exact schema defined in the schema field.
    OpenAI and Mistral only."""


class GenerateConfig(BaseModel):
    """Model generation options."""
    model_config = {'extra': 'allow'}

    timeout: Optional[int] = Field(default=None)
    """Request timeout (in seconds)."""

    batch_size: Optional[int] = Field(default=None)
    """Maximum number of concurrent connections to Model API (default is model specific) or batch size for generation."""

    stream: Optional[bool] = Field(default=None)
    """Whether to stream the response (default is model specific)."""

    max_tokens: Optional[int] = Field(default=None)
    """The maximum number of tokens that can be generated in the completion (default is model specific)."""

    top_p: Optional[float] = Field(default=None)
    """An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass."""

    temperature: Optional[float] = Field(default=None)
    """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."""

    stop_seqs: Optional[List[str]] = Field(default=None)
    """Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."""

    best_of: Optional[int] = Field(default=None)
    """Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token). vLLM only."""

    frequency_penalty: Optional[float] = Field(default=None)
    """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""

    presence_penalty: Optional[float] = Field(default=None)
    """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""

    repetition_penalty: Optional[float] = Field(default=None)
    """Exponential penalty applied to existing tokens in the generated text. 1.0 means no penalty. OpenAI, HuggingFace, and vLLM only."""

    logit_bias: Optional[Dict[int, float]] = Field(default=None)
    """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, Grok, and vLLM only."""

    seed: Optional[int] = Field(default=None)
    """Random seed. OpenAI, Google, Mistral, Groq, HuggingFace, and vLLM only."""

    do_sample: Optional[bool] = Field(default=None)
    """Whether to use sampling; use greedy decoding otherwise. Only transformers models support this parameter."""

    top_k: Optional[int] = Field(default=None)
    """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, vLLM, and SGLang only."""

    n: Optional[int] = Field(default=None)
    """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, vLLM, and SGLang only."""

    logprobs: Optional[bool] = Field(default=None)
    """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, vLLM, and SGLang only."""

    top_logprobs: Optional[int] = Field(default=None)
    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, vLLM, and SGLang only."""

    parallel_tool_calls: Optional[bool] = Field(default=None)
    """Whether to enable parallel function calling during tool use (defaults to True). OpenAI and Groq only."""

    internal_tools: Optional[bool] = Field(default=None)
    """Whether to automatically map tools to model internal implementations (e.g. 'computer' for anthropic)."""

    max_tool_output: Optional[int] = Field(default=None)
    """Maximum tool output (in bytes). Defaults to 16 * 1024."""

    cache_prompt: Union[Literal['auto'], bool, None] = Field(default=None)
    """Whether to cache the prompt prefix. Defaults to "auto", which will enable caching for requests with tools. Anthropic only."""

    reasoning_effort: Optional[Literal['low', 'medium', 'high']] = Field(default=None)
    """Constrains effort on reasoning for reasoning models (defaults to `medium`). Open AI o1 models only."""

    reasoning_tokens: Optional[int] = Field(default=None)
    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""

    reasoning_summary: Optional[Literal['concise', 'detailed', 'auto']] = Field(default=None)
    """Provide summary of reasoning steps (defaults to no summary). Use 'auto' to access the most detailed summarizer available for the current model. OpenAI reasoning models only."""

    reasoning_history: Optional[Literal['none', 'all', 'last', 'auto']] = Field(default=None)
    """Include reasoning in chat message history sent to generate."""

    response_schema: Optional[ResponseSchema] = Field(default=None)
    """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and Mistral only."""

    extra_body: Optional[Dict[str, Any]] = Field(default=None)
    """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""

    extra_query: Optional[Dict[str, Any]] = Field(default=None)
    """Extra query parameters to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""

    extra_headers: Optional[Dict[str, str]] = Field(default=None)
    """Extra headers to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""

    height: Optional[int] = Field(default=None)
    """Image height for image generation model only"""

    width: Optional[int] = Field(default=None)
    """Image width for image generation model only"""

    num_inference_steps: Optional[int] = Field(default=None)
    """Number of inference steps for image generation model only"""

    guidance_scale: Optional[float] = Field(default=None)
    """Guidance scale for image generation model only"""

    # migrate reasoning_history as a bool
    @model_validator(mode='before')
    @classmethod
    def migrate_reasoning(cls, data: Any) -> Any:
        if isinstance(data, dict):
            reasoning_history = data.get('reasoning_history', None)
            if reasoning_history is True:
                data['reasoning_history'] = 'all'
            elif reasoning_history is False:
                data['reasoning_history'] = 'none'

        return data

    def merge(self, other: 'GenerateConfig') -> 'GenerateConfig':
        """Merge another model configuration into this one.

        Args:
            other (GenerateConfig):
                Configuration to merge.

        Returns:
            Merged configuration.
        """
        config_keys = [field for field in self.__class__.model_fields.keys()]
        config = deepcopy(self)
        for key in config_keys:
            value = getattr(other, key, None)
            if value is not None:
                setattr(config, key, value)
        return config
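For readers skimming the diff, a minimal usage sketch (not part of the package diff) of the merge and migration behavior defined above; the import path follows the file location listed in this release, and the field values are arbitrary:

# Sketch only: illustrates GenerateConfig semantics from the file above.
from evalscope.api.model.generate_config import GenerateConfig

base = GenerateConfig(temperature=0.0, max_tokens=512)
override = GenerateConfig(temperature=0.7)  # fields left as None do not overwrite

merged = base.merge(override)
assert merged.temperature == 0.7   # taken from `other` because it is not None
assert merged.max_tokens == 512    # kept from `self` because `other.max_tokens` is None

# migrate_reasoning (mode='before') converts legacy boolean values
legacy = GenerateConfig(reasoning_history=True)
assert legacy.reasoning_history == 'all'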
evalscope/api/model/model.py (new file)

@@ -0,0 +1,386 @@
import abc
from pydantic_core import to_jsonable_python
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Optional, Sequence, Union

from evalscope.api.messages import ChatMessage, ChatMessageAssistant, ChatMessageSystem, ChatMessageUser
from evalscope.api.registry import get_model_api
from evalscope.api.tool import ToolChoice, ToolFunction, ToolInfo
from evalscope.utils import get_logger
from evalscope.utils.function_utils import thread_safe
from .generate_config import GenerateConfig
from .model_output import ModelOutput

if TYPE_CHECKING:
    from evalscope.config import TaskConfig

logger = get_logger()


class ModelAPI(abc.ABC):
    """Model API provider."""

    def __init__(
        self,
        model_name: str,
        base_url: Optional[str] = None,
        api_key: Optional[str] = None,
        config: GenerateConfig = GenerateConfig(),
        **kwargs
    ) -> None:
        """Create a model API provider.

        Args:
            model_name (str): Model name.
            base_url (str | None): Alternate base URL for model.
            api_key (str | None): API key for model.
            api_key_vars (list[str]): Environment variables that
                may contain keys for this provider (used for override)
            config (GenerateConfig): Model configuration.
        """
        self.model_name = model_name
        self.base_url = base_url
        self.api_key = api_key
        self.config = config

    @abc.abstractmethod
    def generate(
        self,
        input: List[ChatMessage],
        tools: List[ToolInfo],
        tool_choice: ToolChoice,
        config: GenerateConfig,
    ) -> ModelOutput:
        """Generate output from the model.

        Args:
            input (str | list[ChatMessage]): Chat message
                input (if a `str` is passed it is converted
                to a `ChatUserMessage`).
            tools (list[ToolInfo]): Tools available for the
                model to call.
            tool_choice (ToolChoice): Directives to the model
                as to which tools to prefer.
            config (GenerateConfig): Model configuration.

        Returns:
            ModelOutput
        """
        ...

    def batch_generate(
        self,
        inputs: List[List[ChatMessage]],
        tools: List[List[ToolInfo]],
        tool_choices: List[ToolChoice],
        configs: List[GenerateConfig],
    ) -> Generator[ModelOutput, None, None]:
        """Default batch implementation using individual generate calls.

        ModelAPI implementations can override this for optimized batch processing.

        Args:
            inputs: List of preprocessed chat message inputs.
            tools: List of tools for each input.
            tool_choices: List of tool choices for each input.
            configs: List of configs for each input.

        Returns:
            Generator yielding ModelOutput for each input.
        """
        from concurrent.futures import ThreadPoolExecutor

        def single_generate(args):
            input_msgs, input_tools, tool_choice, config = args
            return self.generate(input_msgs, input_tools, tool_choice, config)

        with ThreadPoolExecutor(max_workers=self.config.batch_size) as executor:
            futures = []
            for input_msgs, input_tools, tool_choice, config in zip(inputs, tools, tool_choices, configs):
                future = executor.submit(single_generate, (input_msgs, input_tools, tool_choice, config))
                futures.append(future)

            for future in futures:
                yield future.result()

    def supports_batch(self) -> bool:
        """Whether this ModelAPI supports optimized batch processing."""
        return False

    def max_tokens(self) -> Optional[int]:
        """Default max_tokens."""
        return None

    def max_tokens_for_config(self, config: GenerateConfig) -> Optional[int]:
        """Default max_tokens for a given config.

        Args:
            config: Generation config.

        Returns:
            Default maximum tokens for specified configuration.
        """
        return None

    def tools_required(self) -> bool:
        """Any tool use in a message stream means that tools must be passed."""
        return False

    def tool_result_images(self) -> bool:
        """Tool results can contain images"""
        return False


class Model:
    """Model interface.

    Use `get_model()` to get an instance of a model.
    """

    api: ModelAPI
    """Model API."""

    config: GenerateConfig
    """Generation config."""

    def __init__(self, api: ModelAPI, config: GenerateConfig, model_args: Dict[str, Any] = {}) -> None:
        """Create a model.

        Args:
            api: Model API provider.
            config: Model configuration.
            model_args: Optional model args
        """
        self.api = api
        self.config = config
        self.model_args = model_args

    @property
    def name(self) -> str:
        """Model name or path to model."""
        return self.api.model_name

    @property
    def role(self) -> Optional[str]:
        """Model role."""
        return self._role

    @role.setter
    def role(self, role: str) -> None:
        self._role = role

    def __str__(self) -> str:
        return f'Model(name={self.model_id}, role={self.role})'

    def generate(
        self,
        input: Union[str, List[ChatMessage]],
        tools: Optional[Sequence[ToolInfo]] = None,
        tool_choice: Optional[ToolChoice] = None,
        config: Optional[GenerateConfig] = None,
    ) -> ModelOutput:
        """Generate output from the model.

        Args:
            input: Chat message input (if a `str` is passed it is converted
                to a `ChatMessageUser`).
            tools: Tools available for the model to call.
            tool_choice: Directives to the model as to which tools to prefer.
            config: Model configuration.

        Returns:
            ModelOutput
        """
        processed_input, processed_tools, processed_tool_choice, processed_config = self._preprocess_input(
            input, tools, tool_choice, config
        )

        # Call the model's generate method
        output = self.api.generate(
            input=processed_input,
            tools=processed_tools,
            tool_choice=processed_tool_choice,
            config=processed_config,
        )

        # return output
        return output

    def batch_generate(
        self,
        inputs: List[List[ChatMessage]],
        tools: List[List[ToolInfo]],
        tool_choices: List[ToolChoice],
        configs: List[GenerateConfig],
    ) -> Generator[ModelOutput, None, None]:
        """Generate output from the model for a batch of inputs.

        Args:
            inputs (List[List[ChatMessage]]): Batch of chat message inputs.
            tools (List[List[ToolInfo]]): Batch of tools for each input.
            tool_choices (List[ToolChoice]): Batch of tool choices for each input.
            configs (List[GenerateConfig]): Batch of configs for each input.
        """
        preprocessed_data = []

        for input_item, input_tools, input_tool_choice, input_config in zip(inputs, tools, tool_choices, configs):
            processed_input, processed_tools, processed_tool_choice, processed_config = self._preprocess_input(
                input=input_item, tools=input_tools, tool_choice=input_tool_choice, config=input_config
            )
            preprocessed_data.append((processed_input, processed_tools, processed_tool_choice, processed_config))

        # check if ModelAPI supports batch processing
        if self.api.supports_batch() and len(preprocessed_data) > 1:
            # use the batch_generate method of the ModelAPI
            inputs, tools, tool_choices, configs = zip(*preprocessed_data)
            batch_results = self.api.batch_generate(
                inputs=list(inputs), tools=list(tools), tool_choices=list(tool_choices), configs=list(configs)
            )
            for result in batch_results:
                yield result
        else:
            # fall back to processing each input individually
            for input_msgs, input_tools, tool_choice, config in preprocessed_data:
                result = self.api.generate(input_msgs, input_tools, tool_choice, config)
                yield result

    def _preprocess_input(
        self,
        input: Union[str, List[ChatMessage]],
        tools: Optional[Sequence[ToolInfo]] = None,
        tool_choice: Optional[ToolChoice] = None,
        config: Optional[GenerateConfig] = None,
    ) -> tuple[List[ChatMessage], List[ToolInfo], ToolChoice, GenerateConfig]:
        """pre process input for generate."""

        # merge passed config
        if config is not None:
            config = self.config.merge(config)
        else:
            config = self.config.model_copy(deep=True)

        # provide max_tokens from the model api if required
        if config.max_tokens is None:
            config.max_tokens = self.api.max_tokens_for_config(config)
            if config.max_tokens is None:
                config.max_tokens = self.api.max_tokens()

        # normalize input to chat
        if isinstance(input, str):
            input = [ChatMessageUser(content=input)]

        # handle tools and tool_choice
        tool_choice = tool_choice if tool_choice is not None else 'auto'
        tools_info = list(tools) if tools is not None else []

        if isinstance(tool_choice, ToolFunction):
            tools_info = [tool for tool in tools_info if tool.name == tool_choice.name]

        if tool_choice == 'none' or len(tools_info) == 0:
            if not self.api.tools_required():
                tools_info = []
            tool_choice = 'none'

        return input, tools_info, tool_choice, config


class ModelCache:
    _models: Dict[str, 'Model'] = {}

    @classmethod
    def get(cls, key: str) -> Optional['Model']:
        return cls._models.get(key, None)

    @classmethod
    def set(cls, key: str, model: 'Model') -> None:
        cls._models[key] = model


def get_model_with_task_config(task_config: 'TaskConfig') -> Model:
    """Get an instance of a model with the specified task configuration.

    Args:
        task_config (TaskConfig): Task configuration.

    Returns:
        Model: An instance of the model.
    """
    model = task_config.model
    eval_type = task_config.eval_type
    base_url = task_config.api_url
    api_key = task_config.api_key
    config = task_config.generation_config
    model_args = task_config.model_args or {}

    return get_model(
        model=model, eval_type=eval_type, base_url=base_url, api_key=api_key, config=config, model_args=model_args
    )


@thread_safe
def get_model(
    model: Union[str, Model, ModelAPI],
    eval_type: str,
    base_url: Optional[str] = None,
    api_key: Optional[str] = None,
    config: GenerateConfig = GenerateConfig(),
    model_args: dict = {},
    role: Optional[str] = None,
    memoize: bool = True,
) -> Model:
    """Get an instance of a model.

    Calls to get_model() are memoized (i.e. a call with the same arguments
    will return an existing instance of the model rather than creating a
    new one). You can disable this with `memoize=False`.

    Args:
        task_config (TaskConfig): Task configuration.
        memoize (bool): Whether to memoize the model instance.

    Returns:
        Model instance.
    """

    # start with seeing if a model was passed
    if isinstance(model, Model):
        return model

    if isinstance(model, ModelAPI):
        return Model(model, config, model_args)

    # see if we can return a memoized model instance
    # (exclude mockllm since custom_outputs is an infinite generator)
    model_cache_key: str = ''
    if eval_type.startswith('mock_llm'):
        memoize = False
    if memoize:
        model_cache_key = (
            model + str(role) + config.model_dump_json(exclude_none=True) + str(base_url) + str(api_key)
            + str(to_jsonable_python(model_args, fallback=lambda _: None))
        )
        cached = ModelCache.get(model_cache_key)
        if cached is not None:
            return cached

    logger.info(
        f'Creating model {model} with eval_type={eval_type} '
        f'base_url={base_url}, config={config.model_dump(exclude_none=True)}, model_args={model_args}'
    )

    # find a matching model type
    modelapi_type = get_model_api(eval_type)

    modelapi_instance = modelapi_type(
        model_name=model,
        base_url=base_url,
        api_key=api_key,
        config=config,
        **model_args,
    )
    m = Model(modelapi_instance, config, model_args)
    if role is not None:
        m.role = role
    if memoize:
        ModelCache.set(model_cache_key, m)
    return m
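As a rough illustration of how these pieces compose (this sketch is not part of the diff: `EchoAPI` is a made-up provider and the no-argument `ModelOutput()` construction is an assumption, since `model_output.py` is not reproduced here), a custom `ModelAPI` can be wrapped directly by `Model`, with string prompts normalized to `ChatMessageUser` by `_preprocess_input` before the provider is called:

# Illustrative sketch only: wiring a hypothetical ModelAPI subclass into Model.
from typing import List

from evalscope.api.messages import ChatMessage
from evalscope.api.model.generate_config import GenerateConfig
from evalscope.api.model.model import Model, ModelAPI
from evalscope.api.model.model_output import ModelOutput  # fields/defaults assumed
from evalscope.api.tool import ToolChoice, ToolInfo


class EchoAPI(ModelAPI):
    """Hypothetical provider used only to show the extension point."""

    def generate(self, input: List[ChatMessage], tools: List[ToolInfo],
                 tool_choice: ToolChoice, config: GenerateConfig) -> ModelOutput:
        # Assumes ModelOutput can be constructed with defaults; real fields live in model_output.py.
        return ModelOutput()


# Model._preprocess_input merges configs, fills max_tokens from the API defaults,
# and converts a plain string into [ChatMessageUser(content=...)] before delegating.
model = Model(EchoAPI('echo-model'), GenerateConfig(max_tokens=64))
output = model.generate('hello')

Passing the same `EchoAPI` instance to `get_model` would wrap it the same way, while string model names go through `get_model_api(eval_type)` and the `ModelCache` memoization path shown above.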