evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
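Among the additions is a new OpenAI-compatibility module, evalscope/models/utils/openai.py (+708 -0), whose diff is reproduced below. It converts between evalscope's internal ChatMessage/ModelOutput types and the openai SDK's request/response types. As a rough illustration only (a minimal sketch, not part of the diff, assuming the module paths match the file paths listed above), the conversion helpers shown below could be used to round-trip messages like this:

    from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
    from evalscope.models.utils.openai import chat_messages_from_openai, openai_chat_messages

    # Build evalscope-native chat messages.
    messages = [
        ChatMessageSystem(content='You are a concise assistant.'),
        ChatMessageUser(content='What is 2 + 2?'),
    ]

    # Convert to OpenAI ChatCompletionMessageParam dicts (what an
    # OpenAI-compatible endpoint expects), then back into ChatMessage objects.
    openai_params = openai_chat_messages(messages, system_role='system')
    restored = chat_messages_from_openai(model='my-model', messages=openai_params)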
evalscope/models/utils/openai.py
@@ -0,0 +1,708 @@
1
+ import base64
2
+ import json
3
+ import re
4
+ from collections import defaultdict
5
+ from copy import copy
6
+ from openai import APIStatusError, OpenAIError
7
+ from openai.types.chat import (
8
+ ChatCompletion,
9
+ ChatCompletionAssistantMessageParam,
10
+ ChatCompletionChunk,
11
+ ChatCompletionContentPartImageParam,
12
+ ChatCompletionContentPartInputAudioParam,
13
+ ChatCompletionContentPartParam,
14
+ ChatCompletionContentPartRefusalParam,
15
+ ChatCompletionContentPartTextParam,
16
+ ChatCompletionDeveloperMessageParam,
17
+ ChatCompletionMessage,
18
+ ChatCompletionMessageParam,
19
+ ChatCompletionMessageToolCall,
20
+ ChatCompletionMessageToolCallParam,
21
+ ChatCompletionNamedToolChoiceParam,
22
+ ChatCompletionSystemMessageParam,
23
+ ChatCompletionToolChoiceOptionParam,
24
+ ChatCompletionToolMessageParam,
25
+ ChatCompletionToolParam,
26
+ ChatCompletionUserMessageParam,
27
+ )
28
+ from openai.types.chat.chat_completion import Choice, ChoiceLogprobs
29
+ from openai.types.chat.chat_completion_message_tool_call import Function
30
+ from openai.types.completion_usage import CompletionUsage
31
+ from openai.types.shared_params.function_definition import FunctionDefinition
32
+ from pydantic import JsonValue
33
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
34
+
35
+ from evalscope.api.messages import (
36
+ ChatMessage,
37
+ ChatMessageAssistant,
38
+ ChatMessageSystem,
39
+ ChatMessageTool,
40
+ ChatMessageUser,
41
+ Content,
42
+ ContentAudio,
43
+ ContentImage,
44
+ ContentReasoning,
45
+ ContentText,
46
+ parse_content_with_reasoning,
47
+ )
48
+ from evalscope.api.model import (
49
+ ChatCompletionChoice,
50
+ GenerateConfig,
51
+ Logprobs,
52
+ ModelOutput,
53
+ ModelUsage,
54
+ StopReason,
55
+ as_stop_reason,
56
+ )
57
+ from evalscope.api.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo, parse_tool_call
58
+ from evalscope.utils.url_utils import file_as_data_uri, is_http_url
59
+
60
+ BASE_64_DATA_REMOVED = '<base64-data-removed>'
61
+
62
+
63
+ class OpenAIResponseError(OpenAIError):
64
+
65
+ def __init__(self, code: str, message: str) -> None:
66
+ self.code = code
67
+ self.message = message
68
+
69
+ def __str__(self) -> str:
70
+ return f'{self.code}: {self.message}'
71
+
72
+
73
+ def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
74
+ return ChatCompletionMessageToolCall(
75
+ type='function',
76
+ id=tool_call.id,
77
+ function=Function(name=tool_call.function.name, arguments=json.dumps(tool_call.function.arguments)),
78
+ )
79
+
80
+
81
+ def openai_chat_tool_call_param(tool_call: ToolCall) -> ChatCompletionMessageToolCallParam:
82
+ return ChatCompletionMessageToolCallParam(
83
+ id=tool_call.id,
84
+ function=dict(name=tool_call.function.name, arguments=json.dumps(tool_call.function.arguments)),
85
+ type='function',
86
+ )
87
+
88
+
89
+ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartParam:
90
+ if content.type == 'text':
91
+ return ChatCompletionContentPartTextParam(type='text', text=content.text)
92
+ elif content.type == 'image':
93
+ # API takes URL or base64 encoded file. If it's a remote file or
94
+ # data URL leave it alone, otherwise encode it
95
+ image_url = content.image
96
+ detail = content.detail
97
+
98
+ if not is_http_url(image_url):
99
+ image_url = file_as_data_uri(image_url)
100
+
101
+ return ChatCompletionContentPartImageParam(
102
+ type='image_url',
103
+ image_url=dict(url=image_url, detail=detail),
104
+ )
105
+ elif content.type == 'audio':
106
+ audio_data_uri = file_as_data_uri(content.audio)
107
+
108
+ return ChatCompletionContentPartInputAudioParam(
109
+ type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
110
+ )
111
+
112
+ else:
113
+ raise RuntimeError('Video content is not currently supported by Open AI chat models.')
114
+
115
+
116
+ def openai_chat_message(
117
+ message: ChatMessage, system_role: Literal['user', 'system', 'developer'] = 'system'
118
+ ) -> ChatCompletionMessageParam:
119
+ if message.role == 'system':
120
+ if system_role == 'user':
121
+ return ChatCompletionUserMessageParam(role='user', content=message.text)
122
+ elif system_role == 'system':
123
+ return ChatCompletionSystemMessageParam(role=message.role, content=message.text)
124
+ elif system_role == 'developer':
125
+ return ChatCompletionDeveloperMessageParam(role='developer', content=message.text)
126
+ elif message.role == 'user':
127
+ return ChatCompletionUserMessageParam(
128
+ role=message.role,
129
+ content=(
130
+ message.content if isinstance(message.content, str) else
131
+ [openai_chat_completion_part(content) for content in message.content]
132
+ ),
133
+ )
134
+ elif message.role == 'assistant':
135
+ if message.tool_calls:
136
+ return ChatCompletionAssistantMessageParam(
137
+ role=message.role,
138
+ content=openai_assistant_content(message),
139
+ tool_calls=[openai_chat_tool_call_param(call) for call in message.tool_calls],
140
+ )
141
+ else:
142
+ return ChatCompletionAssistantMessageParam(role=message.role, content=openai_assistant_content(message))
143
+ elif message.role == 'tool':
144
+ return ChatCompletionToolMessageParam(
145
+ role=message.role,
146
+ content=(f'Error: {message.error.message}' if message.error else message.text),
147
+ tool_call_id=str(message.tool_call_id),
148
+ )
149
+ else:
150
+ raise ValueError(f'Unexpected message role {message.role}')
151
+
152
+
153
+ def openai_chat_messages(
154
+ messages: List[ChatMessage],
155
+ system_role: Literal['user', 'system', 'developer'] = 'system',
156
+ ) -> List[ChatCompletionMessageParam]:
157
+ return [openai_chat_message(message, system_role) for message in messages]
158
+
159
+
160
+ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) -> Dict[str, Any]:
161
+ params: Dict[str, Any] = dict(model=model)
162
+ # handle stream option
163
+ if config.stream is not None:
164
+ params['stream'] = config.stream
165
+ if config.stream:
166
+ params['stream_options'] = {'include_usage': True}
167
+ if config.timeout is not None:
168
+ params['timeout'] = config.timeout
169
+ if config.max_tokens is not None:
170
+ params['max_tokens'] = config.max_tokens
171
+ if config.frequency_penalty is not None:
172
+ params['frequency_penalty'] = config.frequency_penalty
173
+ if config.stop_seqs is not None:
174
+ params['stop'] = config.stop_seqs
175
+ if config.presence_penalty is not None:
176
+ params['presence_penalty'] = config.presence_penalty
177
+ if config.repetition_penalty is not None:
178
+ params['repetition_penalty'] = config.repetition_penalty
179
+ if config.logit_bias is not None:
180
+ params['logit_bias'] = config.logit_bias
181
+ if config.seed is not None:
182
+ params['seed'] = config.seed
183
+ if config.temperature is not None:
184
+ params['temperature'] = config.temperature
185
+ if config.top_p is not None:
186
+ params['top_p'] = config.top_p
187
+ if config.top_k is not None:
188
+ params['top_k'] = config.top_k
189
+ if config.n is not None:
190
+ params['n'] = config.n
191
+ if config.logprobs is not None:
192
+ params['logprobs'] = config.logprobs
193
+ if config.top_logprobs is not None:
194
+ params['top_logprobs'] = config.top_logprobs
195
+ if tools and config.parallel_tool_calls is not None:
196
+ params['parallel_tool_calls'] = config.parallel_tool_calls
197
+ if config.reasoning_effort is not None:
198
+ params['reasoning_effort'] = config.reasoning_effort
199
+ if config.response_schema is not None:
200
+ params['response_format'] = dict(
201
+ type='json_schema',
202
+ json_schema=dict(
203
+ name=config.response_schema.name,
204
+ schema=config.response_schema.json_schema.model_dump(exclude_none=True),
205
+ description=config.response_schema.description,
206
+ strict=config.response_schema.strict,
207
+ ),
208
+ )
209
+ if config.extra_body:
210
+ params['extra_body'] = config.extra_body
211
+ if config.extra_query:
212
+ params['extra_query'] = config.extra_query
213
+ if config.extra_headers:
214
+ params['extra_headers'] = config.extra_headers
215
+
216
+ return params
217
+
218
+
219
+ def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
220
+ # In agent bridge scenarios, we could encounter concepts such as reasoning and
221
+ # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
222
+ # choices API. This code smuggles that data into the plain text so that it
223
+ # survives multi-turn round trips.
224
+
225
+ if isinstance(message.content, str):
226
+ content = message.content
227
+ else:
228
+ content = ''
229
+ for c in message.content:
230
+ if c.type == 'reasoning' and include_reasoning:
231
+ attribs = ''
232
+ if c.signature is not None:
233
+ attribs = f'{attribs} signature="{c.signature}"'
234
+ if c.redacted:
235
+ attribs = f'{attribs} redacted="true"'
236
+ content = f'{content}\n<think{attribs}>\n{c.reasoning}\n</think>\n'
237
+ elif c.type == 'text':
238
+ content = f'{content}\n{c.text}'
239
+
240
+ if message.internal:
241
+ content = f"""{content}\n<internal>{
242
+ base64.b64encode(json.dumps(message.internal).encode("utf-8")).decode(
243
+ "utf-8"
244
+ )
245
+ }</internal>\n"""
246
+ return content
247
+
248
+
249
+ def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
250
+ oai_choices: List[Choice] = []
251
+
252
+ for index, choice in enumerate(choices):
253
+ # Handle content
254
+ content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
255
+
256
+ # Handle tool calls
257
+ if choice.message.tool_calls:
258
+ tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
259
+ else:
260
+ tool_calls = None
261
+ message = ChatCompletionMessage(role='assistant', content=content, tool_calls=tool_calls)
262
+ oai_choices.append(
263
+ Choice(
264
+ finish_reason=openai_finish_reason(choice.stop_reason),
265
+ index=index,
266
+ message=message,
267
+ logprobs=ChoiceLogprobs(**choice.logprobs.model_dump()) if choice.logprobs is not None else None,
268
+ )
269
+ )
270
+
271
+ return oai_choices
272
+
273
+
274
+ def openai_completion_usage(usage: ModelUsage) -> CompletionUsage:
275
+ return CompletionUsage(
276
+ completion_tokens=usage.output_tokens,
277
+ prompt_tokens=usage.input_tokens,
278
+ total_tokens=usage.total_tokens,
279
+ )
280
+
281
+
282
+ def openai_finish_reason(
283
+ stop_reason: StopReason
284
+ ) -> Literal['stop', 'length', 'tool_calls', 'content_filter', 'function_call']:
285
+ if stop_reason in ('stop', 'tool_calls', 'content_filter'):
286
+ return stop_reason
287
+ elif stop_reason == 'model_length':
288
+ return 'length'
289
+ else:
290
+ return 'stop'
291
+
292
+
293
+ def openai_chat_tool_param(tool: ToolInfo) -> ChatCompletionToolParam:
294
+ function = FunctionDefinition(
295
+ name=tool.name,
296
+ description=tool.description,
297
+ parameters=tool.parameters.model_dump(exclude_none=True),
298
+ )
299
+ return ChatCompletionToolParam(type='function', function=function)
300
+
301
+
302
+ def openai_chat_tools(tools: List[ToolInfo]) -> List[ChatCompletionToolParam]:
303
+ return [openai_chat_tool_param(tool) for tool in tools]
304
+
305
+
306
+ def openai_chat_tool_choice(tool_choice: ToolChoice, ) -> ChatCompletionToolChoiceOptionParam:
307
+ if isinstance(tool_choice, ToolFunction):
308
+ return ChatCompletionNamedToolChoiceParam(type='function', function=dict(name=tool_choice.name))
309
+ # openai supports 'any' via the 'required' keyword
310
+ elif tool_choice == 'any':
311
+ return 'required'
312
+ else:
313
+ return tool_choice
314
+
315
+
316
+ def chat_tool_calls_from_openai(message: ChatCompletionMessage, tools: List[ToolInfo]) -> Optional[List[ToolCall]]:
317
+ if message.tool_calls:
318
+ return [
319
+ parse_tool_call(call.id, call.function.name, call.function.arguments, tools) for call in message.tool_calls
320
+ ]
321
+ else:
322
+ return None
323
+
324
+
325
+ def chat_messages_from_openai(
326
+ model: str,
327
+ messages: List[ChatCompletionMessageParam],
328
+ ) -> List[ChatMessage]:
329
+ # track tool names by id
330
+ tool_names: Dict[str, str] = {}
331
+
332
+ chat_messages: List[ChatMessage] = []
333
+
334
+ for message in messages:
335
+ content: Union[str, List[Content]] = []
336
+ if message['role'] == 'system' or message['role'] == 'developer':
337
+ sys_content = message['content']
338
+ if isinstance(sys_content, str):
339
+ chat_messages.append(ChatMessageSystem(content=sys_content))
340
+ else:
341
+ content = []
342
+ for sc in sys_content:
343
+ content.extend(content_from_openai(sc))
344
+ chat_messages.append(ChatMessageSystem(content=content))
345
+ elif message['role'] == 'user':
346
+ user_content = message['content']
347
+ if isinstance(user_content, str):
348
+ chat_messages.append(ChatMessageUser(content=user_content))
349
+ else:
350
+ content = []
351
+ for uc in user_content:
352
+ content.extend(content_from_openai(uc))
353
+ chat_messages.append(ChatMessageUser(content=content))
354
+ elif message['role'] == 'assistant':
355
+ # resolve content
356
+ refusal: Optional[Literal[True]] = None
357
+ internal: Optional[JsonValue] = None
358
+ asst_content = message.get('content', None)
359
+ if isinstance(asst_content, str):
360
+ # Even though the choices API doesn't take advantage of .internal,
361
+ # we could be transforming from OpenAI choices to Inspect for agent
362
+ # bridge scenarios where a different model (that does use .internal)
363
+ # is the actual model being used.
364
+ asst_content, internal = _parse_content_with_internal(asst_content)
365
+ asst_content, smuggled_reasoning = parse_content_with_reasoning(asst_content)
366
+ if smuggled_reasoning:
367
+ content = [
368
+ smuggled_reasoning,
369
+ ContentText(text=asst_content),
370
+ ]
371
+ else:
372
+ content = asst_content
373
+ elif asst_content is None:
374
+ content = message.get('refusal', None) or ''
375
+ if content:
376
+ refusal = True
377
+ else:
378
+ content = []
379
+ for ac in asst_content:
380
+ content.extend(content_from_openai(ac, parse_reasoning=True))
381
+
382
+ # resolve reasoning (OpenAI doesn't suport this however OpenAI-compatible
383
+ # interfaces e.g. DeepSeek do include this field so we pluck it out)
384
+ reasoning = message.get('reasoning_content', None) or message.get('reasoning', None)
385
+ if reasoning is not None:
386
+ # normalize content to an array
387
+ if isinstance(content, str):
388
+ content = [ContentText(text=content, refusal=refusal)]
389
+
390
+ # insert reasoning
391
+ content.insert(0, ContentReasoning(reasoning=str(reasoning)))
392
+
393
+ # return message
394
+ if 'tool_calls' in message:
395
+ tool_calls: List[ToolCall] = []
396
+ for call in message['tool_calls']:
397
+ tool_calls.append(tool_call_from_openai(call))
398
+ tool_names[call['id']] = call['function']['name']
399
+
400
+ else:
401
+ tool_calls = []
402
+
403
+ chat_messages.append(
404
+ ChatMessageAssistant(
405
+ content=content,
406
+ tool_calls=tool_calls or None,
407
+ model=model,
408
+ source='generate',
409
+ internal=internal,
410
+ )
411
+ )
412
+ elif message['role'] == 'tool':
413
+ tool_content = message.get('content', None) or ''
414
+ if isinstance(tool_content, str):
415
+ # If tool_content is a simple str, it could be the result of some
416
+ # sub-agent tool call that has <think> or <internal> smuggled inside
417
+ # of it to support agent bridge scenarios. We have to strip that
418
+ # data. To be clear, if it's <think>, we'll strip the <think> tag,
419
+ # but the reasoning summary itself will remain in the content.
420
+ content, _ = _parse_content_with_internal(tool_content)
421
+ content, _ = parse_content_with_reasoning(content)
422
+ else:
423
+ content = []
424
+ for tc in tool_content:
425
+ content.extend(content_from_openai(tc))
426
+ chat_messages.append(
427
+ ChatMessageTool(
428
+ content=content,
429
+ tool_call_id=message['tool_call_id'],
430
+ function=tool_names.get(message['tool_call_id'], ''),
431
+ )
432
+ )
433
+ else:
434
+ raise ValueError(f'Unexpected message param type: {type(message)}')
435
+
436
+ return chat_messages
437
+
438
+
439
+ def tool_call_from_openai(tool_call: ChatCompletionMessageToolCallParam) -> ToolCall:
440
+ return parse_tool_call(
441
+ tool_call['id'],
442
+ tool_call['function']['name'],
443
+ tool_call['function']['arguments'],
444
+ )
445
+
446
+
447
+ def content_from_openai(
448
+ content: Union[ChatCompletionContentPartParam, ChatCompletionContentPartRefusalParam],
449
+ parse_reasoning: bool = False,
450
+ ) -> List[Content]:
451
+ # Some providers omit the type tag and use "object-with-a-single-field" encoding
452
+ if 'type' not in content and len(content) == 1:
453
+ content['type'] = list(content.keys())[0] # type: ignore[arg-type]
454
+ if content['type'] == 'text':
455
+ text = content['text']
456
+ if parse_reasoning:
457
+ content_text, content_reasoning = parse_content_with_reasoning(text)
458
+ if content_reasoning:
459
+ return [
460
+ content_reasoning,
461
+ ContentText(text=content_text),
462
+ ]
463
+ else:
464
+ return [ContentText(text=text)]
465
+ else:
466
+ return [ContentText(text=text)]
467
+ elif content['type'] == 'reasoning': # type: ignore[comparison-overlap]
468
+ return [ContentReasoning(reasoning=content['reasoning'])]
469
+ elif content['type'] == 'image_url':
470
+ return [ContentImage(image=content['image_url']['url'], detail=content['image_url']['detail'])]
471
+ elif content['type'] == 'input_audio':
472
+ return [ContentAudio(
473
+ audio=content['input_audio']['data'],
474
+ format=content['input_audio']['format'],
475
+ )]
476
+ elif content['type'] == 'refusal':
477
+ return [ContentText(text=content['refusal'], refusal=True)]
478
+ else:
479
+ content_type = content['type']
480
+ raise ValueError(f"Unexpected content type '{content_type}' in message.")
481
+
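A rough usage sketch for the part normalization above; the expected results are shown as comments and assume content_from_openai is importable from this module (exact path not shown in this hunk):

# A typed text part whose text carries a <think> block, and an untyped
# single-field part that relies on the "object-with-a-single-field" encoding.
part_text = {'type': 'text', 'text': '<think>weighing options</think>Paris'}
part_refusal = {'refusal': 'I cannot help with that'}

# content_from_openai(part_text, parse_reasoning=True)
#   -> roughly [ContentReasoning(reasoning='weighing options'), ContentText(text='Paris')]
# content_from_openai(part_refusal)
#   -> [ContentText(text='I cannot help with that', refusal=True)]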
482
+
483
+ def chat_message_assistant_from_openai(
484
+ model: str, message: ChatCompletionMessage, tools: List[ToolInfo]
485
+ ) -> ChatMessageAssistant:
486
+ refusal = getattr(message, 'refusal', None)
487
+ reasoning = getattr(message, 'reasoning_content', None) or getattr(message, 'reasoning', None)
488
+
489
+ msg_content = refusal or message.content or ''
490
+ if reasoning is not None:
491
+ content: Union[str, List[Content]] = [
492
+ ContentReasoning(reasoning=str(reasoning)),
493
+ ContentText(text=msg_content, refusal=True if refusal else None),
494
+ ]
495
+ elif refusal is not None:
496
+ content = [ContentText(text=msg_content, refusal=True)]
497
+ else:
498
+ content = msg_content
499
+
500
+ return ChatMessageAssistant(
501
+ content=content,
502
+ model=model,
503
+ source='generate',
504
+ tool_calls=chat_tool_calls_from_openai(message, tools),
505
+ )
506
+
507
+
508
+ def model_output_from_openai(
509
+ completion: ChatCompletion,
510
+ choices: List[ChatCompletionChoice],
511
+ ) -> ModelOutput:
512
+ return ModelOutput(
513
+ model=completion.model,
514
+ choices=choices,
515
+ usage=(
516
+ ModelUsage(
517
+ input_tokens=completion.usage.prompt_tokens,
518
+ output_tokens=completion.usage.completion_tokens,
519
+ input_tokens_cache_read=(
520
+ completion.usage.prompt_tokens_details.cached_tokens if completion.usage.prompt_tokens_details
521
+ is not None else None # OpenAI only has cache-read stats/pricing.
522
+ ),
523
+ reasoning_tokens=(
524
+ completion.usage.completion_tokens_details.reasoning_tokens
525
+ if completion.usage.completion_tokens_details is not None else None
526
+ ),
527
+ total_tokens=completion.usage.total_tokens,
528
+ ) if completion.usage else None
529
+ ),
530
+ )
531
+
532
+
533
+ def chat_choices_from_openai(response: ChatCompletion, tools: List[ToolInfo]) -> List[ChatCompletionChoice]:
534
+ choices = list(response.choices)
535
+ choices.sort(key=lambda c: c.index)
536
+ return [
537
+ ChatCompletionChoice(
538
+ message=chat_message_assistant_from_openai(response.model, choice.message, tools),
539
+ stop_reason=as_stop_reason(choice.finish_reason),
540
+ logprobs=(
541
+ Logprobs(**choice.logprobs.model_dump())
542
+ if choice.logprobs and choice.logprobs.content is not None else None
543
+ ),
544
+ ) for choice in choices
545
+ ]
546
+
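A minimal usage sketch tying the two converters together; completion, tools and the import path are assumptions, not shown in this hunk:

# completion: ChatCompletion returned by client.chat.completions.create(...)
# tools: the List[ToolInfo] advertised for the request
choices = chat_choices_from_openai(completion, tools)
output = model_output_from_openai(completion, choices)
# output.usage.input_tokens_cache_read is populated only when the API reports
# prompt_tokens_details.cached_tokens (OpenAI exposes cache-read stats only).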
547
+
548
+ def openai_handle_bad_request(model_name: str, e: APIStatusError) -> Union[ModelOutput, Exception]:
549
+ # extract message
550
+ if isinstance(e.body, dict) and 'message' in e.body:
551
+ content = str(e.body.get('message'))
552
+ else:
553
+ content = e.message
554
+
555
+ # narrow stop_reason
556
+ stop_reason: Optional[StopReason] = None
557
+ if e.code == 'context_length_exceeded':
558
+ stop_reason = 'model_length'
559
+ elif (
560
+ e.code == 'invalid_prompt' # seems to happen for o1/o3
561
+ or e.code == 'content_policy_violation' # seems to happen for vision
562
+ or e.code == 'content_filter' # seems to happen on azure
563
+ ):
564
+ stop_reason = 'content_filter'
565
+
566
+ if stop_reason:
567
+ return ModelOutput.from_content(model=model_name, content=content, stop_reason=stop_reason)
568
+ else:
569
+ raise e
570
+
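A hedged usage sketch: openai.APIStatusError is the SDK's real exception type, while client, model_name and messages are placeholders:

import openai

try:
    completion = client.chat.completions.create(model=model_name, messages=messages)
except openai.APIStatusError as e:
    # Recognized codes are mapped to a ModelOutput with stop_reason
    # 'model_length' or 'content_filter'; anything else is re-raised.
    result = openai_handle_bad_request(model_name, e)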
571
+
572
+ def openai_media_filter(key: Optional[JsonValue], value: JsonValue) -> JsonValue:
573
+ # remove images from raw api call
574
+ if key == 'output' and isinstance(value, dict) and 'image_url' in value:
575
+ value = copy(value)
576
+ value.update(image_url=BASE_64_DATA_REMOVED)
577
+ if key == 'image_url' and isinstance(value, dict) and 'url' in value:
578
+ url = str(value.get('url'))
579
+ if url.startswith('data:'):
580
+ value = copy(value)
581
+ value.update(url=BASE_64_DATA_REMOVED)
582
+ elif key == 'input_audio' and isinstance(value, dict) and 'data' in value:
583
+ value = copy(value)
584
+ value.update(data=BASE_64_DATA_REMOVED)
585
+ return value
586
+
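For illustration, how the filter scrubs a base64 data URL while leaving ordinary URLs alone; standalone calls are shown, whereas in practice the filter is applied while walking the raw request/response JSON:

scrubbed = openai_media_filter('image_url', {'url': 'data:image/png;base64,iVBORw0...', 'detail': 'auto'})
# -> {'url': BASE_64_DATA_REMOVED, 'detail': 'auto'}
kept = openai_media_filter('image_url', {'url': 'https://example.com/cat.png'})
# -> returned unchanged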
587
+
588
+ def _parse_content_with_internal(content: str) -> Tuple[str, Optional[JsonValue]]:
589
+ """
590
+ Extracts and removes a smuggled <internal>...</internal> tag from the content string, if present.
591
+
592
+ Note:
593
+ The OpenAI provider does not natively use `.internal`. However, in bridge
594
+ scenarios—where output from a model that does use `.internal` is routed
595
+ through this code—such a tag may be present and should be handled.
596
+
597
+ Args:
598
+ content: The input string, possibly containing an <internal> tag with
599
+ base64-encoded JSON.
600
+
601
+ Returns:
602
+ tuple[str, JsonValue | None]:
603
+ - The content string with the <internal>...</internal> tag removed (if present), otherwise the original string.
604
+ - The decoded and parsed internal value (if present), otherwise None.
605
+
606
+ Raises:
607
+ json.JSONDecodeError: If the content of the <internal> tag is not valid JSON after decoding.
608
+ UnicodeDecodeError: If the content of the <internal> tag is not valid UTF-8 after base64 decoding.
609
+ """ # noqa: E501
610
+ internal_pattern = r'<internal>(.*?)</internal>'
611
+ internal_match = re.search(internal_pattern, content, re.DOTALL)
612
+
613
+ return ((
614
+ re.sub(internal_pattern, '', content, flags=re.DOTALL).strip(),
615
+ json.loads(base64.b64decode(internal_match.group(1)).decode('utf-8')),
616
+ ) if internal_match else (content, None))
617
+
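A round-trip sketch of the smuggled payload the docstring describes, using only the standard library plus the helper above:

import base64
import json

internal = base64.b64encode(json.dumps({'agent': 'sub'}).encode('utf-8')).decode('ascii')
smuggled = f'<internal>{internal}</internal>The tool ran successfully.'

text, payload = _parse_content_with_internal(smuggled)
# text    == 'The tool ran successfully.'
# payload == {'agent': 'sub'}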
618
+
619
+ def collect_stream_response(response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
620
+ collected_chunks: List[ChatCompletionChunk] = []
621
+ collected_messages = defaultdict(list)
622
+ collected_reasoning = defaultdict(list)
623
+ collected_tool_calls = defaultdict(dict)
624
+
625
+ for chunk in response_stream:
626
+ collected_chunks.append(chunk)
627
+ for choice in chunk.choices:
628
+ # Handle reasoning content
629
+ if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
630
+ collected_reasoning[choice.index].append(choice.delta.reasoning_content)
631
+
632
+ # Handle regular content
633
+ if choice.delta.content is not None:
634
+ collected_messages[choice.index].append(choice.delta.content)
635
+
636
+ # Handle tool calls
637
+ if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
638
+ for tool_call in choice.delta.tool_calls:
639
+ tool_id = tool_call.index
640
+
641
+ # Initialize tool call if not present
642
+ if tool_id not in collected_tool_calls[choice.index]:
643
+ collected_tool_calls[choice.index][tool_id] = {
644
+ 'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
645
+ 'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
646
+ 'function': {
647
+ 'name': '',
648
+ 'arguments': ''
649
+ }
650
+ }
651
+
652
+ # Update tool call with new chunks
653
+ if hasattr(tool_call, 'function'):
654
+ if hasattr(tool_call.function, 'name') and tool_call.function.name:
655
+ collected_tool_calls[choice.index][tool_id]['function']['name'] = tool_call.function.name
656
+
657
+ if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
658
+ collected_tool_calls[choice.index
659
+ ][tool_id]['function']['arguments'] += tool_call.function.arguments
660
+
661
+ # Update ID if it was received later
662
+ if hasattr(tool_call, 'id') and tool_call.id:
663
+ collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id
664
+
665
+ # Get all unique choice indices from all collections
666
+ all_indices = set(collected_messages.keys()) | set(collected_reasoning.keys()) | set(collected_tool_calls.keys())
667
+
668
+ choices = []
669
+ for index in all_indices:
670
+ full_reply_content = ''.join(collected_messages.get(index, []))
671
+ reasoning = ''.join(collected_reasoning.get(index, []))
672
+
673
+ # Process tool_calls for this choice if any exists
674
+ tool_calls_list = None
675
+ if index in collected_tool_calls and collected_tool_calls[index]:
676
+ tool_calls_list = list(collected_tool_calls[index].values())
677
+ # Filter out any tool calls with None id (incomplete tool calls)
678
+ tool_calls_list = [tc for tc in tool_calls_list if tc['id'] is not None]
679
+
680
+ # use the finish_reason from the last chunk that generated this choice
681
+ finish_reason = None
682
+ for chunk in reversed(collected_chunks):
683
+ if chunk.choices and chunk.choices[0].index == index:
684
+ finish_reason = chunk.choices[0].finish_reason
685
+ break
686
+
687
+ message_kwargs = {'role': 'assistant', 'content': full_reply_content}
688
+
689
+ if reasoning:
690
+ message_kwargs['reasoning_content'] = reasoning
691
+
692
+ if tool_calls_list:
693
+ message_kwargs['tool_calls'] = tool_calls_list
694
+
695
+ choice = Choice(
696
+ finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs)
697
+ )
698
+ choices.append(choice)
699
+
700
+ # build the final completion object
701
+ return ChatCompletion(
702
+ id=collected_chunks[0].id,
703
+ choices=choices,
704
+ created=collected_chunks[0].created,
705
+ model=collected_chunks[0].model,
706
+ object='chat.completion',
707
+ usage=collected_chunks[-1].usage # use the usage from the last chunk
708
+ )
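A usage sketch for aggregating a finished stream; client, model_name and messages are placeholders, and stream_options={'include_usage': True} is the standard OpenAI option that makes the final chunk carry usage (which collect_stream_response reads from the last chunk):

chunks = list(
    client.chat.completions.create(
        model=model_name,
        messages=messages,
        stream=True,
        stream_options={'include_usage': True},
    )
)
completion = collect_stream_response(chunks)
# completion.choices[0].message.content holds the joined content deltas; any
# streamed reasoning_content and tool_calls are re-attached to the message.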