evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/api/messages/content.py
@@ -0,0 +1,102 @@
from pydantic import BaseModel, Field, JsonValue
from typing import Dict, Literal, Optional, Sequence, Union


class ContentBase(BaseModel):
    internal: Optional[JsonValue] = Field(default=None)
    """Model provider specific payload - typically used to aid transformation back to model types."""


class ContentText(ContentBase):
    """Text content."""

    type: Literal['text'] = Field(default='text')
    """Type."""

    text: str
    """Text content."""

    refusal: Optional[bool] = Field(default=None)
    """Was this a refusal message?"""


class ContentReasoning(ContentBase):
    """Reasoning content.

    See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
    """  # noqa: E501

    type: Literal['reasoning'] = Field(default='reasoning')
    """Type."""

    reasoning: str
    """Reasoning content."""

    signature: Optional[str] = Field(default=None)
    """Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)"""  # noqa: E501

    redacted: bool = Field(default=False)
    """Indicates that the explicit content of this reasoning block has been redacted."""


class ContentImage(ContentBase):
    """Image content."""

    type: Literal['image'] = Field(default='image')
    """Type."""

    image: str
    """Either a URL of the image or the base64 encoded image data."""

    detail: Literal['auto', 'low', 'high'] = Field(default='auto')
    """Specifies the detail level of the image.

    Currently only supported for OpenAI. Learn more in the [Vision guide](https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding).
    """  # noqa: E501


class ContentAudio(ContentBase):
    """Audio content."""

    type: Literal['audio'] = Field(default='audio')
    """Type."""

    audio: str
    """Audio file path or base64 encoded data URL."""

    format: Literal['wav', 'mp3']
    """Format of audio data ('mp3' or 'wav')"""


class ContentVideo(ContentBase):
    """Video content."""

    type: Literal['video'] = Field(default='video')
    """Type."""

    video: str
    """Audio file path or base64 encoded data URL."""

    format: Literal['mp4', 'mpeg', 'mov']
    """Format of video data ('mp4', 'mpeg', or 'mov')"""


class ContentData(ContentBase):
    """Model internal."""

    type: Literal['data'] = Field(default='data')
    """Type."""

    data: Dict[str, JsonValue]
    """Model provider specific payload - required for internal content."""


Content = Union[
    ContentText,
    ContentReasoning,
    ContentImage,
    ContentAudio,
    ContentVideo,
    ContentData,
]
"""Content sent to or received from a model."""
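
The classes above (evalscope/api/messages/content.py) are ordinary pydantic models, so they can be constructed and serialized directly. The snippet below is an illustrative sketch, not part of the diff; it assumes evalscope 1.2.0 is installed and only uses names defined in the file above.

# Illustrative sketch only; the module path follows the new file shown above.
from evalscope.api.messages.content import ContentImage, ContentReasoning, ContentText

parts = [
    ContentText(text='Describe the chart.'),
    ContentImage(image='https://example.com/chart.png', detail='high'),
]
reasoning = ContentReasoning(reasoning='The chart shows a rising trend.', redacted=False)

# Pydantic v2 serialization round-trip
payload = [p.model_dump() for p in parts]
restored = ContentText.model_validate(payload[0])
assert restored.type == 'text'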

evalscope/api/messages/utils.py
@@ -0,0 +1,35 @@
import re
from typing import Optional

from .content import ContentReasoning


def parse_content_with_reasoning(content: str) -> tuple[str, Optional[ContentReasoning]]:
    """
    Looks for and extracts <think/> tags into reasoning text.

    Returns a tuple:
    - The first element is the input content with the <think> tag and its contents fully removed.
    - The second element is a ContentReasoning object (or None if no <think> tag is found).
    """
    # Match <think> tag with optional attributes anywhere in the string
    pattern = (r'<think(?:\s+signature="([^"]*)")?(?:\s+redacted="(true)")?\s*>(.*?)</think>')
    match = re.search(pattern, content, re.DOTALL)

    if match:
        signature = match.group(1)  # This will be None if not present
        redacted_value = match.group(2)  # This will be "true" or None
        reasoning = match.group(3).strip()
        # Remove the matched <think>...</think> from the input
        start, end = match.span()

        return (
            (content[:start] + content[end:]).strip(),
            ContentReasoning(
                reasoning=reasoning,
                signature=signature,
                redacted=redacted_value == 'true',
            ),
        )
    else:
        return content, None
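
For orientation, a small usage sketch of the helper above (illustrative, not part of the diff): the <think> block is stripped from the text and returned as a ContentReasoning object.

from evalscope.api.messages.utils import parse_content_with_reasoning

text = '<think signature="abc">step 1: add the numbers</think>The answer is 42.'
content, reasoning = parse_content_with_reasoning(text)

assert content == 'The answer is 42.'
assert reasoning is not None
assert reasoning.signature == 'abc'
assert reasoning.redacted is False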

evalscope/api/metric/metric.py
@@ -0,0 +1,60 @@
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List, Union

from evalscope.utils import get_logger
from evalscope.utils.function_utils import thread_safe

logger = get_logger()


class Metric(ABC):
    """
    Metric classes operate on a sample level.
    """

    def __init__(self, *args, **kwargs) -> None:
        """
        Can define custom behavior here, if an individual instantiation of a Metric class should have state.
        """

    @abstractmethod
    def apply(self, predictions: List[str], references: List[str]) -> List[float]:
        pass

    def __call__(self, prediction: str, reference: str) -> float:
        """
        Allows the metric to be called like a function.
        """
        return self.apply([prediction], [reference])[0]


class SingletonMetric(Metric):
    """Singleton base class for metrics."""
    _instance = None

    @thread_safe
    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, *args, **kwargs):
        cls = self.__class__
        if hasattr(self, '_init_done'):
            return
        logger.info(f'Initializing {cls.__name__}...')
        self._init_once(*args, **kwargs)
        self._init_done = True

    def _init_once(self, *args, **kwargs):
        pass


class T2IMetric(SingletonMetric):
    """Singleton base class for T2I metrics."""

    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[Union[float, dict]]:
        pass

    def __call__(self, image: str, text: str, **kwargs) -> Union[float, dict]:
        return self.apply([image], [text], **kwargs)[0]
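
A minimal sketch of the intended subclassing pattern (illustrative, not part of the diff): a hypothetical exact-match metric only implements apply, and __call__ then scores a single sample.

from typing import List

# Hypothetical subclass; ExactMatch is not a class shipped in this diff.
from evalscope.api.metric.metric import Metric


class ExactMatch(Metric):
    """Sample-level metric: 1.0 when prediction equals reference."""

    def apply(self, predictions: List[str], references: List[str]) -> List[float]:
        return [float(p.strip() == r.strip()) for p, r in zip(predictions, references)]


metric = ExactMatch()
assert metric('42', '42') == 1.0  # __call__ delegates to apply()
assert metric.apply(['a', 'b'], ['a', 'c']) == [1.0, 0.0]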

evalscope/api/metric/scorer.py
@@ -0,0 +1,113 @@
from pydantic import BaseModel, Field
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

from evalscope.utils.logger import get_logger

logger = get_logger()

Value = Dict[str, Union[int, float, bool]]


class Score(BaseModel):
    """Score generated by a scorer."""

    value: Value = Field(default_factory=dict)
    """Score value as a dictionary. Key is the score name, value is the score value.
    The first key is considered the main score by default."""

    extracted_prediction: Optional[str] = Field(default=None)
    """Answer extracted from model output (optional)"""

    prediction: Optional[str] = Field(default=None)
    """Original prediction text from the model (optional)"""

    explanation: Optional[str] = Field(default=None)
    """Explanation of score (optional)."""

    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
    """Additional metadata related to the score"""

    main_score_name: Optional[str] = Field(default=None)
    """Main score name, if applicable. This is used to indicate which score is the primary score in a multi-score scenario."""  # noqa: E501

    @property
    def main_value(self) -> Union[int, float, bool]:
        """Main score value."""
        if self.main_score_name and self.main_score_name in self.value:
            return self.value[self.main_score_name]
        elif self.value:
            # If main_score_name is not set or not found, use the first value and update main_score_name
            first_key = next(iter(self.value))
            self.main_score_name = first_key
            return self.value[first_key]
        return None

    @main_value.setter
    def main_value(self, value: Union[int, float, bool]):
        """Set the main score value."""
        if self.main_score_name:
            # If main_score_name is already set, use it
            self.value[self.main_score_name] = value
        elif self.value:
            # If no main_score_name but value dict exists, use the first key
            first_key = next(iter(self.value))
            self.main_score_name = first_key
            self.value[first_key] = value
        else:
            # If neither main_score_name nor value dict exists, initialize both
            self.main_score_name = 'default'
            self.value[self.main_score_name] = value


class SampleScore(BaseModel):
    """Score for a Sample."""

    score: Score
    """A score"""

    sample_id: Optional[Union[str, int]] = Field(default=None)
    """A sample id"""

    group_id: Optional[Union[str, int]] = Field(default=None)
    """A group id for the sample, used for grouping k repeated samples."""

    sample_metadata: Optional[Dict[str, Any]] = Field(default=None)
    """Metadata from the sample"""


class AggScore(BaseModel):
    """Output of an aggregation operation."""

    score: float = Field(default=0.0)
    """Aggregated value as a float."""

    metric_name: str = Field(default='')
    """Name of the metric being aggregated."""

    aggregation_name: str = Field(default='')
    """Name of the aggregation methods"""

    num: int = Field(default=0)
    """Number of samples used in the aggregation."""

    ids: Optional[List[Union[str, int]]] = Field(default=None)
    """List of sample IDs used in the aggregation, if applicable."""

    metadata: Optional[Dict[str, Any]] = Field(default=None)
    """Additional metadata related to the aggregation."""


class Aggregator:

    name = 'default'

    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
        r"""Aggregate a metric on a list of scores.

        Args:
            scores: List of scores.

        Returns:
            List[AggregatOutput]: List of aggregated outputs.
        """
        ...
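
A short sketch (not part of the diff) of the Score/SampleScore containers, showing the convention that the first key of value acts as the main score when main_score_name is unset.

from evalscope.api.metric.scorer import SampleScore, Score

score = Score(
    value={'acc': 1.0, 'f1': 0.87},
    prediction='The answer is Paris.',
    extracted_prediction='Paris',
)

# The first key is treated as the main score unless main_score_name is set.
assert score.main_value == 1.0
assert score.main_score_name == 'acc'

sample_score = SampleScore(score=score, sample_id='sample-001')
print(sample_score.model_dump(exclude_none=True))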

evalscope/api/mixin/llm_judge_mixin.py
@@ -0,0 +1,170 @@
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from evalscope.api.evaluator import TaskState
from evalscope.api.metric import Score
from evalscope.constants import JudgeStrategy
from evalscope.metrics import LLMJudge
from evalscope.utils.logger import get_logger

if TYPE_CHECKING:
    from evalscope.config import TaskConfig

logger = get_logger()


class LLMJudgeMixin:
    """
    Mixin class for LLM Judge functionality.
    """

    def __init__(self, task_config: 'TaskConfig'):
        self._task_config = task_config
        self._use_llm_judge = False
        """Whether to use LLM as a judge"""

        self._llm_judge: Optional[LLMJudge] = None

        super().__init__(task_config=task_config)

    @property
    def llm_judge(self) -> Optional[LLMJudge]:
        """Get LLM judge instance with lazy initialization."""
        if self._llm_judge is None and self.use_llm_judge:
            self._llm_judge = self.init_llm_judge()
        return self._llm_judge

    @llm_judge.setter
    def llm_judge(self, value: Optional[LLMJudge]):
        """Set LLM judge instance."""
        self._llm_judge = value

    @property
    def judge_strategy(self) -> str:
        """Get the judge strategy from the task configuration."""
        return self._task_config.judge_strategy

    @property
    def use_llm_judge(self) -> bool:
        """Check if LLM judge is enabled."""
        if self.judge_strategy == JudgeStrategy.RULE:
            return False
        elif self.judge_strategy == JudgeStrategy.LLM:
            return True
        elif self.judge_strategy == JudgeStrategy.LLM_RECALL:
            return True
        elif self.judge_strategy == JudgeStrategy.AUTO:
            return self._use_llm_judge
        else:
            logger.warning(f'Unknown judge strategy: {self.judge_strategy}. Defaulting to False.')
            return False

    def init_llm_judge(self) -> Optional[LLMJudge]:
        """
        Initialize the LLM judge for the benchmark.

        Returns:
            Optional[LLMJudge]: The initialized LLM judge instance or None
        """

        if self.judge_strategy == JudgeStrategy.RULE:
            return None
        else:
            return LLMJudge(**self._task_config.judge_model_args)

    def maybe_llm_match_score(
        self,
        original_prediction: str,
        filtered_prediction: str,
        reference: str,
        task_state: TaskState,
        rule_based_score: Optional[Score] = None,
    ) -> Score:
        """
        Compute the match score between the original and filtered predictions against the reference.

        Args:
            original_prediction: The original prediction output from the model.
            filtered_prediction: The filtered prediction output from the model.
            reference: The ground truth reference output.
            task_state: The current task state.
            original_score: Optional original score to be used for comparison.

        Returns:
            Score: The computed match score.
        """
        # If LLM judge is not used, return the rule-based score directly
        if not self.use_llm_judge:
            return rule_based_score

        # For LLM_RECALL, if rule-based score is already perfect, skip LLM judge
        if float(rule_based_score.main_value) > 0.99:
            return rule_based_score

        # Compute LLM judge score
        llm_score = self.llm_match_score(
            original_prediction=original_prediction,
            filtered_prediction=filtered_prediction,
            reference=reference,
            task_state=task_state,
        )

        # For LLM RECALL, merge the scores
        return self._merge_scores(rule_based_score, llm_score)

    def llm_match_score(
        self,
        original_prediction: str,
        filtered_prediction: str,
        reference: str,
        task_state: TaskState,
    ) -> Score:
        """Compute the LLM match score.

        Args:
            original_prediction (str): The original prediction output from the model.
            filtered_prediction (str): The filtered prediction output from the model.
            reference (str): The ground truth reference output.
            task_state (TaskState): The current task state.

        Returns:
            Score: The computed match score.
        """
        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )

        question = task_state.input_text

        # Request judge and obtain score
        prompt = self.llm_judge.build_prompt(pred=original_prediction, gold=reference, question=question)
        judge_response = self.llm_judge.judge(prompt)
        judge_score = self.llm_judge.get_score(judge_response)

        score.value = {'acc': judge_score}
        score.explanation = f'LLM judge: {judge_response}'
        score.metadata = {
            'source': 'llm_judge',
            'judge_strategy': self.judge_strategy,
            'model': self.llm_judge.model_id
        }

        return score

    def _merge_scores(self, rule_based_score: Score, llm_score: Score) -> Score:
        """
        Merge rule-based score with LLM judge score for LLM_RECALL strategy.

        Args:
            rule_based_score: The original rule-based score
            llm_score: The LLM judge score

        Returns:
            Score: The merged score
        """
        # Update the main value with LLM judge result
        rule_based_score.main_value = llm_score.main_value
        rule_based_score.explanation = llm_score.explanation
        rule_based_score.metadata = llm_score.metadata

        return rule_based_score
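
A rough sketch of how the mixin is meant to be composed into a benchmark adapter (hypothetical; the stub base class and SimpleNamespace config are placeholders, not APIs from this diff). With JudgeStrategy.RULE the judge model is never called, so the rule-based score passes through unchanged.

# Hypothetical wiring sketch only.
from types import SimpleNamespace

from evalscope.api.metric import Score
from evalscope.api.mixin.llm_judge_mixin import LLMJudgeMixin
from evalscope.constants import JudgeStrategy


class _AdapterStub:
    """Placeholder for the real benchmark adapter base class."""

    def __init__(self, task_config=None, **kwargs):
        self.task_config = task_config


class MyAdapter(LLMJudgeMixin, _AdapterStub):
    pass


config = SimpleNamespace(judge_strategy=JudgeStrategy.RULE, judge_model_args={})
adapter = MyAdapter(task_config=config)

rule_score = Score(value={'acc': 0.0}, prediction='foo', extracted_prediction='foo')
result = adapter.maybe_llm_match_score(
    original_prediction='foo',
    filtered_prediction='foo',
    reference='bar',
    task_state=None,  # not consulted on the rule-based path
    rule_based_score=rule_score,
)
assert result is rule_score  # RULE strategy: rule-based score returned as-is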

evalscope/api/mixin/sandbox_mixin.py
@@ -0,0 +1,182 @@
from typing import TYPE_CHECKING, Any, Dict, Optional

from evalscope.utils.function_utils import AsyncioLoopRunner, thread_safe
from evalscope.utils.logger import get_logger

if TYPE_CHECKING:
    from ms_enclave.sandbox.manager import SandboxManager

    from evalscope.config import TaskConfig

logger = get_logger()


class SandboxMixin:
    """Sandbox mixin for sandboxed code execution."""

    def __init__(self, task_config: 'TaskConfig'):
        self._task_config = task_config

        self._manager: Optional['SandboxManager'] = None
        """Sandbox manager instance."""

        self._sandbox_id: Optional[str] = None
        """Sandbox ID."""

        # Lazy init state
        self._initialized: bool = False

        # NOTE: Initialization is deferred.
        super().__init__()

    async def _async_init(self):
        """Async initialization helper."""
        await self.init_sandbox_manager_async()
        await self.init_sandbox_async()

    @property
    def use_sandbox(self) -> bool:
        """
        Return whether to use sandbox for the benchmark.
        """
        if not self._task_config:
            return False
        else:
            return self._task_config.use_sandbox

    @property
    def sandbox_manager(self) -> Optional['SandboxManager']:
        """Get the sandbox manager instance."""
        return self._manager

    @property
    def sandbox_id(self) -> Optional[str]:
        """Get the sandbox ID."""
        return self._sandbox_id

    @thread_safe
    def ensure_sandbox_ready(self) -> bool:
        """
        Ensure the sandbox loop, manager, and sandbox instance are initialized.
        This method is thread-safe and idempotent.
        """
        if not self.use_sandbox:
            return False

        if self._initialized and self._manager and self._sandbox_id:
            return True

        # Initialize manager and sandbox using the class-level runner
        AsyncioLoopRunner.run(self.init_sandbox_manager_async())
        AsyncioLoopRunner.run(self.init_sandbox_async())

        self._initialized = True
        return True

    async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
        """Initialize the sandbox manager asynchronously."""
        if self._manager is not None:
            return self._manager

        if not self.use_sandbox:
            return None

        from ms_enclave.sandbox.manager import HttpSandboxManager, LocalSandboxManager

        manager_config = self._task_config.sandbox_manager_config or {}
        if manager_config.get('base_url'):
            # Remote manager
            self._manager = HttpSandboxManager(**manager_config)
        else:
            # Local manager
            self._manager = LocalSandboxManager(**manager_config)

        await self._manager.start()
        logger.info('Sandbox manager initialized.')
        return self._manager

    def init_sandbox_manager(self) -> Optional['SandboxManager']:
        """Initialize the sandbox manager."""
        if self._manager is not None:
            return self._manager

        if not self.use_sandbox:
            return None

        return AsyncioLoopRunner.run(self.init_sandbox_manager_async())

    async def init_sandbox_async(self) -> Optional[str]:
        """Initialize the sandbox instance asynchronously."""
        if self._sandbox_id is not None:
            return self._sandbox_id

        if not self.use_sandbox:
            return None

        from ms_enclave.sandbox.model import DockerSandboxConfig, SandboxType

        sandbox_config = self._task_config.sandbox_config or DockerSandboxConfig(
            image='python:3.11-slim', tools_config={
                'shell_executor': {},
                'python_executor': {}
            }
        )
        sandbox_type = self._task_config.sandbox_type or SandboxType.DOCKER

        self._sandbox_id = await self._manager.create_sandbox(sandbox_type=sandbox_type, config=sandbox_config)

        sandbox_info = await self._manager.get_sandbox_info(self._sandbox_id)

        logger.info(f'Sandbox of type {sandbox_type} initialized. Info: {sandbox_info.model_dump(exclude_none=True)}')
        return self._sandbox_id

    def init_sandbox(self) -> Optional[str]:
        """Initialize the sandbox instance."""
        if self._sandbox_id is not None:
            return self._sandbox_id

        if not self.use_sandbox:
            return None

        return AsyncioLoopRunner.run(self.init_sandbox_async())

    def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
        """Execute code in the sandbox."""
        # Lazy, thread-safe initialization
        if not self.ensure_sandbox_ready():
            logger.warning('Sandbox is not initialized.')
            return {'error': 'Sandbox is not initialized.'}

        from ms_enclave.sandbox.model import ExecutionStatus, ToolResult

        async def _execute_async():
            if language.lower() == 'python':
                tool_name = 'python_executor'
                parameters = {'code': code, 'timeout': timeout}
                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
            elif language.lower() == 'shell':
                tool_name = 'shell_executor'
                parameters = {'command': code, 'timeout': timeout}
                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
            else:
                logger.warning(f"Unsupported language: {language}. Supported languages are 'python' and 'shell'.")
                result = ToolResult(
                    status=ExecutionStatus.ERROR,
                    tool_name='code_executor',
                    output=f"Unsupported language: {language}. Supported languages are 'python' and 'shell'."
                )
            return result

        # Execute in background loop via class-level runner
        result = AsyncioLoopRunner.run(_execute_async(), timeout=timeout + 10)
        return result.model_dump(exclude_none=True)

    def sandbox_finalize(self, *args, **kwargs):
        """Finalize the sandbox manager."""
        if self._manager:
            try:
                # Stop the manager but keep the shared loop alive
                AsyncioLoopRunner.run(self._manager.stop(), timeout=30)
                logger.info('Sandbox manager finalized.')
            except Exception as e:
                logger.warning(f'Error finalizing sandbox manager: {e}')
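
A hedged sketch of the sandbox execution path (hypothetical wiring; it requires the optional ms_enclave dependency plus a local Docker daemon, and the stub base class and SimpleNamespace config are placeholders, not APIs from this diff). With no explicit sandbox settings the mixin falls back to a local Docker sandbox based on python:3.11-slim.

# Hypothetical usage sketch only.
from types import SimpleNamespace

from evalscope.api.mixin.sandbox_mixin import SandboxMixin


class _RunnerStub:
    """Placeholder for the real benchmark adapter base class."""

    def __init__(self, **kwargs):
        pass


class CodeBenchmark(SandboxMixin, _RunnerStub):
    pass


config = SimpleNamespace(
    use_sandbox=True,
    sandbox_manager_config=None,  # falls back to a LocalSandboxManager
    sandbox_config=None,          # falls back to the python:3.11-slim Docker image
    sandbox_type=None,            # falls back to SandboxType.DOCKER
)
bench = CodeBenchmark(task_config=config)

result = bench.execute_code_in_sandbox('print(1 + 1)', timeout=30, language='python')
print(result)  # dumped ToolResult, e.g. status/output fields
bench.sandbox_finalize()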

evalscope/api/model/__init__.py
@@ -0,0 +1,12 @@
from .generate_config import GenerateConfig
from .model import Model, ModelAPI, get_model, get_model_with_task_config
from .model_output import (
    ChatCompletionChoice,
    Logprob,
    Logprobs,
    ModelOutput,
    ModelUsage,
    StopReason,
    TopLogprob,
    as_stop_reason,
)