evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/plugin/api/openai_api.py
(removed lines ending in "…" are truncated in the source diff view)

@@ -1,35 +1,38 @@
 import json
+import math
 import os
-from …
+from collections import defaultdict
+from typing import Any, Dict, List, Tuple, Union

 from evalscope.perf.arguments import Arguments
-from evalscope.perf.plugin.api.…
+from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.io_utils import base64_to_PIL
 from evalscope.utils.logger import get_logger

 logger = get_logger()


 @register_api(['openai', 'local_vllm', 'local'])
-class OpenaiPlugin(…
+class OpenaiPlugin(DefaultApiPlugin):
     """Base of openai interface."""

-    def __init__(self, …
-        """
+    def __init__(self, param: Arguments):
+        """Initialize the OpenaiPlugin.

         Args:
-            …
-            …
-            …
+            param (Arguments): Configuration object containing parameters
+                such as the tokenizer path and model details. If a tokenizer
+                path is provided, it is used to initialize the tokenizer.
         """
-        super().__init__(…
-        if …
-            from …
-            self.tokenizer = AutoTokenizer.from_pretrained(…
+        super().__init__(param=param)
+        if param.tokenizer_path is not None:
+            from modelscope import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(param.tokenizer_path)
         else:
             self.tokenizer = None

-    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments = None) -> Dict:
         """Build the openai format request based on prompt, dataset

         Args:

@@ -42,6 +45,7 @@ class OpenaiPlugin(ApiPluginBase):
         Returns:
             Dict: The request body. None if prompt format is error.
         """
+        param = param or self.param
         try:
             if param.query_template is not None:
                 if param.query_template.startswith('@'):

@@ -54,8 +58,6 @@ class OpenaiPlugin(ApiPluginBase):
                 else:
                     query = json.loads(param.query_template)

-                if 'stream' in query.keys():
-                    param.stream = query['stream']
                 # replace template messages with input messages.
                 query['messages'] = messages
             elif isinstance(messages, str):

@@ -75,6 +77,8 @@ class OpenaiPlugin(ApiPluginBase):
             payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
             payload['frequency_penalty'] = param.frequency_penalty
+        if param.repetition_penalty is not None:
+            payload['repetition_penalty'] = param.repetition_penalty
         if param.logprobs is not None:
             payload['logprobs'] = param.logprobs
         if param.n_choices is not None:

@@ -92,68 +96,143 @@ class OpenaiPlugin(ApiPluginBase):
             payload['temperature'] = param.temperature
         if param.top_p is not None:
             payload['top_p'] = param.top_p
+        if param.top_k is not None:
+            payload['top_k'] = param.top_k
+        if param.extra_args is not None:
+            payload.update(param.extra_args)
         return payload

-    def parse_responses(self, responses, request: …
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> tuple[int, int]:
         """Parser responses and return number of request and response tokens.
         Only one response for non-stream, multiple responses for stream.
         """

         # when stream, the last response is the full usage
         # when non-stream, the last response is the first response
-        last_response_js = …
+        last_response_js = responses[-1]
         if 'usage' in last_response_js and last_response_js['usage']:
             input_tokens = last_response_js['usage']['prompt_tokens']
             output_tokens = last_response_js['usage']['completion_tokens']
             return input_tokens, output_tokens

         # no usage information in the response, parse the response to get the tokens
-        delta_contents = …
+        delta_contents = defaultdict(list)
         for response in responses:
-            …
-            …
-                self.__process_response_object(js, delta_contents)
+            if 'object' in response:
+                self.__process_response_object(response, delta_contents)
             else:
-                self.__process_no_object(…
+                self.__process_no_object(response, delta_contents)

         input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
         return input_tokens, output_tokens

-    def __process_response_object(self, …
-        if …
-            …
+    def __process_response_object(self, response, delta_contents):
+        if not response.get('choices'):
+            return
+        if response['object'] == 'chat.completion':
+            for choice in response['choices']:
                 delta_contents[choice['index']] = [choice['message']['content']]
-        elif …
-            for choice in …
-                …
-                …
-                …
+        elif response['object'] == 'text_completion':
+            for choice in response['choices']:
+                if 'text' in choice and 'index' in choice:
+                    delta_contents[choice['index']].append(choice['text'])
+        elif response['object'] == 'chat.completion.chunk':
+            for choice in response['choices']:
                 if 'delta' in choice and 'index' in choice:
                     delta = choice['delta']
                     idx = choice['index']
                     if 'content' in delta:
-                        …
-                        delta_contents.setdefault(idx, []).append(delta_content)
+                        delta_contents[idx].append(delta['content'])

-    def __process_no_object(self, …
+    def __process_no_object(self, response, delta_contents):
         # assume the response is a single choice
-        …
+        if not response.get('choices'):
+            return
+        for choice in response['choices']:
             if 'delta' in choice:
                 delta = choice['delta']
                 idx = choice['index']
                 if 'content' in delta:
-                    …
-                    delta_contents.setdefault(idx, []).append(delta_content)
+                    delta_contents[idx].append(delta['content'])
             else:
                 delta_contents[choice['index']] = [choice['message']['content']]

-    def __calculate_tokens_from_content(self, request, …
+    def __calculate_tokens_from_content(self, request, content):
         input_tokens = output_tokens = 0
         if self.tokenizer is not None:
-            …
+            # Calculate input tokens
+            input_tokens += self._count_input_tokens(request)
+            for idx, choice_contents in content.items():
                 full_response_content = ''.join(choice_contents)
-                …
-                output_tokens += …
+                # Calculate output tokens
+                output_tokens += self._count_output_tokens(full_response_content)
         else:
-            …
+            raise ValueError(
+                'Error: Unable to retrieve usage information\n\n'
+                'This error occurs when:\n'
+                '1. The API response does not contain usage data, AND\n'
+                '2. No tokenizer has been specified or found.\n\n'
+                'To resolve this issue, do ONE of the following:\n'
+                "a) Ensure that the API you're using supports and returns usage information, OR\n"
+                'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
+                'If you continue to experience issues, '
+                'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
+            )
         return input_tokens, output_tokens
+
+    def _count_input_tokens(self, request_str: str) -> int:
+        """Count the number of input tokens in the request.
+
+        This method handles different types of requests and calculates tokens for:
+        - Text content in messages or prompts
+        - Images in multimodal messages (converted to patch tokens)
+
+        Args:
+            request_str (str): The request json str containing either 'messages' for chat
+                completion or 'prompt' for text completion.
+
+        Returns:
+            int: The total number of input tokens including text and image tokens.
+        """
+        input_tokens = 0
+        request = json.loads(request_str)
+        if 'messages' in request:
+            input_content = self.tokenizer.apply_chat_template(
+                request['messages'], tokenize=True, add_generation_prompt=True
+            )
+            input_tokens += len(input_content)
+            # handle image tokens if any
+            for message in request['messages']:
+                content = message.get('content', '')
+                if isinstance(content, str):
+                    continue
+                for cont in content:
+                    if cont['type'] == 'image_url':
+                        try:
+                            # assuming image_url is base64 string
+                            image_base64 = cont['image_url']['url']
+                            image = base64_to_PIL(image_base64)
+                            # Use math.ceil for more accurate token count when image dimensions
+                            # aren't perfectly divisible by patch size
+                            n_patches = (
+                                math.ceil(image.height / self.param.image_patch_size)
+                                * math.ceil(image.width / self.param.image_patch_size)
+                            )
+                            input_tokens += n_patches
+                        except Exception as e:
+                            logger.warning(f'Failed to process image for token counting: {e}')
+                            # Continue processing other content without failing
+        elif 'prompt' in request:
+            input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+        return input_tokens
+
+    def _count_output_tokens(self, response: str) -> int:
+        """Count the number of output tokens in the response. Only string response is supported.
+
+        Args:
+            response (str): The API response text.
+
+        Returns:
+            int: The number of output tokens.
+        """
+        return len(self.tokenizer.encode(response, add_special_tokens=False))
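For context, the image handling added in `_count_input_tokens` estimates image tokens from ceil-divided patch counts. A minimal sketch of that arithmetic, assuming example values (the 512×512 image size and the 28-pixel patch size are illustrative; evalscope reads the patch size from `param.image_patch_size`):

```python
import math

# Assumed example values, not defaults taken from this diff.
height, width, patch_size = 512, 512, 28

# Same estimate as the diff: ceil-divide each dimension so partial patches still count.
n_patches = math.ceil(height / patch_size) * math.ceil(width / patch_size)
print(n_patches)  # 19 * 19 = 361 estimated image tokens for this request
```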
evalscope/perf/plugin/datasets/__init__.py

@@ -1,6 +1,10 @@
-from …
-from …
-from …
-from …
-from …
-from …
+from .base import DatasetPluginBase
+from .custom import CustomDatasetPlugin
+from .flickr8k import FlickrDatasetPlugin
+from .kontext_bench import KontextDatasetPlugin
+from .line_by_line import LineByLineDatasetPlugin
+from .longalpaca import LongAlpacaDatasetPlugin
+from .openqa import OpenqaDatasetPlugin
+from .random_dataset import RandomDatasetPlugin
+from .random_vl_dataset import RandomVLDatasetPlugin
+from .speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/base.py

@@ -1,7 +1,7 @@
 import json
 import sys
 from abc import abstractmethod
-from typing import Any, Dict, Iterator, List, Tuple
+from typing import Any, Dict, Iterator, List, Tuple, Union

 from evalscope.perf.arguments import Arguments


@@ -15,6 +15,11 @@ class DatasetPluginBase:
            dataset_path (str, optional): The input dataset path. Defaults to None.
        """
        self.query_parameters = query_parameters
+        if query_parameters.tokenizer_path:
+            from modelscope import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(query_parameters.tokenizer_path, trust_remote_code=True)
+        else:
+            self.tokenizer = None

    def __next__(self):
        for item in self.build_messages():

@@ -64,3 +69,40 @@ class DatasetPluginBase:
        data = json.loads(content)
        for item in data:
            yield item
+
+    def create_message(self, text: str, image_urls: Union[List[str], str] = None, role: str = 'user') -> Dict:
+        """Create a message with text and optional image URLs.
+
+        Args:
+            text (str): The text content of the message.
+            image_urls (List[str], optional): List of image URLs. Defaults to None.
+            role (str, optional): The role of the message sender. Defaults to "user".
+
+        Returns:
+            Dict: A dictionary representing the message.
+        """
+        if image_urls is None:
+            message = {'role': role, 'content': text}
+        else:
+            message = {'role': role, 'content': [{'type': 'text', 'text': text}]}
+            if isinstance(image_urls, str):
+                image_urls = [image_urls]
+            for url in image_urls:
+                message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
+        return message
+
+    def check_prompt_length(self, prompt: str) -> Tuple[bool, int]:
+        """Check if the prompt length is within the specified range.
+
+        Args:
+            prompt (str): The input prompt string.
+
+        Returns:
+            Tuple[bool, int]: A tuple containing a boolean indicating whether the prompt is valid and its length.
+        """
+        if self.tokenizer is None:
+            prompt_length = len(prompt)
+        else:
+            prompt_length = len(self.tokenizer.encode(prompt))
+        is_valid = self.query_parameters.min_prompt_length <= prompt_length <= self.query_parameters.max_prompt_length
+        return is_valid, prompt_length
evalscope/perf/plugin/datasets/custom.py

@@ -16,6 +16,25 @@ class CustomDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-            …
-            …
-            …
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
+                if self.query_parameters.apply_chat_template:
+                    message = self.create_message(prompt)
+                    yield [message]
+                else:
+                    yield prompt
+
+
+if __name__ == '__main__':
+    from evalscope.perf.arguments import Arguments
+    from evalscope.perf.main import run_perf_benchmark
+
+    args = Arguments(
+        model='qwen2.5-7b-instruct',
+        url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        dataset_path='outputs/perf_data.txt',
+        api_key='EMPTY',
+        dataset='custom',
+    )
+
+    run_perf_benchmark(args)
evalscope/perf/plugin/datasets/flickr8k.py

@@ -1,18 +1,9 @@
-import base64
-from io import BytesIO
-from PIL import Image
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
-
-
-def PIL_to_base64(image: Image.Image) -> str:
-    buffered = BytesIO()
-    image.save(buffered, format='JPEG')
-    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
-    return img_str
+from evalscope.utils.io_utils import PIL_to_base64


 @register_dataset('flickr8k')

@@ -30,21 +21,8 @@ class FlickrDatasetPlugin(DatasetPluginBase):

         for item in dataset:
             pil_image = item['jpg']
-            …
+            text = item['txt']
+            base64_image = PIL_to_base64(pil_image, add_header=True)

-            …
-            …
-                'user',
-                'content': [
-                    {
-                        'type': 'text',
-                        'text': 'Describe the image'
-                    },
-                    {
-                        'type': 'image_url',
-                        'image_url': {
-                            'url': f'data:image/jpeg;base64,{base64_iamge}',
-                        }
-                    },
-                ],
-            }]
+            message = self.create_message(text=text, image_urls=base64_image)
+            yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py

@@ -0,0 +1,28 @@
+from typing import Any, Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('kontext_bench')
+class KontextDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    Datasets: https://modelscope.cn/datasets/black-forest-labs/kontext-bench/dataPeview
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        from modelscope.msdatasets import MsDataset
+        dataset = MsDataset.load('black-forest-labs/kontext-bench', subset_name='default', split='test')
+
+        for item in dataset:
+            pil_image = item['image']
+            text = item['instruction']
+            base64_image = PIL_to_base64(pil_image, add_header=True)
+
+            message = self.create_message(text=text, image_urls=base64_image)
+            yield [message]
evalscope/perf/plugin/datasets/line_by_line.py

@@ -17,6 +17,10 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-            …
-            …
-            …
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
+                if self.query_parameters.apply_chat_template:
+                    message = self.create_message(prompt)
+                    yield [message]
+                else:
+                    yield prompt
evalscope/perf/plugin/datasets/longalpaca.py

@@ -22,6 +22,10 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
         ds = self.dataset_json_list(self.query_parameters.dataset_path)
         for item in ds:
             prompt = item['instruction'].strip()
-            …
-            …
-            …
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
+                if self.query_parameters.apply_chat_template:
+                    message = self.create_message(prompt)
+                    yield [message]
+                else:
+                    yield prompt
evalscope/perf/plugin/datasets/openqa.py

@@ -1,5 +1,5 @@
 import json
-import …
+import os
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments

@@ -18,20 +18,19 @@ class OpenqaDatasetPlugin(DatasetPluginBase):

     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
-            …
-            …
-            …
-            …
-            …
-                'open_qa.jsonl',
-                '--local_dir',
-                './data',
-            ])
-            self.query_parameters.dataset_path = './data/open_qa.jsonl'
+            from modelscope import dataset_snapshot_download
+
+            file_name = 'open_qa.jsonl'
+            local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+            self.query_parameters.dataset_path = os.path.join(local_path, file_name)

         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
             prompt = item['question'].strip()
-            …
-            …
-            …
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
+                if self.query_parameters.apply_chat_template:
+                    message = self.create_message(prompt)
+                    yield [message]
+                else:
+                    yield prompt
@@ -0,0 +1,67 @@
+import numpy as np
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('random')
+class RandomDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        assert query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer-path`.'  # noqa: E501
+        super().__init__(query_parameters)
+
+        self.prefix_length = self.query_parameters.prefix_length
+        self.prefix_ids = self.get_random_inputs(self.prefix_length)
+        self.template_len = self.get_template_len()
+        self.number = self.query_parameters.number or 1
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        if self.query_parameters.apply_chat_template:
+            min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+            max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+        else:
+            min_prompt_length = self.query_parameters.min_prompt_length
+            max_prompt_length = self.query_parameters.max_prompt_length + 1
+
+        assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
+        assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
+
+        # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1  # noqa: E501
+        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+        vocab_size = self.tokenizer.vocab_size
+
+        for i in range(self.number):
+            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % vocab_size).tolist()
+            token_sequence = self.prefix_ids + inner_seq
+            prompt = self.tokenizer.decode(token_sequence)
+
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            total_input_len = self.prefix_length + int(input_lens[i])
+            re_encoded_sequence = self.tokenizer.encode(prompt, add_special_tokens=False)[:total_input_len]
+            prompt = self.tokenizer.decode(re_encoded_sequence)
+
+            if self.query_parameters.apply_chat_template:
+                message = self.create_message(prompt)
+                yield [message]
+            else:
+                yield prompt
+
+    def get_random_inputs(self, length: int) -> List[int]:
+        if length <= 0:
+            return []
+        input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+        return input_ids
+
+    def get_template_len(self):
+        empty_message = [self.create_message(text='')]
+        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+        return len(template)
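The decode/re-encode round trip in `build_messages` mirrors the referenced vLLM benchmark: decoding random token ids and re-tokenizing the resulting string does not always yield the same number of tokens, so the re-encoded sequence is truncated back to the target length. A standalone sketch of the same trick with a Hugging Face tokenizer; the tokenizer path is only an example.

# Minimal reproduction of the random-prompt trick, assuming a transformers tokenizer.
import numpy as np
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # example tokenizer path
target_len = 128
ids = np.random.randint(0, tok.vocab_size, size=target_len).tolist()
text = tok.decode(ids)                                    # may re-tokenize to != target_len tokens
re_ids = tok.encode(text, add_special_tokens=False)[:target_len]
prompt = tok.decode(re_ids)                               # prompt bounded by target_len tokens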
@@ -0,0 +1,80 @@
+import random
+from PIL import Image, ImageDraw
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('random_vl')
+class RandomVLDatasetPlugin(RandomDatasetPlugin):
+    """Random Vision-Language Dataset Plugin for multimodal model stress testing."""
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+        # Vision-language specific parameters
+        self.image_width = query_parameters.image_width
+        self.image_height = query_parameters.image_height
+        self.image_format = query_parameters.image_format
+        self.image_num = query_parameters.image_num
+
+        assert self.image_num > 0, 'image_num must be greater than 0.'
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        # Reuse parent's message generation logic
+        for messages in super().build_messages():
+            prompt = messages[0]['content'] if isinstance(messages[0], dict) else messages[0]
+
+            # Generate random images based on image_num
+            images_b64 = []
+            for _ in range(self.image_num):
+                images_b64.append(self._generate_random_image_b64())
+
+            message = self.create_message(text=prompt, image_urls=images_b64)
+            yield [message]
+
+    def _generate_random_image_b64(self) -> str:
+        """Generate a random image and return as base64 string."""
+        # Create a random colored image
+        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+        image = Image.new(self.image_format, (self.image_width, self.image_height), color)
+
+        # Add some random shapes for variety
+        draw = ImageDraw.Draw(image)
+        for _ in range(random.randint(1, 5)):
+            shape_type = random.choice(['rectangle', 'ellipse', 'line'])
+
+            # Generate two random points
+            x1 = random.randint(0, self.image_width - 1)
+            y1 = random.randint(0, self.image_height - 1)
+            x2 = random.randint(0, self.image_width - 1)
+            y2 = random.randint(0, self.image_height - 1)
+
+            # Ensure proper coordinate ordering (x1 <= x2, y1 <= y2)
+            if x1 > x2:
+                x1, x2 = x2, x1
+            if y1 > y2:
+                y1, y2 = y2, y1
+
+            # Ensure we have at least a 1-pixel difference
+            if x1 == x2:
+                x2 = min(x1 + 1, self.image_width - 1)
+            if y1 == y2:
+                y2 = min(y1 + 1, self.image_height - 1)
+
+            coords = [x1, y1, x2, y2]
+
+            shape_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+
+            if shape_type == 'rectangle':
+                draw.rectangle(coords, fill=shape_color)
+            elif shape_type == 'ellipse':
+                draw.ellipse(coords, fill=shape_color)
+            else:
+                draw.line(coords, fill=shape_color, width=random.randint(1, 5))
+
+        # Convert to base64
+        return PIL_to_base64(image, format='PNG', add_header=True)
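`PIL_to_base64(image, format='PNG', add_header=True)` comes from `evalscope.utils.io_utils` and is not shown in this diff. The sketch below captures the behavior its call sites imply, returning a PNG data URL suitable for an OpenAI-style image_url field; it is an assumed equivalent, not the library's actual implementation.

# Assumed equivalent of PIL_to_base64 with add_header=True: encode the image as a data URL.
import base64
import io

from PIL import Image


def pil_to_data_url(image: Image.Image, fmt: str = 'PNG') -> str:
    buf = io.BytesIO()
    image.save(buf, format=fmt)
    encoded = base64.b64encode(buf.getvalue()).decode('utf-8')
    return f'data:image/{fmt.lower()};base64,{encoded}'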