evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/__init__.py
CHANGED

```diff
@@ -1 +0,0 @@
-from evalscope.perf.main import run_perf_benchmark
```
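With the package-root re-export removed (and the file list above showing only this one-line deletion), callers that imported the perf entry point from `evalscope.perf` need the full module path. A minimal sketch, assuming `run_perf_benchmark` still lives in `evalscope.perf.main` (that module is changed, not removed, in this release):

```python
# Before (0.10.0): the package root re-exported the entry point.
# from evalscope.perf import run_perf_benchmark

# After (1.2.0): import from the defining module.
from evalscope.perf.main import run_perf_benchmark
```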
evalscope/perf/arguments.py
CHANGED

Several removed lines below were truncated in the source diff and are reproduced as the fragments that survived.

```diff
@@ -3,13 +3,14 @@ import json
 import os
 import sys
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_WORK_DIR
+from evalscope.utils import BaseArgument
 
 
 @dataclass
-class Arguments:
+class Arguments(BaseArgument):
     # Model and API
     model: str  # Model name or path
     model_id: Optional[str] = None  # Model identifier
@@ -21,29 +22,46 @@ class Arguments:
     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
     headers: Dict[str, Any] = field(default_factory=dict)  # Custom headers
-    connect_timeout: int =
-    read_timeout: int =
-    api_key: str =
+    connect_timeout: int = 600  # Connection timeout in seconds
+    read_timeout: int = 600  # Read timeout in seconds
+    api_key: Optional[str] = None
+    no_test_connection: bool = False  # Test the connection before starting the benchmark
 
     # Performance and parallelism
-    number:
-    parallel: int = 1  # Number of parallel requests
+    number: Union[int, List[int]] = 1000  # Number of requests to be made
+    parallel: Union[int, List[int]] = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
+    sleep_interval: int = 5  # Sleep interval between performance runs, in seconds
+
+    # Tuning knobs
+    db_commit_interval: int = 1000  # Number of rows buffered before committing to the DB
+    queue_size_multiplier: int = 5  # Maxsize for queue = parallel * this multiplier
+    in_flight_task_multiplier: int = 2  # Max scheduled tasks = parallel * this multiplier
 
     # Logging and debugging
     log_every_n_query: int = 10  # Log every N queries
     debug: bool = False  # Debug mode
-
+    visualizer: Optional[str] = None  # Visualizer for logging, supports 'swanlab' or 'wandb'
+    wandb_api_key: Optional[str] = None  # Will be deprecated in the future
+    swanlab_api_key: Optional[str] = None  # Will be deprecated in the future
     name: Optional[str] = None  # Name for the run
 
     # Output settings
     outputs_dir: str = DEFAULT_WORK_DIR
 
     # Prompt settings
-    max_prompt_length: int =
+    max_prompt_length: int = 131072  # Maximum length of the prompt
     min_prompt_length: int = 0  # Minimum length of the prompt
+    prefix_length: int = 0  # Length of the prefix, only for random dataset
     prompt: Optional[str] = None  # The prompt text
     query_template: Optional[str] = None  # Template for the query
+    apply_chat_template: Optional[bool] = None  # Whether to apply chat template
+    # random vl settings
+    image_width: int = 224  # Width of the image for random VL dataset
+    image_height: int = 224  # Height of the image for random VL dataset
+    image_format: str = 'RGB'  # Image format for random VL dataset
+    image_num: int = 1  # Number of images for random VL dataset
+    image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation
 
     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -51,68 +69,57 @@ class Arguments:
 
     # Response settings
     frequency_penalty: Optional[float] = None  # Frequency penalty for the response
+    repetition_penalty: Optional[float] = None  # Repetition penalty for the response
     logprobs: Optional[bool] = None  # Whether to log probabilities
     max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
     n_choices: Optional[int] = None  # Number of response choices
-    seed: Optional[int] =
-    stop: Optional[List[str]] =
-    stop_token_ids: Optional[List[str]] =
-    stream: Optional[bool] =
-    temperature:
+    seed: Optional[int] = None  # Random seed for reproducibility
+    stop: Optional[List[str]] = None  # Stop sequences for the response
+    stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
+    stream: Optional[bool] = True  # Whether to stream the response
+    temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
-
-    def from_args(args):
-        return Arguments(
-            model=args.model,
-            attn_implementation=args.attn_implementation,
-            url=args.url,
-            port=args.port,
-            api_key=args.api_key,
-            connect_timeout=args.connect_timeout,
-            read_timeout=args.read_timeout,
-            number=args.number,
-            parallel=args.parallel,
-            rate=args.rate,
-            log_every_n_query=args.log_every_n_query,
-            headers=args.headers,
-            wandb_api_key=args.wandb_api_key,
-            name=args.name,
-            outputs_dir=args.outputs_dir,
-            debug=args.debug,
-            tokenizer_path=args.tokenizer_path,
-            api=args.api,
-            max_prompt_length=args.max_prompt_length,
-            min_prompt_length=args.min_prompt_length,
-            prompt=args.prompt,
-            query_template=args.query_template,
-            dataset=args.dataset,
-            dataset_path=args.dataset_path,
-            frequency_penalty=args.frequency_penalty,
-            logprobs=args.logprobs,
-            max_tokens=args.max_tokens,
-            min_tokens=args.min_tokens,
-            n_choices=args.n_choices,
-            seed=args.seed,
-            stop=args.stop,
-            stop_token_ids=args.stop_token_ids,
-            stream=args.stream,
-            temperature=args.temperature,
-            top_p=args.top_p)
+    top_k: Optional[int] = None  # Top-k sampling setting for the response
+    extra_args: Optional[Dict[str, Any]] = None  # Extra arguments
 
     def __post_init__(self):
+        # Set the default headers
         self.headers = self.headers or {}  # Default to empty dictionary
         if self.api_key:
             # Assuming the API key is used as a Bearer token
             self.headers['Authorization'] = f'Bearer {self.api_key}'
-        self.model_id = os.path.basename(self.model)
 
+        # Set the model ID based on the model name
+        self.model_id = os.path.basename(self.model)
 
+        # Set the URL based on the dataset type
+        if self.api.startswith('local'):
+            if self.dataset.startswith('speed_benchmark'):
+                self.url = f'http://127.0.0.1:{self.port}/v1/completions'
+            else:
+                self.url = f'http://127.0.0.1:{self.port}/v1/chat/completions'
+
+        # Set the apply_chat_template flag based on the URL
+        if self.apply_chat_template is None:
+            self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
+
+        # Set number and parallel to lists if they are integers
+        if isinstance(self.number, int):
+            self.number = [self.number]
+        if isinstance(self.parallel, int):
+            self.parallel = [self.parallel]
+        assert len(self.number) == len(
+            self.parallel
+        ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
+
+        # Validate tuning knobs
+        if self.db_commit_interval <= 0:
+            self.db_commit_interval = 1
+        if self.queue_size_multiplier <= 0:
+            self.queue_size_multiplier = 1
+        if self.in_flight_task_multiplier <= 0:
+            self.in_flight_task_multiplier = 1
 
 
 class ParseKVAction(argparse.Action):
@@ -122,7 +129,13 @@ class ParseKVAction:
             setattr(namespace, self.dest, {})
         else:
             try:
-                kv_dict =
+                kv_dict = {}
+                for kv in values:
+                    parts = kv.split('=', 1)  # only split the first '='
+                    if len(parts) != 2:
+                        raise ValueError(f'Invalid key-value pair: {kv}')
+                    key, value = parts
+                    kv_dict[key.strip()] = value.strip()
                 setattr(namespace, self.dest, kv_dict)
             except ValueError as e:
                 parser.error(f'Error parsing key-value pairs: {e}')
@@ -141,26 +154,45 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
     parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
-    parser.add_argument('--api-key', type=str, required=False, default=
-    parser.add_argument('--connect-timeout', type=int, default=
-    parser.add_argument('--read-timeout', type=int, default=
+    parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
+    parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
+    parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')
+    parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501
 
     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=
-    parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
+    parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
+    parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1')  # noqa: E501
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
+    parser.add_argument(
+        '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501
+
+    # Tuning knobs
+    parser.add_argument('--db-commit-interval', type=int, default=1000, help='Rows buffered before SQLite commit')
+    parser.add_argument('--queue-size-multiplier', type=int, default=5, help='Queue maxsize = parallel * multiplier')
+    parser.add_argument('--in-flight-task-multiplier', type=int, default=2, help='Max scheduled tasks = parallel * multiplier')  # noqa: E501
 
     # Logging and debugging
    parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
+    parser.add_argument('--visualizer', type=str, default=None, help='The visualizer to use, default None')
     parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
-    parser.add_argument('--
+    parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
+    parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')
 
     # Prompt settings
     parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
     parser.add_argument('--min-prompt-length', type=int, default=0, help='Minimum input prompt length')
+    parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
+    parser.add_argument(
+        '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt')  # noqa: E501
+    # random vl settings
+    parser.add_argument('--image-width', type=int, default=224, help='Width of the image for random VL dataset')
+    parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
+    parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
+    parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+    parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
@@ -171,19 +203,21 @@ def add_argument(parser: argparse.ArgumentParser):
 
     # Response settings
     parser.add_argument('--frequency-penalty', type=float, help='The frequency_penalty value', default=None)
+    parser.add_argument('--repetition-penalty', type=float, help='The repetition_penalty value', default=None)
     parser.add_argument('--logprobs', action='store_true', help='The logprobs', default=None)
     parser.add_argument(
         '--max-tokens', type=int, help='The maximum number of tokens that can be generated', default=2048)
     parser.add_argument(
         '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
     parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-    parser.add_argument('--seed', type=int, help='The random seed', default=
+    parser.add_argument('--seed', type=int, help='The random seed', default=None)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-    parser.add_argument('--stream', action=
-    parser.add_argument('--temperature', type=float, help='The sample temperature', default=
+    parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
+    parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
-
+    parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
+    parser.add_argument('--extra-args', type=json.loads, default='{}', help='Extra arguments, should in JSON format',)
     # yapf: enable
```
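Two behavioural changes stand out in the new `Arguments`: `number` and `parallel` may now be lists describing a sweep of benchmark runs (normalized and length-checked in `__post_init__`, with `sleep_interval` seconds between runs), and streaming plus a 0.0 temperature are now the defaults. A usage sketch built only from the fields visible in this diff; the model name is a placeholder and the fields not shown here are assumed to keep their defaults:

```python
from evalscope.perf.arguments import Arguments

# Hypothetical sweep: 100 requests at concurrency 1, 200 at 2, 400 at 4.
# __post_init__ coerces scalar number/parallel to one-element lists and
# asserts len(number) == len(parallel).
args = Arguments(
    model='my-model',  # placeholder name, for illustration
    url='http://127.0.0.1:8877/v1/chat/completions',
    dataset='openqa',
    number=[100, 200, 400],
    parallel=[1, 2, 4],
    sleep_interval=5,  # pause between runs, in seconds
)
assert args.number == [100, 200, 400] and args.parallel == [1, 2, 4]
assert args.apply_chat_template is True  # inferred from the chat/completions URL
```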
evalscope/perf/benchmark.py
CHANGED
|
@@ -1,243 +1,194 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
-
import copy
|
|
3
2
|
import json
|
|
4
3
|
import numpy as np
|
|
5
|
-
import os
|
|
6
4
|
import platform
|
|
7
5
|
import sqlite3
|
|
8
|
-
import threading
|
|
9
|
-
import time
|
|
10
|
-
from http import HTTPStatus
|
|
11
6
|
from tqdm import tqdm
|
|
12
|
-
from typing import List
|
|
13
|
-
|
|
14
|
-
from evalscope.perf.arguments import Arguments
|
|
15
|
-
from evalscope.perf.http_client import AioHttpClient, test_connection
|
|
16
|
-
from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
|
|
17
|
-
from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
|
|
18
|
-
from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
|
|
19
|
-
from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
|
|
20
|
-
from evalscope.perf.utils.local_server import start_app
|
|
7
|
+
from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
|
|
8
|
+
|
|
21
9
|
from evalscope.utils.logger import get_logger
|
|
10
|
+
from .arguments import Arguments
|
|
11
|
+
from .http_client import AioHttpClient, test_connection
|
|
12
|
+
from .plugin import ApiRegistry, DatasetRegistry
|
|
13
|
+
from .utils.benchmark_util import BenchmarkData, BenchmarkMetrics
|
|
14
|
+
from .utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, load_prompt, summary_result
|
|
15
|
+
from .utils.handler import add_signal_handlers, exception_handler
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from .plugin import ApiPluginBase, DatasetPluginBase
|
|
22
19
|
|
|
23
20
|
logger = get_logger()
|
|
24
|
-
|
|
21
|
+
|
|
25
22
|
data_process_completed_event = asyncio.Event()
 
 
 @exception_handler
-async def
-    query_generator_class = ApiRegistry(args.api)
-    query_generator = query_generator_class(args.tokenizer_path)
-
-    def load_prompt(prompt_path_or_text):
-        """Load the prompt from a file or directly from the input text."""
-        if prompt_path_or_text.startswith('@'):
-            with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
-                return file.read()
-        return prompt_path_or_text
-
-    async def dispatch_request(request):
-        """Dispatch a single request with optional rate limiting."""
-        await request_queue.put(request)
-        if args.rate != -1:
-            interval = np.random.exponential(1.0 / args.rate)
-            await asyncio.sleep(interval)
+async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGenerator[dict, None]:
 
-    async def
-
-
-
-        await dispatch_request(request)
-        return 1
+    async def generate_requests_from_prompt():
+        prompt = load_prompt(args.prompt)
+        messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
+        request = api_plugin.build_request(messages)
         for _ in range(args.number):
-
-        return args.number
+            yield request
 
-    async def
-
-        total_query_count = 0
-        message_generator_class = DatasetRegistry(args.dataset)
+    async def generate_requests_from_dataset():
+        message_generator_class = DatasetRegistry.get_class(args.dataset)
         message_generator = message_generator_class(args)
 
-
-
-
-
-
-
-
-
+        dataset_messages = []
+        try:
+            for messages in message_generator.build_messages():
+                dataset_messages.append(messages)
+                if len(dataset_messages) >= args.number:
+                    break
+        except StopIteration:
+            pass
 
-
+        if not dataset_messages:
+            raise Exception('Dataset is empty!')
+
+        count = 0
+        dataset_index = 0
+
+        while count < args.number:
+            messages = dataset_messages[dataset_index]
+            request = api_plugin.build_request(messages)
+            if request is not None:
+                yield request
+                count += 1
+
+            dataset_index = (dataset_index + 1) % len(dataset_messages)
 
-    # Load prompt or dataset and dispatch requests accordingly
     if args.prompt:
-
-        messages = [{'role': 'user', 'content': prompt}]
-        total_queries = await dispatch_requests_from_prompt(messages)
+        generator = generate_requests_from_prompt()
     elif args.dataset:
-
+        generator = generate_requests_from_dataset()
     else:
-        raise
+        raise ValueError('Either prompt or dataset is required!')
 
-
+    async for request in generator:
+        yield request
+        if args.rate != -1:
+            interval = np.random.exponential(1.0 / args.rate)
+            await asyncio.sleep(interval)
 
 
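The new `get_requests` generator unifies prompt-driven and dataset-driven request production and keeps the old pacing rule: when `args.rate` is set, it sleeps an exponentially distributed interval after each yield, which makes arrivals approximate a Poisson process at `rate` requests per second. A standalone sketch of that pacing idea (the `fire` callback and the rate value are illustrative):

```python
import asyncio
import numpy as np

async def paced(rate: float, n: int, fire) -> None:
    """Emit n events with Exponential(1/rate) gaps, i.e. Poisson arrivals."""
    for i in range(n):
        fire(i)
        # np.random.exponential(scale) has mean `scale`; here 1/rate seconds.
        await asyncio.sleep(np.random.exponential(1.0 / rate))

# Roughly 5 requests per second on average:
asyncio.run(paced(5.0, 10, lambda i: print('request', i)))
```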
 @exception_handler
-async def
-
-
+async def send_request(
+    semaphore: asyncio.Semaphore,
+    request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
+    client: AioHttpClient,  # reuse shared client
 ):
-
-
-
-
-            # Attempt to get a request from the queue with a timeout
-            request = await asyncio.wait_for(request_queue.get(), timeout=0.0001)
-            request_queue.task_done()
-        except asyncio.TimeoutError:
-            # If timeout, continue to the next iteration
-            continue
-
-        # Initialize benchmark data for the current request
-        benchmark_data = BenchmarkData(request=request)
-        collected_messages = []
-        try:
-            # Send the request and process the response
-            async for is_error, state_code, response_data in client.post(request):
-                if is_error or state_code != HTTPStatus.OK:
-                    logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
-                    benchmark_data.success = False
-                    break
-                if response_data:
-                    collected_messages.append(response_data)
-                    benchmark_data.chunk_times.append(time.perf_counter())
-            benchmark_data.success = True
-            benchmark_data.update_gpu_usage()
-        except Exception as e:
-            if response_data:
-                collected_messages.append(response_data)
-            benchmark_data.success = False
-            logger.exception(e)
-            logger.error(f'Request query: {request} exception')
-        finally:
-            # Record completion time and collected messages
-            benchmark_data.completed_time = time.perf_counter()
-            benchmark_data.response_messages = collected_messages
-            await benchmark_data_queue.put(benchmark_data)
+    async with semaphore:
+        benchmark_data = await client.post(request)
+        benchmark_data.update_gpu_usage()
+        await benchmark_data_queue.put(benchmark_data)
 
 
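`send_request` shrinks to a semaphore-guarded call on a shared `AioHttpClient`: concurrency is now capped by `asyncio.Semaphore(args.parallel)` instead of a fixed pool of workers polling a request queue, and per-request bookkeeping moves into the client. A minimal sketch of the semaphore-plus-result-queue shape, with a stub standing in for the real `client.post`:

```python
import asyncio

async def send_one(sem: asyncio.Semaphore, request: str, results: asyncio.Queue) -> None:
    async with sem:                    # at most `parallel` requests in flight
        await asyncio.sleep(0.01)      # stand-in for the real HTTP round trip
        await results.put(f'echo:{request}')

async def main() -> None:
    sem = asyncio.Semaphore(4)         # parallel = 4
    results: asyncio.Queue = asyncio.Queue()
    await asyncio.gather(*(send_one(sem, f'req-{i}', results) for i in range(10)))
    while not results.empty():
        print(await results.get())

asyncio.run(main())
```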
 @exception_handler
-async def
+async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
     metrics = BenchmarkMetrics(concurrency=args.parallel)
-
-    api_plugin_class = ApiRegistry(args.api)
-    api_plugin = api_plugin_class(args.tokenizer_path)
-
     result_db_path = get_result_db_path(args)
-
-
-
-
-        os.environ['WANDB_SILENT'] = 'true'
-        os.environ['WANDB_DIR'] = args.outputs_dir
-
-        wandb.login(key=args.wandb_api_key)
-        current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-        name = args.name if args.name else f'{args.model_id}_{current_time}'
-        wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+    # Stream inserts to DB to avoid accumulating all results in memory
+    commit_every = args.db_commit_interval
+    processed_since_commit = 0
 
     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
         create_result_table(cursor)
-
+
+        with tqdm(desc='Processing', total=args.number) as pbar:
             while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
                 try:
-
-                    benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-                    benchmark_data_queue.task_done()
+                    benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.1)
                 except asyncio.TimeoutError:
-                    # If timeout, continue to the next iteration
                     continue
 
-                # Update metrics
+                # Update metrics and write to DB immediately
                 metrics.update_metrics(benchmark_data, api_plugin)
-
-                # Insert benchmark data into the database and commit the transaction
                 insert_benchmark_data(cursor, benchmark_data)
-
+                processed_since_commit += 1
+                if processed_since_commit >= commit_every:
+                    con.commit()
+                    processed_since_commit = 0
 
-                # Create a message with the updated metrics
                 message = metrics.create_message()
 
-                # Log the message to wandb if the api key is provided
                 if args.wandb_api_key:
+                    import wandb
                     wandb.log(message)
+                if args.swanlab_api_key:
+                    import swanlab
+                    swanlab.log(message)
 
-                # Log the message to the logger every n queries
                 if int(metrics.n_total_queries) % args.log_every_n_query == 0:
                     msg = json.dumps(message, ensure_ascii=False, indent=2)
                     logger.info(msg)
 
-
+                benchmark_data_queue.task_done()
+                pbar.update(1)
+
+        con.commit()
 
     return metrics, result_db_path
 
 
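The consumer now batches its SQLite commits: rows are inserted as they arrive, but `con.commit()` runs only every `args.db_commit_interval` inserts, plus once after the loop, trading a bounded replay window for far fewer fsyncs. A self-contained sketch of the batching pattern (table name and interval are illustrative):

```python
import sqlite3

def write_rows(rows, db_path: str = ':memory:', commit_every: int = 100) -> None:
    con = sqlite3.connect(db_path)
    cur = con.cursor()
    cur.execute('CREATE TABLE IF NOT EXISTS result (payload TEXT)')
    pending = 0
    for row in rows:
        cur.execute('INSERT INTO result (payload) VALUES (?)', (row,))
        pending += 1
        if pending >= commit_every:   # amortize commit cost over a batch
            con.commit()
            pending = 0
    con.commit()                      # flush the final partial batch
    con.close()

write_rows(f'row-{i}' for i in range(250))
```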
 @exception_handler
-async def
-    if args.
-        # start local server
-        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
-        server.start()
-
-        if args.dataset.startswith('speed_benchmark'):
-            args.url = f'http://127.0.0.1:{args.port}/v1/completions'
-        else:
-            args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
-
-    if not await test_connection(args):
+async def connect_test(args: Arguments, api_plugin) -> bool:
+    if (not args.no_test_connection) and (not await test_connection(args, api_plugin)):
         raise TimeoutError('Test connection failed')
 
 
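`connect_test` is now a thin preflight gate: unless `no_test_connection` is set, it calls `test_connection` and raises so that a misconfigured endpoint fails fast, before any load is generated (as written it raises rather than returning, so the `-> bool` annotation appears vestigial). For illustration only, and not the package's `test_connection`, a generic aiohttp reachability probe could look like:

```python
import asyncio
import aiohttp

async def probe(url: str, timeout_s: float = 10.0) -> bool:
    """Best-effort reachability check; True if the server answers at all."""
    try:
        timeout = aiohttp.ClientTimeout(total=timeout_s)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as resp:
                return resp.status < 500
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return False

# Example (hypothetical local endpoint):
# asyncio.run(probe('http://127.0.0.1:8000/v1/models'))
```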
 @exception_handler
-async def benchmark(args: Arguments) ->
+async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     if platform.system() != 'Windows':
         loop = asyncio.get_running_loop()
         add_signal_handlers(loop)
 
-
-
+    api_plugin_class = ApiRegistry.get_class(args.api)
+    api_plugin = api_plugin_class(args)
 
-
-
-    for idx in range(args.parallel):
-        task = asyncio.create_task(send_requests_worker(idx, request_queue, benchmark_data_queue, args))
-        tasks.append(task)
-    return tasks
+    benchmark_data_queue: asyncio.Queue = asyncio.Queue(maxsize=max(1, args.parallel * args.queue_size_multiplier))
+    data_process_completed_event.clear()
 
-
-
+    # test connection
+    await connect_test(args, api_plugin)
 
-
+    # Create a single shared client session for all requests
+    client = AioHttpClient(args, api_plugin)
+    async with client:
+        # start statistic benchmark metric (consumer)
         statistic_benchmark_metric_task = asyncio.create_task(
-
-
+            statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+        )
+
+        # start sending requests with bounded in-flight tasks
+        semaphore = asyncio.Semaphore(args.parallel)
+        in_flight: set[asyncio.Task] = set()
+        max_in_flight = args.parallel * args.in_flight_task_multiplier
 
-
-
-
+        async for request in get_requests(args, api_plugin):
+            # Keep the number of scheduled tasks bounded to avoid OOM
+            if len(in_flight) >= max_in_flight:
+                done, pending = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
+                in_flight = pending
 
-
+            task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, client))
+            in_flight.add(task)
+
+        # Wait for remaining in-flight tasks
+        if in_flight:
+            await asyncio.gather(*in_flight, return_exceptions=True)
+
+        # Drain queue and finish
         await benchmark_data_queue.join()
         data_process_completed_event.set()
 
         metrics, result_db_path = await statistic_benchmark_metric_task
-    summary_result(args, metrics, expected_number_of_queries, result_db_path)
-
-    await asyncio.sleep(0.250)
 
-
+        metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+        return metrics_result, percentile_result
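The rewritten `benchmark` bounds two things independently: in-flight requests via the semaphore, and scheduled task objects via `max_in_flight`. Once the set fills, it blocks on `asyncio.wait(..., return_when=FIRST_COMPLETED)` before creating more tasks, so a very large `--number` never materializes as millions of pending tasks at once. A condensed sketch of that producer loop and the final drain (all names illustrative):

```python
import asyncio

async def run_bounded(requests, worker, max_in_flight: int = 8) -> None:
    in_flight: set = set()
    for req in requests:
        if len(in_flight) >= max_in_flight:
            # Block until at least one task finishes before scheduling more.
            _, in_flight = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
        in_flight.add(asyncio.create_task(worker(req)))
    if in_flight:
        # Drain stragglers; exceptions come back as results, not raises.
        await asyncio.gather(*in_flight, return_exceptions=True)

async def demo_worker(req: int) -> None:
    await asyncio.sleep(0.01)
    print('done', req)

asyncio.run(run_bounded(range(20), demo_worker))
```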
|