evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/utils/db_util.py
CHANGED
@@ -2,11 +2,12 @@ import base64
 import json
 import os
 import pickle
+import re
 import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
-from typing import Dict, List
+from typing import Dict, List, Tuple

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -15,6 +16,28 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()


+class DatabaseColumns:
+    REQUEST = 'request'
+    START_TIME = 'start_time'
+    INTER_TOKEN_LATENCIES = 'inter_token_latencies'
+    SUCCESS = 'success'
+    RESPONSE_MESSAGES = 'response_messages'
+    COMPLETED_TIME = 'completed_time'
+    LATENCY = 'latency'
+    FIRST_CHUNK_LATENCY = 'first_chunk_latency'
+    PROMPT_TOKENS = 'prompt_tokens'
+    COMPLETION_TOKENS = 'completion_tokens'
+    MAX_GPU_MEMORY_COST = 'max_gpu_memory_cost'
+    TIME_PER_OUTPUT_TOKEN = 'time_per_output_token'
+
+
+def load_prompt(prompt_path_or_text):
+    if prompt_path_or_text.startswith('@'):
+        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
+            return file.read()
+    return prompt_path_or_text
+
+
 def encode_data(data) -> str:
     """Encodes data using base64 and pickle."""
     return base64.b64encode(pickle.dumps(data)).decode('utf-8')
@@ -33,32 +56,34 @@ def transpose_results(data):


 def create_result_table(cursor):
-    cursor.execute(
-
-
-
-
-
-
-
-
-
-
-
-
+    cursor.execute(
+        f'''CREATE TABLE IF NOT EXISTS result(
+            {DatabaseColumns.REQUEST} TEXT,
+            {DatabaseColumns.START_TIME} REAL,
+            {DatabaseColumns.INTER_TOKEN_LATENCIES} TEXT,
+            {DatabaseColumns.SUCCESS} INTEGER,
+            {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
+            {DatabaseColumns.COMPLETED_TIME} REAL,
+            {DatabaseColumns.LATENCY} REAL,
+            {DatabaseColumns.FIRST_CHUNK_LATENCY} REAL,
+            {DatabaseColumns.PROMPT_TOKENS} INTEGER,
+            {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
+            {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
+            {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
+        )'''
+    )


 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
-    request =
-
+    request = benchmark_data.request
+    inter_token_latencies = json.dumps(benchmark_data.inter_chunk_latency)
     response_messages = encode_data(benchmark_data.response_messages)

     # Columns common to both success and failure cases
     common_columns = (
         request,
         benchmark_data.start_time,
-
+        inter_token_latencies,
         benchmark_data.success,
         response_messages,
         benchmark_data.completed_time,
@@ -67,23 +92,21 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
     if benchmark_data.success:
         # Add additional columns for success case
         additional_columns = (
-            benchmark_data.query_latency,
-            benchmark_data.
-            benchmark_data.n_chunks,
-            benchmark_data.n_chunks_time,
-            benchmark_data.prompt_tokens,
-            benchmark_data.completion_tokens,
-            benchmark_data.max_gpu_memory_cost,
+            benchmark_data.query_latency, benchmark_data.first_chunk_latency, benchmark_data.prompt_tokens,
+            benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
         )
-        query = """INSERT INTO result(
-
-
-
-
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
+            {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
+            {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
+            {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns + additional_columns)
     else:
-        query = """INSERT INTO result(
-
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
         ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)

@@ -91,6 +114,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
     output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    # Filter illegal characters
+    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
     if not os.path.exists(output_path):
         os.makedirs(output_path, exist_ok=True)
     logger.info(f'Save the result to: {output_path}')
@@ -102,12 +127,24 @@ def get_result_db_path(args: Arguments):

     logger.info(f'Save the data base to: {result_db_path}')
     if os.path.exists(result_db_path):
-        logger.
+        logger.error(f'The db file {result_db_path} exists, delete it and start again!.')
         sys.exit(1)

     return result_db_path


+class PercentileMetrics:
+    TTFT = 'TTFT (s)'
+    ITL = 'ITL (s)'
+    TPOT = 'TPOT (s)'
+    LATENCY = 'Latency (s)'
+    INPUT_TOKENS = 'Input tokens'
+    OUTPUT_TOKENS = 'Output tokens'
+    OUTPUT_THROUGHPUT = 'Output (tok/s)'
+    TOTAL_THROUGHPUT = 'Total (tok/s)'
+    PERCENTILES = 'Percentiles'
+
+
 def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
     """
     Calculate the percentiles for a specific list of data.
@@ -136,54 +173,51 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     :param result_db_path: Path to the SQLite database file.
     :return: Dictionary of percentiles for various metrics.
     """
-
-
-
-
-
-    except (json.JSONDecodeError, TypeError) as e:
-        logger.error(f'Error parsing chunk times: {e}')
-        return []
-
-    query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
-                 'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1')
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES}, {DatabaseColumns.SUCCESS},
+        {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
+        {DatabaseColumns.PROMPT_TOKENS},
+        {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''  # noqa: E501

     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

     with sqlite3.connect(result_db_path) as con:
-
-
-
-
-            return {}
+        cursor = con.cursor()
+        cursor.execute(query_sql)
+        columns = [description[0] for description in cursor.description]
+        rows = cursor.fetchall()

-        #
-
-        LATENCY_INDEX = 4
-        FIRST_CHUNK_LATENCY_INDEX = 5
-        PROMPT_TOKENS_INDEX = 8
-        COMPLETION_TOKENS_INDEX = 9
+        # Create column index mapping
+        col_indices = {col: idx for idx, col in enumerate(columns)}

         # Prepare data for each metric
         inter_token_latencies_all = []
         for row in rows:
-
+            try:
+                itl = json.loads(row[col_indices[DatabaseColumns.INTER_TOKEN_LATENCIES]]) or []
+                inter_token_latencies_all.extend(itl)
+            except (json.JSONDecodeError, TypeError) as e:
+                logger.error(f'Error parsing inter token latencies: {e}')

         metrics = {
-
-
+            PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
+            PercentileMetrics.ITL:
             inter_token_latencies_all,
-
-
-
-
-
-
+            PercentileMetrics.TPOT: [row[col_indices[DatabaseColumns.TIME_PER_OUTPUT_TOKEN]] for row in rows],
+            PercentileMetrics.LATENCY: [row[col_indices[DatabaseColumns.LATENCY]] for row in rows],
+            PercentileMetrics.INPUT_TOKENS: [row[col_indices[DatabaseColumns.PROMPT_TOKENS]] for row in rows],
+            PercentileMetrics.OUTPUT_TOKENS: [row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] for row in rows],
+            PercentileMetrics.OUTPUT_THROUGHPUT:
+            [(row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] / row[col_indices[DatabaseColumns.LATENCY]])
+             if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows],
+            PercentileMetrics.TOTAL_THROUGHPUT:
+            [((row[col_indices[DatabaseColumns.PROMPT_TOKENS]] + row[col_indices[DatabaseColumns.COMPLETION_TOKENS]])
+              / row[col_indices[DatabaseColumns.LATENCY]])
+             if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows]
         }

         # Calculate percentiles for each metric
-        results = {
+        results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
         for metric_name, data in metrics.items():
             metric_percentiles = calculate_percentiles(data, percentiles)
             results[metric_name] = [metric_percentiles[p] for p in percentiles]
@@ -191,16 +225,15 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
         return results


-def summary_result(args: Arguments, metrics: BenchmarkMetrics,
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

-
-
-    write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
+    metrics_result = metrics.create_message()
+    write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

     # Print summary in a table
-    table = tabulate(list(
+    table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
     logger.info('\nBenchmarking summary:\n' + table)

     # Get percentile results
@@ -214,20 +247,24 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_o
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)

+    logger.info(f'Save the summary to: {result_path}')
+
+    return metrics_result, percentile_result
+

 def speed_benchmark_result(result_db_path: str):
-    query_sql = """
+    query_sql = f"""
         SELECT
-
-            ROUND(AVG(
-            ROUND(AVG(
+            {DatabaseColumns.PROMPT_TOKENS},
+            ROUND(AVG({DatabaseColumns.COMPLETION_TOKENS} / {DatabaseColumns.LATENCY}), 2) AS avg_completion_token_per_second,
+            ROUND(AVG({DatabaseColumns.MAX_GPU_MEMORY_COST}), 2)
         FROM
             result
         WHERE
-
+            {DatabaseColumns.SUCCESS} = 1 AND {DatabaseColumns.LATENCY} > 0
         GROUP BY
-
-    """
+            {DatabaseColumns.PROMPT_TOKENS}
+    """  # noqa: E501

     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
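
Note: the body of calculate_percentiles is unchanged by this diff and not shown above. The sketch below is only an illustration, assuming a numpy-based implementation, of how the per-metric lists built in get_percentile_results map onto the percentile rows ('10%' … '99%') of the summary table.

```python
# Illustrative sketch only -- calculate_percentiles' real body is not part of this diff.
from typing import Dict, List

import numpy as np


def calculate_percentiles_sketch(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    """Map each requested percentile to its value over `data`; None/NaN entries are ignored."""
    clean = [x for x in data if x is not None and not np.isnan(x)]
    if not clean:
        return {p: float('nan') for p in percentiles}
    values = np.percentile(clean, percentiles)
    return {p: float(v) for p, v in zip(percentiles, values)}


# Example: per-request latencies -> the 'Latency (s)' row of the percentile table
latencies = [0.8, 1.1, 1.3, 2.0, 2.4, 3.9]
print(calculate_percentiles_sketch(latencies, [50, 90, 99]))
# {50: 1.65, 90: 3.15, 99: 3.825}
```
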
evalscope/perf/utils/local_server.py
CHANGED

@@ -1,68 +1,28 @@
 import os
 import subprocess
-import torch
 import uvicorn
 from contextlib import asynccontextmanager
-from dataclasses import dataclass
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse

 from evalscope.perf.arguments import Arguments
 from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

 logger = get_logger()


-@dataclass
-class ServerSentEvent(object):
-
-    def __init__(self, data='', event=None, id=None, retry=None):
-        self.data = data
-        self.event = event
-        self.id = id
-        self.retry = retry
-
-    @classmethod
-    def decode(cls, line):
-        """Decode line to ServerSentEvent
-
-
-        Args:
-            line (str): The line.
-
-        Return:
-            ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
-
-        """
-        if not line:
-            return None
-        sse_msg = cls()
-        # format data:xxx
-        field_type, _, field_value = line.partition(':')
-        if field_value.startswith(' '):  # compatible with openai api
-            field_value = field_value[1:]
-        if field_type == 'event':
-            sse_msg.event = field_value
-        elif field_type == 'data':
-            field_value = field_value.rstrip()
-            sse_msg.data = field_value
-        elif field_type == 'id':
-            sse_msg.id = field_value
-        elif field_type == 'retry':
-            sse_msg.retry = field_value
-        else:
-            pass
-
-        return sse_msg
-
-
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     yield
-
-    torch
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except ImportError:
+        pass


 def create_app(model, attn_implementation=None) -> FastAPI:
@@ -96,11 +56,16 @@ def create_app(model, attn_implementation=None) -> FastAPI:


 def start_app(args: Arguments):
+    logger.info('Starting local server, please wait...')
     if args.api == 'local':
+        check_import('torch', 'torch', raise_error=True)
+
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

     elif args.api == 'local_vllm':
+        import torch
+
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
         os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
         os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
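
start_app() now guards the 'local' backend with check_import('torch', 'torch', raise_error=True) from evalscope.utils.import_utils, whose implementation is not included in this excerpt. A minimal sketch of a helper with that shape, assuming it only verifies importability and points at the pip package to install:

```python
# Hypothetical stand-in for evalscope.utils.import_utils.check_import; the real
# implementation lives in import_utils.py (+154 lines in this release) and may differ.
import importlib.util


def check_import_sketch(module_name: str, package_name: str, raise_error: bool = False) -> bool:
    """Return True if `module_name` can be imported; otherwise warn or raise with an install hint."""
    if importlib.util.find_spec(module_name) is not None:
        return True
    message = f"'{module_name}' is required for this feature. Install it with: pip install {package_name}"
    if raise_error:
        raise ImportError(message)
    print(message)
    return False


check_import_sketch('torch', 'torch', raise_error=True)  # raises ImportError when torch is missing
```
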
evalscope/perf/utils/log_utils.py
ADDED

@@ -0,0 +1,63 @@
+import os
+
+from evalscope.perf.arguments import Arguments
+
+
+def init_wandb(args: Arguments) -> None:
+    """
+    Initialize WandB for logging.
+    """
+    # Initialize wandb if the api key is provided
+    import datetime
+    try:
+        import wandb
+    except ImportError:
+        raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
+    os.environ['WANDB_SILENT'] = 'true'
+    os.environ['WANDB_DIR'] = args.outputs_dir
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+
+    # Remove sensitive information from logging config
+    logging_config = args.to_dict()
+    logging_config.pop('api_key', None)
+    logging_config.pop('wandb_api_key', None)
+
+    if args.wandb_api_key is not None:
+        wandb.login(key=args.wandb_api_key)
+    wandb.init(project='perf_benchmark', name=name, config=logging_config)
+
+
+def init_swanlab(args: Arguments) -> None:
+    """
+    Initialize SwanLab for logging.
+    """
+    import datetime
+    try:
+        import swanlab
+    except ImportError:
+        raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
+    os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+    swanlab.config.update({'framework': '📏evalscope'})
+
+    # Remove sensitive information from logging config
+    logging_config = args.to_dict()
+    logging_config.pop('api_key', None)
+    logging_config.pop('swanlab_api_key', None)
+
+    init_kwargs = {
+        'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
+        'name': name,
+        'config': logging_config,
+        'mode': 'local' if args.swanlab_api_key == 'local' else None
+    }
+
+    workspace = os.getenv('SWANLAB_WORKSPACE')
+    if workspace:
+        init_kwargs['workspace'] = workspace
+
+    if isinstance(args.swanlab_api_key, str) and not args.swanlab_api_key == 'local':
+        swanlab.login(api_key=args.swanlab_api_key)
+    swanlab.init(**init_kwargs)
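
Both helpers in the new log_utils.py follow the same pattern: build the run name from args.name (or the model id plus a timestamp), copy the argument dict, and drop credential fields before handing the config to the tracker. A minimal, standalone sketch of that scrubbing step (the field names follow the diff; the sample config values are made up):

```python
# Standalone illustration of the config-scrubbing pattern used by init_wandb/init_swanlab.
def scrub_config(config: dict, secret_keys=('api_key', 'wandb_api_key', 'swanlab_api_key')) -> dict:
    cleaned = dict(config)  # shallow copy so the original arguments dict is untouched
    for key in secret_keys:
        cleaned.pop(key, None)
    return cleaned


run_config = {'model': 'my-model', 'parallel': 8, 'api_key': 'sk-***'}
print(scrub_config(run_config))  # {'model': 'my-model', 'parallel': 8}
```
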
@@ -0,0 +1,192 @@
+# the following code is largely adapted from https://github.com/lework/llm-benchmark
+
+import numpy as np
+from rich.console import Console
+from rich.panel import Panel
+from rich.style import Style
+from rich.table import Table
+from rich.text import Text
+
+from evalscope.utils.logger import get_logger
+from .benchmark_util import Metrics
+from .db_util import PercentileMetrics
+
+logger = get_logger()
+
+
+def analyze_results(all_results):
+    """Analyze all test results and generate a summary report"""
+    summary = []
+    total_tokens = 0
+    total_time = 0
+
+    for result in all_results:
+        total_metrics = result[0]
+        percentile_metrics = result[1]
+        percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
+        try:
+            concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
+            rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
+            avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
+            p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
+            avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
+            avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
+            p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
+            success_rate = (
+                total_metrics.get(Metrics.SUCCEED_REQUESTS, 0) / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)
+            ) * 100
+            avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
+            p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]
+
+            # Ensure all values are valid numbers
+            if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
+                logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
+                continue
+
+            summary.append([
+                concurrency,
+                f'{rps:.2f}' if rps is not None else 'N/A',
+                f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
+                f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
+                f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
+                f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
+                f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
+                f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
+                f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
+                f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
+            ])
+
+            total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST,
+                                              0) * total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+            total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
+        except Exception as e:
+            logger.warning(
+                f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}"
+            )
+            continue
+
+    if not summary:
+        logger.warning('Error: No valid test result data')
+        return [], 0, 0
+
+    return summary, total_tokens, total_time
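For orientation, a minimal sketch of the input shape `analyze_results` expects: each element of `all_results` pairs a totals dict keyed by the `Metrics` constants with a percentile dict keyed by the `PercentileMetrics` constants. The numbers and the percentile labels other than '99%' are made up for illustration.

total_metrics = {
    Metrics.NUMBER_OF_CONCURRENCY: 8,
    Metrics.REQUEST_THROUGHPUT: 3.2,
    Metrics.AVERAGE_LATENCY: 2.5,
    Metrics.OUTPUT_TOKEN_THROUGHPUT: 410.0,
    Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.21,
    Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: 0.006,
    Metrics.SUCCEED_REQUESTS: 100,
    Metrics.TOTAL_REQUESTS: 100,
    Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: 128,
    Metrics.TIME_TAKEN_FOR_TESTS: 31.3,
}
percentile_metrics = {
    PercentileMetrics.PERCENTILES: ['50%', '90%', '99%'],  # assumed labels; only '99%' is required above
    PercentileMetrics.LATENCY: [2.3, 3.1, 4.0],
    PercentileMetrics.TTFT: [0.18, 0.30, 0.45],
    PercentileMetrics.TPOT: [0.005, 0.007, 0.009],
}
summary, total_tokens, total_time = analyze_results([(total_metrics, percentile_metrics)])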
+
+
+def print_summary(all_results, model_name):
+    """Print test results summary"""
+    summary, total_tokens, total_time = analyze_results(all_results)
+
+    if not summary:
+        logger.warning('No available test result data to display')
+        return
+
+    console = Console(width=100)  # Set fixed width
+
+    # Create title panel
+    title = Text('Performance Test Summary Report', style='bold')
+    console.print(Panel(title, width=60))
+
+    # Print basic information
+    basic_info = Table(show_header=False, width=60)
+    basic_info.add_column('Name', style='cyan', width=25)
+    basic_info.add_column('Value', style='green', width=35)
+
+    basic_info.add_row('Model', model_name)
+    basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
+    basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
+    basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')
+
+    console.print('\nBasic Information:')
+    console.print(basic_info)
+
+    # Create detailed performance metrics table
+    table = Table(
+        title='Detailed Performance Metrics',
+        show_header=True,
+        header_style='bold cyan',
+        border_style='blue',
+        width=100,  # Set total table width
+        pad_edge=False,  # Reduce edge padding
+        min_width=60,  # Minimum width
+    )
+
+    # Add columns (set fixed column widths)
+    table.add_column('Conc.', justify='right', style='cyan')
+    table.add_column('RPS', justify='right')
+    table.add_column('Avg Lat.(s)', justify='right')
+    table.add_column('P99 Lat.(s)', justify='right')
+    table.add_column('Gen. toks/s', justify='right')
+    table.add_column('Avg TTFT(s)', justify='right')
+    table.add_column('P99 TTFT(s)', justify='right')
+    table.add_column('Avg TPOT(s)', justify='right')
+    table.add_column('P99 TPOT(s)', justify='right')
+    table.add_column('Success Rate', justify='right', style='green')
+
+    # Add data rows
+    for row in summary:
+        try:
+            # Set row style based on success rate
+            success_rate = float(row[6].rstrip('%'))
+            row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
+
+            table.add_row(
+                str(row[0]),  # Concurrency
+                f'{float(row[1]):.2f}',  # RPS
+                f'{float(row[2]):.3f}',  # Average Latency
+                f'{float(row[3]):.3f}',  # P99 Latency
+                f'{float(row[4]):.2f}',  # Average TPS
+                f'{float(row[5]):.3f}',  # First Token Latency
+                f'{float(row[7]):.3f}',  # P99 TTFT
+                f'{float(row[8]):.3f}',  # Average TPOT
+                f'{float(row[9]):.3f}',  # P99 TPOT
+                row[6],  # Success Rate
+                style=row_style
+            )
+        except ValueError as e:
+            console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
+            continue
+
+    console.print('\n')
+    console.print(table)
+
+    # Calculate and display best performance configuration
+    try:
+        best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
+        best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])
+
+        perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
+        perf_info.add_column('Metric', style='cyan', width=20)
+        perf_info.add_column('Value', style='green', width=40)
+
+        perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
+        perf_info.add_row(
+            'Lowest Latency', f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)'
+        )
+
+        console.print('\n')
+        console.print(perf_info)
+
+        # Performance recommendations
+        recommendations = []
+        if best_rps_idx == len(summary) - 1:
+            recommendations.append(
+                'The system seems not to have reached its performance bottleneck, try higher concurrency'
+            )
+        elif best_rps_idx == 0:
+            recommendations.append('Consider lowering concurrency, current load may be too high')
+        else:
+            recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')
+
+        success_rate = float(summary[-1][6][:-1])
+        if success_rate < 95:
+            recommendations.append(
+                'Success rate is low at high concurrency, check system resources or reduce concurrency'
+            )
+
+        recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
+        console.print(recommend_text)
+        for rec in recommendations:
+            console.print(f'• {rec}', style='yellow')
+
+    except Exception as e:
+        console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
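Tying it together, a run of the report might look like the sketch below, reusing the hand-built dicts from the earlier sketch; in practice `all_results` comes from the perf benchmark run itself and `model_name` from the benchmark arguments.

all_results = [(total_metrics, percentile_metrics)]  # one (totals, percentiles) pair per concurrency level
print_summary(all_results, model_name='qwen2.5-7b-instruct')  # renders the rich panels and tables defined above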