evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/utils/db_util.py
CHANGED
@@ -2,11 +2,12 @@ import base64
 import json
 import os
 import pickle
+import re
 import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
-from typing import Dict, List
+from typing import Dict, List, Tuple

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -15,6 +16,28 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()


+class DatabaseColumns:
+    REQUEST = 'request'
+    START_TIME = 'start_time'
+    INTER_TOKEN_LATENCIES = 'inter_token_latencies'
+    SUCCESS = 'success'
+    RESPONSE_MESSAGES = 'response_messages'
+    COMPLETED_TIME = 'completed_time'
+    LATENCY = 'latency'
+    FIRST_CHUNK_LATENCY = 'first_chunk_latency'
+    PROMPT_TOKENS = 'prompt_tokens'
+    COMPLETION_TOKENS = 'completion_tokens'
+    MAX_GPU_MEMORY_COST = 'max_gpu_memory_cost'
+    TIME_PER_OUTPUT_TOKEN = 'time_per_output_token'
+
+
+def load_prompt(prompt_path_or_text):
+    if prompt_path_or_text.startswith('@'):
+        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
+            return file.read()
+    return prompt_path_or_text
+
+
 def encode_data(data) -> str:
     """Encodes data using base64 and pickle."""
     return base64.b64encode(pickle.dumps(data)).decode('utf-8')
@@ -33,32 +56,34 @@ def transpose_results(data):


 def create_result_table(cursor):
-    cursor.execute(
-
-
-
-
-
-
-
-
-
-
-
-
+    cursor.execute(
+        f'''CREATE TABLE IF NOT EXISTS result(
+            {DatabaseColumns.REQUEST} TEXT,
+            {DatabaseColumns.START_TIME} REAL,
+            {DatabaseColumns.INTER_TOKEN_LATENCIES} TEXT,
+            {DatabaseColumns.SUCCESS} INTEGER,
+            {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
+            {DatabaseColumns.COMPLETED_TIME} REAL,
+            {DatabaseColumns.LATENCY} REAL,
+            {DatabaseColumns.FIRST_CHUNK_LATENCY} REAL,
+            {DatabaseColumns.PROMPT_TOKENS} INTEGER,
+            {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
+            {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
+            {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
+        )'''
+    )


 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
-    request =
-
+    request = benchmark_data.request
+    inter_token_latencies = json.dumps(benchmark_data.inter_chunk_latency)
     response_messages = encode_data(benchmark_data.response_messages)

     # Columns common to both success and failure cases
     common_columns = (
         request,
         benchmark_data.start_time,
-
+        inter_token_latencies,
         benchmark_data.success,
         response_messages,
         benchmark_data.completed_time,
@@ -67,23 +92,21 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
     if benchmark_data.success:
         # Add additional columns for success case
         additional_columns = (
-            benchmark_data.query_latency,
-            benchmark_data.
-            benchmark_data.n_chunks,
-            benchmark_data.n_chunks_time,
-            benchmark_data.prompt_tokens,
-            benchmark_data.completion_tokens,
-            benchmark_data.max_gpu_memory_cost,
+            benchmark_data.query_latency, benchmark_data.first_chunk_latency, benchmark_data.prompt_tokens,
+            benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
         )
-        query = """INSERT INTO result(
-
-
-
-
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
+            {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
+            {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
+            {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns + additional_columns)
     else:
-        query = """INSERT INTO result(
-
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
         ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)

@@ -91,6 +114,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
     output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    # Filter illegal characters
+    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
     if not os.path.exists(output_path):
         os.makedirs(output_path, exist_ok=True)
     logger.info(f'Save the result to: {output_path}')
@@ -102,12 +127,24 @@ def get_result_db_path(args: Arguments):

     logger.info(f'Save the data base to: {result_db_path}')
     if os.path.exists(result_db_path):
-        logger.
+        logger.error(f'The db file {result_db_path} exists, delete it and start again!.')
         sys.exit(1)

     return result_db_path


+class PercentileMetrics:
+    TTFT = 'TTFT (s)'
+    ITL = 'ITL (s)'
+    TPOT = 'TPOT (s)'
+    LATENCY = 'Latency (s)'
+    INPUT_TOKENS = 'Input tokens'
+    OUTPUT_TOKENS = 'Output tokens'
+    OUTPUT_THROUGHPUT = 'Output (tok/s)'
+    TOTAL_THROUGHPUT = 'Total (tok/s)'
+    PERCENTILES = 'Percentiles'
+
+
 def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
     """
     Calculate the percentiles for a specific list of data.
@@ -136,54 +173,51 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     :param result_db_path: Path to the SQLite database file.
     :return: Dictionary of percentiles for various metrics.
     """
-
-
-
-
-
-    except (json.JSONDecodeError, TypeError) as e:
-        logger.error(f'Error parsing chunk times: {e}')
-        return []
-
-    query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
-                 'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1')
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES}, {DatabaseColumns.SUCCESS},
+        {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
+        {DatabaseColumns.PROMPT_TOKENS},
+        {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''  # noqa: E501

     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

     with sqlite3.connect(result_db_path) as con:
-
-
-
-
-            return {}
+        cursor = con.cursor()
+        cursor.execute(query_sql)
+        columns = [description[0] for description in cursor.description]
+        rows = cursor.fetchall()

-        #
-
-        LATENCY_INDEX = 4
-        FIRST_CHUNK_LATENCY_INDEX = 5
-        PROMPT_TOKENS_INDEX = 8
-        COMPLETION_TOKENS_INDEX = 9
+        # Create column index mapping
+        col_indices = {col: idx for idx, col in enumerate(columns)}

         # Prepare data for each metric
         inter_token_latencies_all = []
         for row in rows:
-
+            try:
+                itl = json.loads(row[col_indices[DatabaseColumns.INTER_TOKEN_LATENCIES]]) or []
+                inter_token_latencies_all.extend(itl)
+            except (json.JSONDecodeError, TypeError) as e:
+                logger.error(f'Error parsing inter token latencies: {e}')

         metrics = {
-
-
+            PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
+            PercentileMetrics.ITL:
             inter_token_latencies_all,
-
-
-
-
-
-
+            PercentileMetrics.TPOT: [row[col_indices[DatabaseColumns.TIME_PER_OUTPUT_TOKEN]] for row in rows],
+            PercentileMetrics.LATENCY: [row[col_indices[DatabaseColumns.LATENCY]] for row in rows],
+            PercentileMetrics.INPUT_TOKENS: [row[col_indices[DatabaseColumns.PROMPT_TOKENS]] for row in rows],
+            PercentileMetrics.OUTPUT_TOKENS: [row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] for row in rows],
+            PercentileMetrics.OUTPUT_THROUGHPUT:
+            [(row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] / row[col_indices[DatabaseColumns.LATENCY]])
+             if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows],
+            PercentileMetrics.TOTAL_THROUGHPUT:
+            [((row[col_indices[DatabaseColumns.PROMPT_TOKENS]] + row[col_indices[DatabaseColumns.COMPLETION_TOKENS]])
+              / row[col_indices[DatabaseColumns.LATENCY]])
+             if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows]
         }

         # Calculate percentiles for each metric
-        results = {
+        results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
         for metric_name, data in metrics.items():
             metric_percentiles = calculate_percentiles(data, percentiles)
             results[metric_name] = [metric_percentiles[p] for p in percentiles]
@@ -191,16 +225,15 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
         return results


-def summary_result(args: Arguments, metrics: BenchmarkMetrics,
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

-
-
-    write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
+    metrics_result = metrics.create_message()
+    write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

     # Print summary in a table
-    table = tabulate(list(
+    table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
     logger.info('\nBenchmarking summary:\n' + table)

     # Get percentile results
@@ -214,20 +247,24 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_o
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)

+    logger.info(f'Save the summary to: {result_path}')
+
+    return metrics_result, percentile_result
+

 def speed_benchmark_result(result_db_path: str):
-    query_sql = """
+    query_sql = f"""
         SELECT
-
-            ROUND(AVG(
-            ROUND(AVG(
+            {DatabaseColumns.PROMPT_TOKENS},
+            ROUND(AVG({DatabaseColumns.COMPLETION_TOKENS} / {DatabaseColumns.LATENCY}), 2) AS avg_completion_token_per_second,
+            ROUND(AVG({DatabaseColumns.MAX_GPU_MEMORY_COST}), 2)
         FROM
             result
         WHERE
-
+            {DatabaseColumns.SUCCESS} = 1 AND {DatabaseColumns.LATENCY} > 0
         GROUP BY
-
-    """
+            {DatabaseColumns.PROMPT_TOKENS}
+    """  # noqa: E501

     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
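
Note: the body of calculate_percentiles is unchanged by this diff and not shown above. The sketch below is only an illustration, assuming a numpy-based implementation, of how the per-metric lists built in get_percentile_results map onto the percentile rows ('10%' … '99%') of the summary table.

```python
# Illustrative sketch only -- calculate_percentiles' real body is not part of this diff.
from typing import Dict, List

import numpy as np


def calculate_percentiles_sketch(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    """Map each requested percentile to its value over `data`; None/NaN entries are ignored."""
    clean = [x for x in data if x is not None and not np.isnan(x)]
    if not clean:
        return {p: float('nan') for p in percentiles}
    values = np.percentile(clean, percentiles)
    return {p: float(v) for p, v in zip(percentiles, values)}


# Example: per-request latencies -> the 'Latency (s)' row of the percentile table
latencies = [0.8, 1.1, 1.3, 2.0, 2.4, 3.9]
print(calculate_percentiles_sketch(latencies, [50, 90, 99]))
# {50: 1.65, 90: 3.15, 99: 3.825}
```
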
evalscope/perf/utils/local_server.py
CHANGED

@@ -1,68 +1,28 @@
 import os
 import subprocess
-import torch
 import uvicorn
 from contextlib import asynccontextmanager
-from dataclasses import dataclass
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse

 from evalscope.perf.arguments import Arguments
 from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

 logger = get_logger()


-@dataclass
-class ServerSentEvent(object):
-
-    def __init__(self, data='', event=None, id=None, retry=None):
-        self.data = data
-        self.event = event
-        self.id = id
-        self.retry = retry
-
-    @classmethod
-    def decode(cls, line):
-        """Decode line to ServerSentEvent
-
-
-        Args:
-            line (str): The line.
-
-        Return:
-            ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
-
-        """
-        if not line:
-            return None
-        sse_msg = cls()
-        # format data:xxx
-        field_type, _, field_value = line.partition(':')
-        if field_value.startswith(' '):  # compatible with openai api
-            field_value = field_value[1:]
-        if field_type == 'event':
-            sse_msg.event = field_value
-        elif field_type == 'data':
-            field_value = field_value.rstrip()
-            sse_msg.data = field_value
-        elif field_type == 'id':
-            sse_msg.id = field_value
-        elif field_type == 'retry':
-            sse_msg.retry = field_value
-        else:
-            pass
-
-        return sse_msg
-
-
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     yield
-
-    torch
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except ImportError:
+        pass


 def create_app(model, attn_implementation=None) -> FastAPI:
@@ -96,11 +56,16 @@ def create_app(model, attn_implementation=None) -> FastAPI:


 def start_app(args: Arguments):
+    logger.info('Starting local server, please wait...')
     if args.api == 'local':
+        check_import('torch', 'torch', raise_error=True)
+
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

     elif args.api == 'local_vllm':
+        import torch
+
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
         os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
         os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
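
start_app() now guards the 'local' backend with check_import('torch', 'torch', raise_error=True) from evalscope.utils.import_utils, whose implementation is not included in this excerpt. A minimal sketch of a helper with that shape, assuming it only verifies importability and points at the pip package to install:

```python
# Hypothetical stand-in for evalscope.utils.import_utils.check_import; the real
# implementation lives in import_utils.py (+154 lines in this release) and may differ.
import importlib.util


def check_import_sketch(module_name: str, package_name: str, raise_error: bool = False) -> bool:
    """Return True if `module_name` can be imported; otherwise warn or raise with an install hint."""
    if importlib.util.find_spec(module_name) is not None:
        return True
    message = f"'{module_name}' is required for this feature. Install it with: pip install {package_name}"
    if raise_error:
        raise ImportError(message)
    print(message)
    return False


check_import_sketch('torch', 'torch', raise_error=True)  # raises ImportError when torch is missing
```
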
evalscope/perf/utils/log_utils.py
ADDED

@@ -0,0 +1,63 @@
+import os
+
+from evalscope.perf.arguments import Arguments
+
+
+def init_wandb(args: Arguments) -> None:
+    """
+    Initialize WandB for logging.
+    """
+    # Initialize wandb if the api key is provided
+    import datetime
+    try:
+        import wandb
+    except ImportError:
+        raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
+    os.environ['WANDB_SILENT'] = 'true'
+    os.environ['WANDB_DIR'] = args.outputs_dir
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+
+    # Remove sensitive information from logging config
+    logging_config = args.to_dict()
+    logging_config.pop('api_key', None)
+    logging_config.pop('wandb_api_key', None)
+
+    if args.wandb_api_key is not None:
+        wandb.login(key=args.wandb_api_key)
+    wandb.init(project='perf_benchmark', name=name, config=logging_config)
+
+
+def init_swanlab(args: Arguments) -> None:
+    """
+    Initialize SwanLab for logging.
+    """
+    import datetime
+    try:
+        import swanlab
+    except ImportError:
+        raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
+    os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+    swanlab.config.update({'framework': '📏evalscope'})
+
+    # Remove sensitive information from logging config
+    logging_config = args.to_dict()
+    logging_config.pop('api_key', None)
+    logging_config.pop('swanlab_api_key', None)
+
+    init_kwargs = {
+        'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
+        'name': name,
+        'config': logging_config,
+        'mode': 'local' if args.swanlab_api_key == 'local' else None
+    }
+
+    workspace = os.getenv('SWANLAB_WORKSPACE')
+    if workspace:
+        init_kwargs['workspace'] = workspace
+
+    if isinstance(args.swanlab_api_key, str) and not args.swanlab_api_key == 'local':
+        swanlab.login(api_key=args.swanlab_api_key)
+    swanlab.init(**init_kwargs)
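
Both helpers in the new log_utils.py follow the same pattern: build the run name from args.name (or the model id plus a timestamp), copy the argument dict, and drop credential fields before handing the config to the tracker. A minimal, standalone sketch of that scrubbing step (the field names follow the diff; the sample config values are made up):

```python
# Standalone illustration of the config-scrubbing pattern used by init_wandb/init_swanlab.
def scrub_config(config: dict, secret_keys=('api_key', 'wandb_api_key', 'swanlab_api_key')) -> dict:
    cleaned = dict(config)  # shallow copy so the original arguments dict is untouched
    for key in secret_keys:
        cleaned.pop(key, None)
    return cleaned


run_config = {'model': 'my-model', 'parallel': 8, 'api_key': 'sk-***'}
print(scrub_config(run_config))  # {'model': 'my-model', 'parallel': 8}
```
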
@@ -0,0 +1,192 @@
+# the following code is largely adapted from https://github.com/lework/llm-benchmark
+
+import numpy as np
+from rich.console import Console
+from rich.panel import Panel
+from rich.style import Style
+from rich.table import Table
+from rich.text import Text
+
+from evalscope.utils.logger import get_logger
+from .benchmark_util import Metrics
+from .db_util import PercentileMetrics
+
+logger = get_logger()
+
+
+def analyze_results(all_results):
+    """Analyze all test results and generate a summary report"""
+    summary = []
+    total_tokens = 0
+    total_time = 0
+
+    for result in all_results:
+        total_metrics = result[0]
+        percentile_metrics = result[1]
+        percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
+        try:
+            concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
+            rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
+            avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
+            p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
+            avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
+            avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
+            p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
+            success_rate = (
+                total_metrics.get(Metrics.SUCCEED_REQUESTS, 0) / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)
+            ) * 100
+            avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
+            p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]
+
+            # Ensure all values are valid numbers
+            if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
+                logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
+                continue
+
+            summary.append([
+                concurrency,
+                f'{rps:.2f}' if rps is not None else 'N/A',
+                f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
+                f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
+                f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
+                f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
+                f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
+                f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
+                f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
+                f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
+            ])
+
+            total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST,
+                                              0) * total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+            total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
+        except Exception as e:
+            logger.warning(
+                f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}"
+            )
+            continue
+
+    if not summary:
+        logger.warning('Error: No valid test result data')
+        return [], 0, 0
+
+    return summary, total_tokens, total_time
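For orientation, a minimal sketch of the input shape `analyze_results` expects: each element of `all_results` pairs a totals dict keyed by the `Metrics` constants with a percentile dict keyed by the `PercentileMetrics` constants. The numbers and the percentile labels other than '99%' are made up for illustration.

total_metrics = {
    Metrics.NUMBER_OF_CONCURRENCY: 8,
    Metrics.REQUEST_THROUGHPUT: 3.2,
    Metrics.AVERAGE_LATENCY: 2.5,
    Metrics.OUTPUT_TOKEN_THROUGHPUT: 410.0,
    Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.21,
    Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: 0.006,
    Metrics.SUCCEED_REQUESTS: 100,
    Metrics.TOTAL_REQUESTS: 100,
    Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: 128,
    Metrics.TIME_TAKEN_FOR_TESTS: 31.3,
}
percentile_metrics = {
    PercentileMetrics.PERCENTILES: ['50%', '90%', '99%'],  # assumed labels; only '99%' is required above
    PercentileMetrics.LATENCY: [2.3, 3.1, 4.0],
    PercentileMetrics.TTFT: [0.18, 0.30, 0.45],
    PercentileMetrics.TPOT: [0.005, 0.007, 0.009],
}
summary, total_tokens, total_time = analyze_results([(total_metrics, percentile_metrics)])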
+
+
+def print_summary(all_results, model_name):
+    """Print test results summary"""
+    summary, total_tokens, total_time = analyze_results(all_results)
+
+    if not summary:
+        logger.warning('No available test result data to display')
+        return
+
+    console = Console(width=100)  # Set fixed width
+
+    # Create title panel
+    title = Text('Performance Test Summary Report', style='bold')
+    console.print(Panel(title, width=60))
+
+    # Print basic information
+    basic_info = Table(show_header=False, width=60)
+    basic_info.add_column('Name', style='cyan', width=25)
+    basic_info.add_column('Value', style='green', width=35)
+
+    basic_info.add_row('Model', model_name)
+    basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
+    basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
+    basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')
+
+    console.print('\nBasic Information:')
+    console.print(basic_info)
+
+    # Create detailed performance metrics table
+    table = Table(
+        title='Detailed Performance Metrics',
+        show_header=True,
+        header_style='bold cyan',
+        border_style='blue',
+        width=100,  # Set total table width
+        pad_edge=False,  # Reduce edge padding
+        min_width=60,  # Minimum width
+    )
+
+    # Add columns (set fixed column widths)
+    table.add_column('Conc.', justify='right', style='cyan')
+    table.add_column('RPS', justify='right')
+    table.add_column('Avg Lat.(s)', justify='right')
+    table.add_column('P99 Lat.(s)', justify='right')
+    table.add_column('Gen. toks/s', justify='right')
+    table.add_column('Avg TTFT(s)', justify='right')
+    table.add_column('P99 TTFT(s)', justify='right')
+    table.add_column('Avg TPOT(s)', justify='right')
+    table.add_column('P99 TPOT(s)', justify='right')
+    table.add_column('Success Rate', justify='right', style='green')
+
+    # Add data rows
+    for row in summary:
+        try:
+            # Set row style based on success rate
+            success_rate = float(row[6].rstrip('%'))
+            row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
+
+            table.add_row(
+                str(row[0]),  # Concurrency
+                f'{float(row[1]):.2f}',  # RPS
+                f'{float(row[2]):.3f}',  # Average Latency
+                f'{float(row[3]):.3f}',  # P99 Latency
+                f'{float(row[4]):.2f}',  # Average TPS
+                f'{float(row[5]):.3f}',  # First Token Latency
+                f'{float(row[7]):.3f}',  # P99 TTFT
+                f'{float(row[8]):.3f}',  # Average TPOT
+                f'{float(row[9]):.3f}',  # P99 TPOT
+                row[6],  # Success Rate
+                style=row_style
+            )
+        except ValueError as e:
+            console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
+            continue
+
+    console.print('\n')
+    console.print(table)
+
+    # Calculate and display best performance configuration
+    try:
+        best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
+        best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])
+
+        perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
+        perf_info.add_column('Metric', style='cyan', width=20)
+        perf_info.add_column('Value', style='green', width=40)
+
+        perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
+        perf_info.add_row(
+            'Lowest Latency', f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)'
+        )
+
+        console.print('\n')
+        console.print(perf_info)
+
+        # Performance recommendations
+        recommendations = []
+        if best_rps_idx == len(summary) - 1:
+            recommendations.append(
+                'The system seems not to have reached its performance bottleneck, try higher concurrency'
+            )
+        elif best_rps_idx == 0:
+            recommendations.append('Consider lowering concurrency, current load may be too high')
+        else:
+            recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')
+
+        success_rate = float(summary[-1][6][:-1])
+        if success_rate < 95:
+            recommendations.append(
+                'Success rate is low at high concurrency, check system resources or reduce concurrency'
+            )
+
+        recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
+        console.print(recommend_text)
+        for rec in recommendations:
+            console.print(f'• {rec}', style='yellow')
+
+    except Exception as e:
+        console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
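Tying it together, a run of the report might look like the sketch below, reusing the hand-built dicts from the earlier sketch; in practice `all_results` comes from the perf benchmark run itself and `model_name` from the benchmark arguments.

all_results = [(total_metrics, percentile_metrics)]  # one (totals, percentiles) pair per concurrency level
print_summary(all_results, model_name='qwen2.5-7b-instruct')  # renders the rich panels and tables defined above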