evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/perf/plugin/api/openai_api.py  CHANGED

@@ -1,10 +1,13 @@
 import json
+import math
 import os
+from collections import defaultdict
 from typing import Any, Dict, List, Tuple, Union

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.io_utils import base64_to_PIL
 from evalscope.utils.logger import get_logger

 logger = get_logger()

@@ -99,7 +102,7 @@ class OpenaiPlugin(DefaultApiPlugin):
         payload.update(param.extra_args)
         return payload

-    def parse_responses(self, responses, request:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> tuple[int, int]:
         """Parser responses and return number of request and response tokens.
         Only one response for non-stream, multiple responses for stream.
         """

@@ -113,7 +116,7 @@ class OpenaiPlugin(DefaultApiPlugin):
             return input_tokens, output_tokens

         # no usage information in the response, parse the response to get the tokens
-        delta_contents =
+        delta_contents = defaultdict(list)
         for response in responses:
             if 'object' in response:
                 self.__process_response_object(response, delta_contents)

@@ -123,41 +126,46 @@ class OpenaiPlugin(DefaultApiPlugin):
             input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
         return input_tokens, output_tokens

-    def __process_response_object(self,
-        if
-
+    def __process_response_object(self, response, delta_contents):
+        if not response.get('choices'):
+            return
+        if response['object'] == 'chat.completion':
+            for choice in response['choices']:
                 delta_contents[choice['index']] = [choice['message']['content']]
-        elif
-            for choice in
-
-
-
+        elif response['object'] == 'text_completion':
+            for choice in response['choices']:
+                if 'text' in choice and 'index' in choice:
+                    delta_contents[choice['index']].append(choice['text'])
+        elif response['object'] == 'chat.completion.chunk':
+            for choice in response['choices']:
                 if 'delta' in choice and 'index' in choice:
                     delta = choice['delta']
                     idx = choice['index']
                     if 'content' in delta:
-
-                        delta_contents.setdefault(idx, []).append(delta_content)
+                        delta_contents[idx].append(delta['content'])

-    def __process_no_object(self,
+    def __process_no_object(self, response, delta_contents):
         # assume the response is a single choice
-
+        if not response.get('choices'):
+            return
+        for choice in response['choices']:
             if 'delta' in choice:
                 delta = choice['delta']
                 idx = choice['index']
                 if 'content' in delta:
-
-                    delta_contents.setdefault(idx, []).append(delta_content)
+                    delta_contents[idx].append(delta['content'])
             else:
                 delta_contents[choice['index']] = [choice['message']['content']]

-    def __calculate_tokens_from_content(self, request,
+    def __calculate_tokens_from_content(self, request, content):
         input_tokens = output_tokens = 0
         if self.tokenizer is not None:
-
+            # Calculate input tokens
+            input_tokens += self._count_input_tokens(request)
+            for idx, choice_contents in content.items():
                 full_response_content = ''.join(choice_contents)
-
-                output_tokens +=
+                # Calculate output tokens
+                output_tokens += self._count_output_tokens(full_response_content)
         else:
             raise ValueError(
                 'Error: Unable to retrieve usage information\n\n'

@@ -171,3 +179,60 @@ class OpenaiPlugin(DefaultApiPlugin):
                 'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
             )
         return input_tokens, output_tokens
+
+    def _count_input_tokens(self, request_str: str) -> int:
+        """Count the number of input tokens in the request.
+
+        This method handles different types of requests and calculates tokens for:
+        - Text content in messages or prompts
+        - Images in multimodal messages (converted to patch tokens)
+
+        Args:
+            request_str (str): The request json str containing either 'messages' for chat
+                completion or 'prompt' for text completion.
+
+        Returns:
+            int: The total number of input tokens including text and image tokens.
+        """
+        input_tokens = 0
+        request = json.loads(request_str)
+        if 'messages' in request:
+            input_content = self.tokenizer.apply_chat_template(
+                request['messages'], tokenize=True, add_generation_prompt=True
+            )
+            input_tokens += len(input_content)
+            # handle image tokens if any
+            for message in request['messages']:
+                content = message.get('content', '')
+                if isinstance(content, str):
+                    continue
+                for cont in content:
+                    if cont['type'] == 'image_url':
+                        try:
+                            # assuming image_url is base64 string
+                            image_base64 = cont['image_url']['url']
+                            image = base64_to_PIL(image_base64)
+                            # Use math.ceil for more accurate token count when image dimensions
+                            # aren't perfectly divisible by patch size
+                            n_patches = (
+                                math.ceil(image.height / self.param.image_patch_size)
+                                * math.ceil(image.width / self.param.image_patch_size)
+                            )
+                            input_tokens += n_patches
+                        except Exception as e:
+                            logger.warning(f'Failed to process image for token counting: {e}')
+                            # Continue processing other content without failing
+        elif 'prompt' in request:
+            input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+        return input_tokens
+
+    def _count_output_tokens(self, response: str) -> int:
+        """Count the number of output tokens in the response. Only string response is supported.
+
+        Args:
+            response (str): The API response text.
+
+        Returns:
+            int: The number of output tokens.
+        """
+        return len(self.tokenizer.encode(response, add_special_tokens=False))
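The new `_count_input_tokens` fallback estimates vision tokens by tiling each image into `image_patch_size` squares and rounding both dimensions up. A standalone sketch of that arithmetic follows; the 28-pixel patch size and the 512x384 image are illustrative values, not taken from the diff.

```python
import math

def estimate_image_tokens(width: int, height: int, patch_size: int = 28) -> int:
    """Approximate vision tokens as the number of patch_size x patch_size tiles,
    rounding each dimension up so partial tiles still count."""
    return math.ceil(height / patch_size) * math.ceil(width / patch_size)

# e.g. a 512x384 image with 28-pixel patches -> 19 * 14 = 266 estimated tokens
print(estimate_image_tokens(512, 384))
```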
evalscope/perf/plugin/datasets/base.py  CHANGED

@@ -15,6 +15,11 @@ class DatasetPluginBase:
             dataset_path (str, optional): The input dataset path. Defaults to None.
         """
         self.query_parameters = query_parameters
+        if query_parameters.tokenizer_path:
+            from modelscope import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(query_parameters.tokenizer_path, trust_remote_code=True)
+        else:
+            self.tokenizer = None

     def __next__(self):
         for item in self.build_messages():

@@ -85,3 +90,19 @@ class DatasetPluginBase:
         for url in image_urls:
             message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
         return message
+
+    def check_prompt_length(self, prompt: str) -> Tuple[bool, int]:
+        """Check if the prompt length is within the specified range.
+
+        Args:
+            prompt (str): The input prompt string.
+
+        Returns:
+            Tuple[bool, int]: A tuple containing a boolean indicating whether the prompt is valid and its length.
+        """
+        if self.tokenizer is None:
+            prompt_length = len(prompt)
+        else:
+            prompt_length = len(self.tokenizer.encode(prompt))
+        is_valid = self.query_parameters.min_prompt_length <= prompt_length <= self.query_parameters.max_prompt_length
+        return is_valid, prompt_length
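The helper counts tokens only when `tokenizer_path` is configured and falls back to raw character length otherwise, so identical bounds can filter very different amounts of text. A minimal, self-contained illustration (the bounds and the whitespace "tokenizer" are stand-ins for the real configuration):

```python
def check_prompt_length(prompt: str, tokenizer=None, min_len: int = 5, max_len: int = 10):
    """Mirror of the new helper: token count when a tokenizer is configured,
    plain character count otherwise. Bounds here are illustrative."""
    length = len(tokenizer(prompt)) if tokenizer else len(prompt)
    return min_len <= length <= max_len, length

prompt = "How many r's are in strawberry?"
print(check_prompt_length(prompt))                       # (False, 31) -- character count
print(check_prompt_length(prompt, tokenizer=str.split))  # (True, 6)   -- crude token count
```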
evalscope/perf/plugin/datasets/custom.py  CHANGED

@@ -16,9 +16,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-
-
-            ) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/flickr8k.py  CHANGED

@@ -22,7 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
         for item in dataset:
             pil_image = item['jpg']
             text = item['txt']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)

-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
             yield [message]

evalscope/perf/plugin/datasets/kontext_bench.py  CHANGED

@@ -22,7 +22,7 @@ class KontextDatasetPlugin(DatasetPluginBase):
         for item in dataset:
             pil_image = item['image']
             text = item['instruction']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)

-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
             yield [message]

evalscope/perf/plugin/datasets/line_by_line.py  CHANGED

@@ -17,9 +17,8 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-
-
-            ) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/longalpaca.py  CHANGED

@@ -22,9 +22,8 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
         ds = self.dataset_json_list(self.query_parameters.dataset_path)
         for item in ds:
             prompt = item['instruction'].strip()
-
-
-            ) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/openqa.py  CHANGED

@@ -27,10 +27,8 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
             prompt = item['question'].strip()
-
-
-                and len(prompt) < self.query_parameters.max_prompt_length
-            ):
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]

evalscope/perf/plugin/datasets/random_dataset.py  CHANGED

@@ -12,11 +12,9 @@ class RandomDatasetPlugin(DatasetPluginBase):
     """

     def __init__(self, query_parameters: Arguments):
+        assert query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer-path`.'  # noqa: E501
         super().__init__(query_parameters)
-        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.'  # noqa: E501

-        from modelscope import AutoTokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
         self.prefix_length = self.query_parameters.prefix_length
         self.prefix_ids = self.get_random_inputs(self.prefix_length)
         self.template_len = self.get_template_len()

evalscope/perf/plugin/datasets/random_vl_dataset.py  CHANGED

@@ -31,7 +31,7 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
         # Generate random images based on image_num
         images_b64 = []
         for _ in range(self.image_num):
-            images_b64.append(
+            images_b64.append(self._generate_random_image_b64())

         message = self.create_message(text=prompt, image_urls=images_b64)
         yield [message]

@@ -77,4 +77,4 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
             draw.line(coords, fill=shape_color, width=random.randint(1, 5))

         # Convert to base64
-        return PIL_to_base64(image, format='PNG')
+        return PIL_to_base64(image, format='PNG', add_header=True)
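Several dataset plugins now pass `add_header=True` to `PIL_to_base64` from `evalscope/utils/io_utils.py`. That helper is not shown in this diff; the sketch below is a plausible reading of what the flag does (prefixing a data-URI header so the string can be dropped straight into an OpenAI-style `image_url`), not the library's actual implementation.

```python
import base64
import io

from PIL import Image

def pil_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
    """Hypothetical re-implementation for illustration; the real helper is
    evalscope.utils.io_utils.PIL_to_base64 and may differ in detail."""
    buffer = io.BytesIO()
    image.save(buffer, format=format)
    encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
    if add_header:
        # The data-URI header lets the string be used directly as an image_url.
        return f'data:image/{format.lower()};base64,{encoded}'
    return encoded

# Usage: an 8x8 test image encoded as a data URI
print(pil_to_base64(Image.new('RGB', (8, 8), 'red'), format='PNG', add_header=True)[:40])
```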
evalscope/perf/utils/benchmark_util.py  CHANGED

@@ -1,8 +1,7 @@
-import time
-import torch
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple

+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

 logger = get_logger()

@@ -10,7 +9,7 @@ logger = get_logger()

 @dataclass
 class BenchmarkData:
-    request:
+    request: str = None  # json serialized request body
     start_time: float = 0.0
     completed_time: float = 0.0
     chunk_times: List[float] = field(default_factory=list)

@@ -24,30 +23,34 @@ class BenchmarkData:
     time_per_output_token: float = 0.0
     inter_chunk_latency: List[float] = field(default_factory=list)

-
-
-
-
-
-        # only for stream responses
-        if len(self.chunk_times) > 1:
-            self.first_chunk_latency = self.chunk_times[0] - self.start_time
-            # remove the first chunk time from the total latency
-            self.time_per_output_token = (self.query_latency - self.first_chunk_latency
-                                          ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
-            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
-        else:
-            self.first_chunk_latency = self.query_latency
+    # response content
+    generated_text: str = ''
+    error: Optional[str] = None
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None

     def _calculate_tokens(self, api_plugin):
-        self.prompt_tokens
-
+        if self.prompt_tokens is None or self.completion_tokens is None:
+            self.prompt_tokens, self.completion_tokens = api_plugin.parse_responses(
+                self.response_messages, request=self.request
+            )
+
+        # Calculate time per output token
+        if self.completion_tokens and self.completion_tokens > 1:
+            # tpot = (latency - ttft) / (output_len - 1)
+            self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (self.completion_tokens - 1)
+
+        # Ensure inter-chunk latency is available (compute from chunk_times if needed)
+        if not self.inter_chunk_latency and self.chunk_times:
+            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]

     def update_gpu_usage(self):
-
-
-        total_memory
-
+        if check_import('torch', raise_warning=False):
+            import torch
+            total_memory = 0
+            for i in range(torch.cuda.device_count()):
+                total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
+            self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)


 class Metrics:

@@ -77,6 +80,7 @@ class BenchmarkMetrics:
     n_total_prompt_tokens: int = 0
     n_total_completion_tokens: int = 0
     start_time: Optional[float] = None
+    last_completed_time: Optional[float] = None
     total_time: float = 1.0
     n_total_queries: int = 0
     n_time_per_output_token: float = 0.0

@@ -95,9 +99,6 @@ class BenchmarkMetrics:

     def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
         self.n_total_queries += 1
-        if self.start_time is None:
-            self.start_time = benchmark_data.start_time
-        self.total_time = time.perf_counter() - self.start_time

         if benchmark_data.success:
             self.n_succeed_queries += 1

@@ -106,7 +107,6 @@ class BenchmarkMetrics:
             self.n_total_prompt_tokens += benchmark_data.prompt_tokens
             self.n_total_completion_tokens += benchmark_data.completion_tokens

-            benchmark_data._calculate_query_stream_metric()
             self.total_latency += benchmark_data.query_latency
             self.total_first_chunk_latency += benchmark_data.first_chunk_latency
             self.n_time_per_output_token += benchmark_data.time_per_output_token

@@ -115,6 +115,22 @@ class BenchmarkMetrics:
             self.n_failed_queries += 1

         self.calculate_averages()
+        self.update_total_time(benchmark_data)
+
+    def update_total_time(self, benchmark_data: BenchmarkData):
+        # Use the earliest start_time seen so far
+        if self.start_time is None:
+            self.start_time = benchmark_data.start_time
+        else:
+            self.start_time = min(self.start_time, benchmark_data.start_time)
+        # Track the latest completion time
+        if self.last_completed_time is None:
+            self.last_completed_time = benchmark_data.completed_time
+        else:
+            self.last_completed_time = max(self.last_completed_time, benchmark_data.completed_time)
+        # Compute total_time from request lifecycle timestamps to avoid consumer overhead
+        if self.start_time is not None and self.last_completed_time is not None:
+            self.total_time = max(self.last_completed_time - self.start_time, 0.0)

     def calculate_averages(self):
         if self.n_succeed_queries == 0:
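A small worked example of the two timing changes above, with illustrative numbers: TPOT is latency minus time-to-first-token spread over the remaining tokens, and the benchmark window is now the span from the earliest request start to the latest completion instead of a consumer-side `time.perf_counter()` reading.

```python
# Illustrative numbers, not taken from the diff
start_time, first_chunk_time, completed_time = 0.0, 0.4, 2.4
completion_tokens = 101

query_latency = completed_time - start_time          # 2.4 s
first_chunk_latency = first_chunk_time - start_time  # 0.4 s (TTFT)

# tpot = (latency - ttft) / (output_len - 1)
time_per_output_token = (query_latency - first_chunk_latency) / (completion_tokens - 1)
print(round(time_per_output_token, 4))  # 0.02 s/token

# Benchmark window = latest completion - earliest start across all requests
requests = [(0.0, 2.4), (0.5, 3.1), (1.0, 2.9)]  # (start, completed) pairs
total_time = max(c for _, c in requests) - min(s for s, _ in requests)
print(total_time)  # 3.1 s
```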
evalscope/perf/utils/db_util.py  CHANGED

@@ -19,7 +19,7 @@ logger = get_logger()
 class DatabaseColumns:
     REQUEST = 'request'
     START_TIME = 'start_time'
-
+    INTER_TOKEN_LATENCIES = 'inter_token_latencies'
     SUCCESS = 'success'
     RESPONSE_MESSAGES = 'response_messages'
     COMPLETED_TIME = 'completed_time'

@@ -60,7 +60,7 @@ def create_result_table(cursor):
         f'''CREATE TABLE IF NOT EXISTS result(
             {DatabaseColumns.REQUEST} TEXT,
             {DatabaseColumns.START_TIME} REAL,
-            {DatabaseColumns.
+            {DatabaseColumns.INTER_TOKEN_LATENCIES} TEXT,
             {DatabaseColumns.SUCCESS} INTEGER,
             {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
             {DatabaseColumns.COMPLETED_TIME} REAL,

@@ -75,15 +75,15 @@ def create_result_table(cursor):


 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
-    request =
-
+    request = benchmark_data.request
+    inter_token_latencies = json.dumps(benchmark_data.inter_chunk_latency)
     response_messages = encode_data(benchmark_data.response_messages)

     # Columns common to both success and failure cases
     common_columns = (
         request,
         benchmark_data.start_time,
-
+        inter_token_latencies,
         benchmark_data.success,
         response_messages,
         benchmark_data.completed_time,

@@ -96,7 +96,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
             benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
         )
         query = f"""INSERT INTO result(
-            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
             {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
             {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
             {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},

@@ -105,7 +105,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
         cursor.execute(query, common_columns + additional_columns)
     else:
         query = f"""INSERT INTO result(
-            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
         ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)

@@ -173,20 +173,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     :param result_db_path: Path to the SQLite database file.
     :return: Dictionary of percentiles for various metrics.
     """
-
-    def inter_token_latencies(chunk_times_json: str) -> List[float]:
-        try:
-            chunk_times = json.loads(chunk_times_json)
-            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
-        except (json.JSONDecodeError, TypeError) as e:
-            logger.error(f'Error parsing chunk times: {e}')
-            return []
-
-    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES}, {DatabaseColumns.SUCCESS},
         {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
         {DatabaseColumns.PROMPT_TOKENS},
         {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
-        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
+        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''  # noqa: E501

     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

@@ -202,7 +193,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     # Prepare data for each metric
     inter_token_latencies_all = []
     for row in rows:
-
+        try:
+            itl = json.loads(row[col_indices[DatabaseColumns.INTER_TOKEN_LATENCIES]]) or []
+            inter_token_latencies_all.extend(itl)
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing inter token latencies: {e}')

     metrics = {
         PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
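Inter-token latencies are now serialized to JSON per request at insert time and simply concatenated at report time. A minimal sketch of the read path against the `result` table defined above; the `numpy` percentile call and the function name are assumptions for illustration, not the library API.

```python
import json
import sqlite3

import numpy as np

def itl_percentiles(db_path: str, percentiles=(50, 90, 99)):
    """Collect every inter-token latency across successful requests and
    report the requested percentiles (sketch only)."""
    conn = sqlite3.connect(db_path)
    rows = conn.execute('SELECT inter_token_latencies FROM result WHERE success = 1').fetchall()
    conn.close()

    latencies = []
    for (itl_json,) in rows:
        try:
            latencies.extend(json.loads(itl_json) or [])
        except (json.JSONDecodeError, TypeError):
            continue  # skip unparsable rows instead of failing the whole report
    return {p: float(np.percentile(latencies, p)) for p in percentiles} if latencies else {}
```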
evalscope/perf/utils/local_server.py  CHANGED

@@ -2,61 +2,18 @@ import os
 import subprocess
 import uvicorn
 from contextlib import asynccontextmanager
-from dataclasses import dataclass
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse

 from evalscope.perf.arguments import Arguments
 from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

 logger = get_logger()


-@dataclass
-class ServerSentEvent(object):
-
-    def __init__(self, data='', event=None, id=None, retry=None):
-        self.data = data
-        self.event = event
-        self.id = id
-        self.retry = retry
-
-    @classmethod
-    def decode(cls, line):
-        """Decode line to ServerSentEvent
-
-
-        Args:
-            line (str): The line.
-
-        Return:
-            ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
-
-        """
-        if not line:
-            return None
-        sse_msg = cls()
-        # format data:xxx
-        field_type, _, field_value = line.partition(':')
-        if field_value.startswith(' '):  # compatible with openai api
-            field_value = field_value[1:]
-        if field_type == 'event':
-            sse_msg.event = field_value
-        elif field_type == 'data':
-            field_value = field_value.rstrip()
-            sse_msg.data = field_value
-        elif field_type == 'id':
-            sse_msg.id = field_value
-        elif field_type == 'retry':
-            sse_msg.retry = field_value
-        else:
-            pass
-
-        return sse_msg
-
-
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     yield

@@ -101,6 +58,8 @@ def create_app(model, attn_implementation=None) -> FastAPI:
 def start_app(args: Arguments):
     logger.info('Starting local server, please wait...')
     if args.api == 'local':
+        check_import('torch', 'torch', raise_error=True)
+
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
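Both new call sites lean on `evalscope.utils.import_utils.check_import`, used as a soft probe (`raise_warning=False`) in `benchmark_util.py` and as a hard gate (`raise_error=True`) here; the helper itself (+73 -1 in this release) is not shown. The sketch below only mirrors the behaviour implied by those call sites and is an assumption, not the real implementation.

```python
import importlib.util

def check_import(module: str, package: str = None, raise_error: bool = False,
                 raise_warning: bool = True) -> bool:
    """Return True if `module` is importable; otherwise warn or raise with an
    install hint for `package`. Sketch only -- the real helper may differ."""
    if importlib.util.find_spec(module) is not None:
        return True
    hint = f"Module '{module}' is not installed; try `pip install {package or module}`."
    if raise_error:
        raise ImportError(hint)
    if raise_warning:
        print(f'WARNING: {hint}')
    return False

# Mirrors the two call sites: a soft probe and a hard requirement
has_torch = check_import('torch', raise_warning=False)
# check_import('torch', 'torch', raise_error=True)  # raises if torch is missing
```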
evalscope/perf/utils/log_utils.py  CHANGED

@@ -15,29 +15,42 @@ def init_wandb(args: Arguments) -> None:
         raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
     os.environ['WANDB_SILENT'] = 'true'
     os.environ['WANDB_DIR'] = args.outputs_dir
-
-    wandb.login(key=args.wandb_api_key)
     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
     name = args.name if args.name else f'{args.model_id}_{current_time}'
-
+
+    # Remove sensitive information from logging config
+    logging_config = args.to_dict()
+    logging_config.pop('api_key', None)
+    logging_config.pop('wandb_api_key', None)
+
+    if args.wandb_api_key is not None:
+        wandb.login(key=args.wandb_api_key)
+    wandb.init(project='perf_benchmark', name=name, config=logging_config)


 def init_swanlab(args: Arguments) -> None:
+    """
+    Initialize SwanLab for logging.
+    """
     import datetime
     try:
         import swanlab
     except ImportError:
         raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
     os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
-    if not args.swanlab_api_key == 'local':
-        swanlab.login(api_key=args.swanlab_api_key)
     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
     name = args.name if args.name else f'{args.model_id}_{current_time}'
     swanlab.config.update({'framework': '📏evalscope'})
+
+    # Remove sensitive information from logging config
+    logging_config = args.to_dict()
+    logging_config.pop('api_key', None)
+    logging_config.pop('swanlab_api_key', None)
+
     init_kwargs = {
         'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
         'name': name,
-        'config':
+        'config': logging_config,
         'mode': 'local' if args.swanlab_api_key == 'local' else None
     }

@@ -45,4 +58,6 @@ def init_swanlab(args: Arguments) -> None:
     if workspace:
         init_kwargs['workspace'] = workspace

+    if isinstance(args.swanlab_api_key, str) and not args.swanlab_api_key == 'local':
+        swanlab.login(api_key=args.swanlab_api_key)
     swanlab.init(**init_kwargs)