evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/http_client.py
CHANGED
@@ -3,6 +3,7 @@ import asyncio
 import time
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 from .arguments import Arguments
 
@@ -24,7 +25,22 @@ class AioHttpClient:
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
         self.api_plugin = api_plugin
+
+        # Configure connector similar to vLLM bench for better TTFT under load.
+        connector = aiohttp.TCPConnector(
+            limit=args.parallel or 0,  # 0 means no limit in aiohttp; use parallel as limit if set
+            limit_per_host=args.parallel or 0,
+            ttl_dns_cache=300,
+            use_dns_cache=True,
+            keepalive_timeout=60,
+            enable_cleanup_closed=True,
+            force_close=False,
+            ssl=('https://' in self.url),
+        )
+
         self.client = aiohttp.ClientSession(
+            connector=connector,
+            trust_env=True,
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             trace_configs=[self._create_trace_config()] if args.debug else []
         )
@@ -43,23 +59,25 @@ class AioHttpClient:
         trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
         return trace_config
 
-    async def post(self, body):
-        """
-
-
+    async def post(self, body) -> BenchmarkData:
+        """
+        Send POST request and delegate response handling to API plugin.
+
+        Returns:
+            BenchmarkData: The benchmark data object containing request and response information.
         """
         try:
             # Delegate the request processing to the API plugin
-
-
+            output = await self.api_plugin.process_request(self.client, self.url, self.headers, body)
+            return output
         except asyncio.TimeoutError as e:
             logger.error(
                 f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longer timeout.'  # noqa: E501
             )
-
+            return BenchmarkData(success=False, error=str(e))
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
-
+            return BenchmarkData(success=False, error=str(e))
 
     @staticmethod
     async def on_request_start(session, context, params: aiohttp.TraceRequestStartParams):
@@ -91,7 +109,6 @@ class AioHttpClient:
 
 
 async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
-    is_error = True
     start_time = time.perf_counter()
 
     async def attempt_connection():
@@ -100,18 +117,16 @@ async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
         messages = [{'role': 'user', 'content': 'hello'}] if args.apply_chat_template else 'hello'
         request = api_plugin.build_request(messages)
 
-
-
+        output = await client.post(request)
+        return output
 
     while True:
         try:
-
-
-            )
-            if not is_error:
+            output = await asyncio.wait_for(attempt_connection(), timeout=args.connect_timeout)
+            if output.success:
                 logger.info('Test connection successful.')
                 return True
-            logger.warning(f'Retrying...
+            logger.warning(f'Retrying... <{output.error}>')
         except Exception as e:
             logger.warning(f'Retrying... <{e}>')
 
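With this change, `AioHttpClient.post` returns a single `BenchmarkData` object (with `success`/`error` set on failure) instead of yielding `(is_error, status_code, data)` tuples. A minimal sketch of the new caller pattern, assuming the client is constructed from the perf `Arguments` plus an API plugin as the `__init__` fields above suggest; this is an illustration, not the shipped driver code:

```python
from evalscope.perf.arguments import Arguments
from evalscope.perf.http_client import AioHttpClient
from evalscope.perf.plugin.api.base import ApiPluginBase


async def send_one(args: Arguments, api_plugin: ApiPluginBase) -> None:
    client = AioHttpClient(args, api_plugin)   # constructor shape assumed from the __init__ fields above
    request = api_plugin.build_request([{'role': 'user', 'content': 'hello'}])
    output = await client.post(request)        # a BenchmarkData, not an (is_error, status, data) generator
    if output.success:
        print(output.generated_text, output.first_chunk_latency)
    else:
        print('request failed:', output.error)
```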
evalscope/perf/main.py
CHANGED
@@ -4,7 +4,9 @@ import os
 import platform
 import threading
 import time
+import warnings
 from argparse import Namespace
+from logging import warn
 
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
@@ -79,9 +81,20 @@ def run_perf_benchmark(args):
     configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
 
     # Initialize wandb and swanlab
-
+    visualizer = args.visualizer
+    if visualizer is None:
+        if args.wandb_api_key is not None:
+            visualizer = 'wandb'
+            warnings.warn('--wandb-api-key is deprecated. Please use `--visualizer wandb` instead.', DeprecationWarning)
+        elif args.swanlab_api_key is not None:
+            visualizer = 'swanlab'
+            warnings.warn(
+                '--swanlab-api-key is deprecated. Please use `--visualizer swanlab` instead.', DeprecationWarning
+            )
+    args.visualizer = visualizer
+    if visualizer == 'wandb':
         init_wandb(args)
-
+    elif visualizer == 'swanlab':
         init_swanlab(args)
 
     # Initialize local server if needed
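The hunk above folds the deprecated `--wandb-api-key` / `--swanlab-api-key` selection into a new `visualizer` argument: the old flags still work but now only serve as a fallback, emit a `DeprecationWarning`, and set `args.visualizer`. A hedged sketch of the preferred programmatic usage, assuming the perf `Arguments` dataclass accepts these fields as keyword arguments; only the fields read in the hunk are shown, a real run needs the usual perf options:

```python
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

# Sketch only: field names match those read in the hunk above; other
# required perf options (parallel, number, dataset, ...) are omitted.
args = Arguments(
    url='http://127.0.0.1:8000/v1/chat/completions',
    model='my-model',
    api='openai',
    visualizer='swanlab',    # explicit selection replaces the implicit API-key switch
    swanlab_api_key='***',   # still used for authentication only
)
run_perf_benchmark(args)
```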
evalscope/perf/plugin/api/base.py
CHANGED
@@ -3,6 +3,7 @@ from abc import abstractmethod
 from typing import Any, AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 
 
 class ApiPluginBase:
@@ -28,13 +29,13 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_responses(self, responses: List, request:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
         """Parser responses and return number of request and response tokens.
 
         Args:
-            responses (List[
+            responses (List[Dict]): List of http response body, for stream output,
                 there are multiple responses, each is bytes, for general only one.
-            request (
+            request (str): The json string of request.
 
         Returns:
             Tuple: (Number of prompt_tokens and number of completion_tokens).
@@ -42,8 +43,9 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    async def process_request(
-
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -52,8 +54,8 @@ class ApiPluginBase:
             headers: The request headers
             body: The request body
 
-
-
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
         raise NotImplementedError
 
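`ApiPluginBase.process_request` is now a coroutine that returns one `BenchmarkData` per request rather than yielding `(is_error, status_code, data)` tuples. A minimal sketch of a conforming subclass, using only the `BenchmarkData` fields that `DefaultApiPlugin` sets later in this diff (`request`, `start_time`, `success`, `completed_time`, `query_latency`, `generated_text`, `error`); the class itself is hypothetical and skips streaming and token accounting:

```python
import json
import time

import aiohttp

from evalscope.perf.plugin.api.base import ApiPluginBase
from evalscope.perf.utils.benchmark_util import BenchmarkData


class EchoPlugin(ApiPluginBase):
    """Hypothetical subclass illustrating the new single-return contract."""

    async def process_request(self, client_session: aiohttp.ClientSession, url: str,
                              headers: dict, body: dict) -> BenchmarkData:
        output = BenchmarkData()
        output.request = json.dumps(body, ensure_ascii=False)
        start = time.perf_counter()
        output.start_time = start
        async with client_session.post(url, json=body, headers=headers) as response:
            text = await response.text()
            output.success = response.status == 200
        output.completed_time = time.perf_counter()
        output.query_latency = output.completed_time - start
        output.generated_text = text
        if not output.success:
            output.error = text
        return output
```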
evalscope/perf/plugin/api/custom_api.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Any, AsyncGenerator, Dict, List, Tuple, Union
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
 from evalscope.perf.plugin.registry import register_api
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -98,7 +99,7 @@ class CustomPlugin(ApiPluginBase):
 
         return payload
 
-    def parse_responses(self, responses: List[
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> Tuple[int, int]:
         """Parse API responses and return token counts.
 
         This method extracts the number of input and output tokens from the API responses.
@@ -106,8 +107,8 @@ class CustomPlugin(ApiPluginBase):
             to calculate it using a tokenizer.
 
         Args:
-            responses (List[
-            request (
+            responses (List[Dict]): List of API response strings.
+            request (str, optional): The original request, which might be needed for token calculation.
             **kwargs: Additional arguments.
 
         Returns:
@@ -160,8 +161,9 @@ class CustomPlugin(ApiPluginBase):
             logger.error(f'Error parsing responses: {e}')
             return 0, 0
 
-    async def process_request(
-
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         This method handles sending the request to your API and processing the response,
@@ -173,60 +175,13 @@ class CustomPlugin(ApiPluginBase):
             headers (Dict): The request headers.
             body (Dict): The request body.
 
-
-
-            - is_error: Whether the response indicates an error
-            - status_code: HTTP status code
-            - response_data: Response content
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
-
-
-
-
-            # Convert body to JSON
-            data = json.dumps(body, ensure_ascii=False)
-
-            # Send the request
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:  # noqa: E125
-                # Get the status code
-                status_code = response.status
-
-                # Check if it's a streaming response
-                if 'text/event-stream' in response.content_type:
-                    # Handle streaming response
-                    async for line in response.content:
-                        line_str = line.decode('utf-8').strip()
-                        if not line_str:
-                            continue
-
-                        # Check for data prefix in server-sent events
-                        if line_str.startswith('data: '):
-                            data = line_str[6:]  # Remove 'data: ' prefix
-
-                            # Check if it's the end of the stream
-                            if data == '[DONE]':
-                                break
-
-                            try:
-                                # Parse the JSON data
-                                parsed_data = json.loads(data)
-                                yield (False, status_code, json.dumps(parsed_data))
-                            except json.JSONDecodeError:
-                                yield (True, status_code, f'Failed to parse JSON: {data}')
-                else:
-                    # Handle regular response
-                    if 'application/json' in response.content_type:
-                        # JSON response
-                        content = await response.json()
-                        yield (status_code >= 400, status_code, json.dumps(content))
-                    else:
-                        # Text response
-                        content = await response.text()
-                        yield (status_code >= 400, status_code, content)
-
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, 500, str(e))
+        raise NotImplementedError(
+            'The `process_request` method must be implemented in a subclass. '
+            'For OpenAI-compatible APIs, consider inheriting from `DefaultApiPlugin` to reuse the default implementation.'  # noqa: E501
+        )
 
 
 if __name__ == '__main__':
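With the raw HTTP/SSE handling stripped out of the template, `CustomPlugin.process_request` now simply points users at `DefaultApiPlugin`. A hedged sketch of that route: inherit the transport and only shape the payload and token parsing. The registry name, the `build_request` signature, and the `self.param` attribute are assumptions about the plugin API, not taken verbatim from this diff:

```python
from typing import Any, Dict, List, Tuple

from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
from evalscope.perf.plugin.registry import register_api


@register_api('my_api')  # registry name is illustrative
class MyPlugin(DefaultApiPlugin):
    """Inherits process_request (HTTP + SSE handling) from DefaultApiPlugin."""

    def build_request(self, messages, param=None) -> Dict:
        param = param or self.param  # assumes the base class keeps Arguments on self.param
        # Illustrative OpenAI-style payload; shape it for your endpoint.
        return {'model': param.model, 'messages': messages, 'stream': True}

    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
        # Read token usage from the last response chunk if the server reports it.
        usage = (responses[-1] if responses else {}).get('usage') or {}
        return usage.get('prompt_tokens', 0), usage.get('completion_tokens', 0)
```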
evalscope/perf/plugin/api/default_api.py
CHANGED
@@ -1,24 +1,77 @@
 import aiohttp
+import codecs
 import json
-
-
+import sys
+import time
+import traceback
+from typing import Any, Dict
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
-from evalscope.perf.utils.
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
+class StreamedResponseHandler:
+    """Handles streaming HTTP responses by accumulating chunks until complete
+    messages are available."""
+
+    def __init__(self):
+        self.buffer = ''
+        # Keep decoder state across chunks to handle split multibyte sequences
+        self.decoder = codecs.getincrementaldecoder('utf-8')()
+
+    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
+        """Add a chunk of bytes to the buffer and return any complete
+        messages."""
+        # Use incremental decoding so incomplete multibyte sequences don't error
+        try:
+            chunk_str = self.decoder.decode(chunk_bytes, final=False)
+        except UnicodeDecodeError:
+            # Bad bytes: drop them and reset decoder state to avoid corruption
+            self.decoder.reset()
+            chunk_str = chunk_bytes.decode('utf-8', errors='ignore')
+        self.buffer += chunk_str
+
+        messages = []
+
+        # Split by double newlines (SSE message separator)
+        while '\n\n' in self.buffer:
+            message, self.buffer = self.buffer.split('\n\n', 1)
+            message = message.strip()
+            if message:
+                messages.append(message)
+
+        # if self.buffer is not empty, check if it is a complete message
+        # by removing data: prefix and check if it is a valid JSON
+        if self.buffer.startswith('data: '):
+            message_content = self.buffer.removeprefix('data: ').strip()
+            if message_content == '[DONE]':
+                messages.append(self.buffer.strip())
+                self.buffer = ''
+            elif message_content:
+                try:
+                    json.loads(message_content)
+                    messages.append(self.buffer.strip())
+                    self.buffer = ''
+                except json.JSONDecodeError:
+                    # Incomplete JSON, wait for more chunks.
+                    pass
+
+        return messages
+
+
 class DefaultApiPlugin(ApiPluginBase):
     """Default implementation of API plugin with common HTTP handling methods."""
 
     def __init__(self, param: Arguments):
         super().__init__(param)
 
-    async def process_request(
-
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -27,79 +80,135 @@ class DefaultApiPlugin(ApiPluginBase):
             headers: The request headers
             body: The request body
 
-
-
-        """
-        try:
-            headers = {'Content-Type': 'application/json', **headers}
-            data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:
-                async for result in self._handle_response(response):
-                    yield result
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, None, str(e))
-
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
-        """Handle streaming response from server-sent events.
-
-        Args:
-            response: The aiohttp response object containing a stream
-
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, data)
+        Returns:
+            BenchmarkData: Aggregated benchmarking data for the request/response.
         """
+        headers = {'Content-Type': 'application/json', **headers}
+        data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
+
+        output = BenchmarkData()
+        ttft = 0.0
+        generated_text = ''
+        st = time.perf_counter()
+        output.start_time = st
+        output.request = data
+        most_recent_timestamp = st
         try:
-            async
-
-            if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            async with client_session.post(url=url, data=data, headers=headers) as response:
+                content_type = response.headers.get('Content-Type', '')
+                if response.status == 200:
+                    # Handle streaming responses (SSE)
+                    if 'text/event-stream' in content_type:
+                        handler = StreamedResponseHandler()
+                        async for chunk_bytes in response.content.iter_any():
+
+                            if not chunk_bytes:
+                                continue
+
+                            messages = handler.add_chunk(chunk_bytes)
+                            for message in messages:
+                                # NOTE: SSE comments (often used as pings) start with
+                                # a colon. These are not JSON data payload and should
+                                # be skipped.
+                                if message.startswith(':'):
+                                    continue
+
+                                chunk = message.removeprefix('data: ')
+
+                                if chunk != '[DONE]':
+                                    timestamp = time.perf_counter()
+                                    data = json.loads(chunk)
+
+                                    if choices := data.get('choices'):
+                                        content = choices[0]['delta'].get('content')
+                                        # First token
+                                        if ttft == 0.0:
+                                            ttft = timestamp - st
+                                            output.first_chunk_latency = ttft
+
+                                        # Decoding phase
+                                        else:
+                                            output.inter_chunk_latency.append(timestamp - most_recent_timestamp)
+
+                                        generated_text += content or ''
+                                        output.response_messages.append(data)
+                                    elif usage := data.get('usage'):
+                                        output.prompt_tokens = usage.get('prompt_tokens')
+                                        output.completion_tokens = usage.get('completion_tokens')
+
+                                    most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.completed_time = most_recent_timestamp
+                        output.query_latency = most_recent_timestamp - st
+
+                    # Handle non-stream JSON responses
+                    elif 'application/json' in content_type or 'application/' in content_type:
+                        payload: Any
+                        try:
+                            payload = await response.json()
+                        except Exception:
+                            # Fallback to text if JSON parsing fails
+                            payload = await response.text()
+
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        # For non-stream, first chunk equals full latency
+                        output.first_chunk_latency = output.query_latency
+
+                        if isinstance(payload, dict):
+                            # Extract generated text from choices
+                            text = ''
+                            if choices := payload.get('choices'):
+                                first = choices[0] if choices else {}
+                                # Chat Completions format
+                                msg = first.get('message') or {}
+                                if isinstance(msg, dict) and msg.get('content') is not None:
+                                    text = msg.get('content') or ''
+                                else:
+                                    # Legacy Completions format
+                                    text = first.get('text') or ''
+                            generated_text = text
+
+                            # Extract usage if provided
+                            if usage := payload.get('usage'):
+                                output.prompt_tokens = usage.get('prompt_tokens')
+                                output.completion_tokens = usage.get('completion_tokens')
+
+                            output.response_messages.append(payload)
+                        else:
+                            generated_text = str(payload)
+
+                        output.generated_text = generated_text
+                        output.success = True
+
+                    else:
+                        # Unknown successful content-type: read as text
+                        raw = await response.text()
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        output.first_chunk_latency = output.query_latency
+                        output.generated_text = raw
+                        output.response_messages.append(raw)
+                        output.success = True
+                else:
+                    # Try to parse structured error, fallback to reason/text
+                    try:
+                        err_payload = await response.json()
+                        output.error = json.dumps(err_payload, ensure_ascii=False)
+                    except Exception:
+                        try:
+                            output.error = await response.text()
+                        except Exception:
+                            output.error = response.reason or ''
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = ''.join(traceback.format_exception(*exc_info))
+            logger.error(output.error)
+
+        return output
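The new `StreamedResponseHandler` only surfaces an SSE event once it is complete: either the blank-line terminator has arrived or the buffered `data:` payload parses as JSON (or is the `[DONE]` sentinel). A small standalone illustration of that buffering, mirroring the class as added above:

```python
from evalscope.perf.plugin.api.default_api import StreamedResponseHandler

handler = StreamedResponseHandler()

# First chunk ends mid-JSON: nothing is emitted yet.
assert handler.add_chunk(b'data: {"choices": [{"delta": {"cont') == []

# Second chunk completes the event; the buffered message is returned whole.
msgs = handler.add_chunk(b'ent": "Hi"}}]}\n\n')
assert msgs == ['data: {"choices": [{"delta": {"content": "Hi"}}]}']

# The terminal sentinel is surfaced as its own message.
assert handler.add_chunk(b'data: [DONE]\n\n') == ['data: [DONE]']
```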