sglang 0.5.1.post3__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +251 -26
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +37 -7
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +6 -4
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -420
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +6 -4
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +94 -58
- sglang/srt/entrypoints/engine.py +34 -14
- sglang/srt/entrypoints/http_server.py +172 -47
- sglang/srt/entrypoints/openai/protocol.py +63 -3
- sglang/srt/entrypoints/openai/serving_base.py +6 -2
- sglang/srt/entrypoints/openai/serving_chat.py +34 -19
- sglang/srt/entrypoints/openai/serving_completions.py +10 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/eplb/eplb_manager.py +28 -4
- sglang/srt/eplb/expert_distribution.py +55 -15
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +12 -0
- sglang/srt/layers/activation.py +44 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +250 -112
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/layernorm.py +54 -12
- sglang/srt/layers/logits_processor.py +10 -3
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +110 -49
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +43 -12
- sglang/srt/layers/moe/utils.py +6 -5
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +43 -29
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +107 -40
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -45
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +60 -42
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +83 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +28 -19
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/cache_controller.py +242 -278
- sglang/srt/managers/data_parallel_controller.py +30 -15
- sglang/srt/managers/detokenizer_manager.py +13 -2
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +160 -11
- sglang/srt/managers/mm_utils.py +6 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
- sglang/srt/managers/schedule_batch.py +27 -44
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +90 -115
- sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
- sglang/srt/managers/tokenizer_manager.py +41 -477
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +24 -22
- sglang/srt/mem_cache/hiradix_cache.py +184 -101
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +324 -41
- sglang/srt/mem_cache/memory_pool_host.py +25 -18
- sglang/srt/mem_cache/radix_cache.py +5 -6
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +149 -12
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +74 -19
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1 -3
- sglang/srt/metrics/collector.py +484 -63
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +72 -18
- sglang/srt/model_executor/model_runner.py +189 -31
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +33 -28
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/deepseek_v2.py +311 -50
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/gpt_oss.py +5 -18
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +17 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +33 -3
- sglang/srt/models/qwen2_5_vl.py +90 -42
- sglang/srt/models/qwen2_moe.py +79 -14
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/qwen3_next.py +1039 -0
- sglang/srt/models/qwen3_next_mtp.py +109 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +297 -79
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_worker.py +216 -120
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/utils.py +37 -2
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +181 -8
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_utils.py +25 -1
- sglang/utils.py +5 -0
- sglang/version.py +1 -1
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/METADATA +11 -9
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/RECORD +243 -194
- sglang/srt/disaggregation/launch_lb.py +0 -131
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
CHANGED
@@ -60,6 +60,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -654,7 +655,8 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
     # flashinfer uses this environment variable for various kernels from MoE to quant kernels
-    os.environ
+    if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+        os.environ["TRTLLM_ENABLE_PDL"] = "1"

     # Can also be passed as argument
     os.environ["SGLANG_RUN_ID"] = (
@@ -672,7 +674,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.
+            "0.3.1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -680,7 +682,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.
+            "0.3.9.post2",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

@@ -702,6 +704,24 @@ def _set_envs_and_config(server_args: ServerArgs):
     mp.set_start_method("spawn", force=True)


+def _init_tokenizer_manager(
+    server_args: ServerArgs, port_args: PortArgs
+) -> TokenizerManager:
+    # Launch tokenizer process
+    tokenizer_manager = TokenizerManager(server_args, port_args)
+
+    # Initialize templates
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+
+    return tokenizer_manager, template_manager
+
+
 def _launch_subprocesses(
     server_args: ServerArgs, port_args: Optional[PortArgs] = None
 ) -> Tuple[TokenizerManager, TemplateManager, Dict]:
@@ -815,17 +835,15 @@ def _launch_subprocesses(
     )
     detoken_proc.start()

-    #
-
-
-
-
-
-    tokenizer_manager=
-
-
-        completion_template=server_args.completion_template,
-    )
+    # Init tokenizer manager first, as the bootstrap server is initialized here
+    if server_args.tokenizer_worker_num > 1:
+        # Launch multi-tokenizer router
+        tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
+        template_manager = None
+    else:
+        tokenizer_manager, template_manager = _init_tokenizer_manager(
+            server_args, port_args
+        )

     # Wait for the model to finish loading
     scheduler_infos = []
@@ -848,5 +866,7 @@ def _launch_subprocesses(

     # Assume all schedulers have the same scheduler_info
     scheduler_info = scheduler_infos[0]
+
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
     return tokenizer_manager, template_manager, scheduler_info
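
In the _set_envs_and_config hunk above, TRTLLM_ENABLE_PDL is now forced to "1" only when the variable is not already set to "0", so PDL can be disabled from the environment before launch. A minimal sketch of the opt-out (only the variable name comes from the diff; everything else is illustrative):

    import os

    # Set before sglang's _set_envs_and_config runs; any value other than "0"
    # (or leaving the variable unset) still results in it being forced to "1".
    os.environ["TRTLLM_ENABLE_PDL"] = "0"
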
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -23,11 +23,14 @@ import json
 import logging
 import multiprocessing as multiprocessing
 import os
+import tempfile
 import threading
 import time
 from http import HTTPStatus
 from typing import Any, AsyncIterator, Callable, Dict, List, Optional

+import setproctitle
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

@@ -44,11 +47,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse

-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -91,11 +90,18 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import (
+    MultiTokenizerManager,
+    get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
+    read_from_shared_memory,
+    write_data_for_multi_tokenizer,
+)
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
-from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
     add_prometheus_middleware,
@@ -130,8 +136,72 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


+async def init_multi_tokenizer() -> ServerArgs:
+    """Read args information from shm and init tokenizer manager for current process"""
+    pid = os.getpid()
+    main_pid = get_main_process_id()
+    logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
+
+    # Read configuration from shared memory
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
+
+    port_args.tokenizer_ipc_name = (
+        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+    )
+
+    # Launch multi-tokenizer manager process
+    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+    # Register this tokenizer with the main tokenizer manager
+    await tokenizer_manager.register_to_main_tokenizer_manager()
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+    return server_args
+
+
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
+        # Initialize multi-tokenizer support for worker processes
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        logger.info(f"Worker {worker_pid} added prometheus middleware")
+        fast_api_app.warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                fast_api_app.server_args,
+                None,  # pipe_finish_writer not needed in worker
+                None,  # launch_callback not needed in worker
+            ),
+        )
+
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager
@@ -191,7 +261,15 @@ async def lifespan(fast_api_app: FastAPI):
     warmup_thread = getattr(fast_api_app, "warmup_thread", None)
     if warmup_thread is not None:
         warmup_thread.start()
-
+
+    try:
+        yield
+    finally:
+        if server_args.tokenizer_worker_num > 1:
+            pid = os.getpid()
+            logger.info(f"uvicorn worker {pid} ending...")
+            warmup_thread.join()
+            logger.info(f"uvicorn worker {pid} ended.")


 # Fast API
@@ -480,6 +558,16 @@ async def flush_cache():
     )


+@app.api_route("/clear_hicache_storage_backend", methods=["GET", "POST"])
+async def clear_hicache_storage_backend():
+    """Clear the hierarchical cache storage backend."""
+    ret = await _global_state.tokenizer_manager.clear_hicache_storage()
+    return Response(
+        content="Hierarchical cache storage backend cleared.\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
 @app.api_route("/start_profile", methods=["GET", "POST"])
 async def start_profile_async(obj: Optional[ProfileReqInput] = None):
     """Start profiling."""
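
A quick way to exercise the new /clear_hicache_storage_backend route once a server is running; this client call is illustrative (host and port are placeholders, while the route name and status behavior come from the hunk above):

    import requests

    # The route accepts GET or POST and returns 400 when the backend reports
    # a failure, 200 otherwise.
    resp = requests.post("http://localhost:30000/clear_hicache_storage_backend")
    print(resp.status_code, resp.text)
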
@@ -1068,9 +1156,21 @@ def launch_server(
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
-
-
-
+    if server_args.tokenizer_worker_num > 1:
+        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
+        port_args = PortArgs.init_new(server_args)
+        port_args.tokenizer_worker_ipc_name = (
+            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+        )
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args, port_args=port_args
+        )
+    else:
+        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args,
+        )
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1079,42 +1179,75 @@ def launch_server(
         )
     )

-
-
-
-
-    # Add prometheus middleware
-    if server_args.enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-
-    # Send a warmup request - we will create the thread launch it
-    # in the lifespan after all other warmups have fired.
-    warmup_thread = threading.Thread(
-        target=_wait_and_warmup,
-        args=(
+    if server_args.tokenizer_worker_num > 1:
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
             server_args,
-
-
-
-
-
+            scheduler_info,
+        )
+    else:
+        # Add api key authorization
+        if server_args.api_key:
+            add_api_key_middleware(app, server_args.api_key)
+
+        # Add prometheus middleware
+        if server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        # Send a warmup request - we will create the thread launch it
+        # in the lifespan after all other warmups have fired.
+        warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                server_args,
+                pipe_finish_writer,
+                launch_callback,
+            ),
+        )
+        app.warmup_thread = warmup_thread

     try:
         # Update logging configs
         set_uvicorn_logging_configs()
         app.server_args = server_args
         # Listen for HTTP requests
-
-
-
-
-
-
-
-
+        if server_args.tokenizer_worker_num > 1:
+            from uvicorn.config import LOGGING_CONFIG
+
+            LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
+                "handlers": ["default"],
+                "level": "INFO",
+                "propagate": False,
+            }
+
+            monkey_patch_uvicorn_multiprocessing()
+
+            uvicorn.run(
+                "sglang.srt.entrypoints.http_server:app",
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+                workers=server_args.tokenizer_worker_num,
+            )
+        else:
+            app.is_single_tokenizer_mode = True
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+            )
     finally:
-
+        if server_args.tokenizer_worker_num > 1:
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
+        else:
+            warmup_thread.join()


 def _execute_server_warmup(
@@ -1261,13 +1394,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())

-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()
sglang/srt/entrypoints/openai/protocol.py
CHANGED
@@ -460,6 +460,66 @@ class ChatCompletionRequest(BaseModel):
             values["tool_choice"] = "auto"
         return values

+    @model_validator(mode="before")
+    @classmethod
+    def normalize_reasoning_inputs(cls, values: Dict):
+        r = values.get("reasoning")
+        if r is None:
+            return values
+
+        if isinstance(r, dict):
+            effort = r.get("effort") or r.get("reasoning_effort")
+            if effort in {"low", "medium", "high"}:
+                values["reasoning_effort"] = effort
+
+            enabled = (
+                r.get("enabled")
+                if r.get("enabled") is not None
+                else r.get("enable", False)
+            )
+            if isinstance(enabled, str):
+                enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
+            if enabled:
+                ctk = values.get("chat_template_kwargs")
+                if not isinstance(ctk, dict):
+                    ctk = {}
+                ctk.setdefault("thinking", True)
+                values["chat_template_kwargs"] = ctk
+
+        return values
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_json_schema(cls, values):
+        response_format = values.get("response_format")
+        if not response_format:
+            return values
+
+        if response_format.get("type") != "json_schema":
+            return values
+
+        schema = response_format.pop("schema", None)
+        json_schema = response_format.get("json_schema")
+
+        if json_schema:
+            return values
+
+        if schema:
+            name_ = schema.get("title", "Schema")
+            strict_ = False
+            if "properties" in schema and "strict" in schema["properties"]:
+                item = schema["properties"].pop("strict", None)
+                if item and item.get("default", False):
+                    strict_ = True
+
+            response_format["json_schema"] = {
+                "name": name_,
+                "schema": schema,
+                "strict": strict_,
+            }
+
+        return values
+
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
     min_p: float = 0.0
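
Both new validators run in "before" mode, so their effect is easiest to see by building a request from the raw OpenAI-style fields. A minimal sketch; the model name and schema are placeholders, and the reasoning_effort / chat_template_kwargs fields are assumed to exist on the request model as the validators above imply:

    from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest

    req = ChatCompletionRequest(
        model="demo-model",
        messages=[{"role": "user", "content": "hi"}],
        # Normalized to reasoning_effort="high" plus chat_template_kwargs={"thinking": True}
        reasoning={"effort": "high", "enabled": True},
        # A bare "schema" is rewrapped into the OpenAI-style "json_schema" object
        response_format={"type": "json_schema", "schema": {"title": "Answer", "type": "object"}},
    )
    print(req.reasoning_effort, req.chat_template_kwargs, req.response_format)
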
@@ -482,9 +542,9 @@ class ChatCompletionRequest(BaseModel):
     rid: Optional[Union[List[str], str]] = None

     # For PD disaggregation
-    bootstrap_host: Optional[str] = None
-    bootstrap_port: Optional[int] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None


 class ChatMessage(BaseModel):
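
Mirroring rid above, the PD-disaggregation bootstrap fields can now carry one entry per item of a batched request instead of a single scalar. An illustrative request body only; hosts, ports, and room ids are made up:

    payload = {
        "model": "demo-model",
        "messages": [{"role": "user", "content": "hi"}],
        "bootstrap_host": ["10.0.0.1", "10.0.0.2"],
        "bootstrap_port": [8998, None],
        "bootstrap_room": [1001, 1002],
    }
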
sglang/srt/entrypoints/openai/serving_base.py
CHANGED
@@ -1,15 +1,19 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union

 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager

 logger = logging.getLogger(__name__)

sglang/srt/entrypoints/openai/serving_chat.py
CHANGED
@@ -1,14 +1,15 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

-from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -33,13 +34,16 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.jinja_template_utils import process_content_for_template_format
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.
-from sglang.srt.
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.conversation import generate_chat_conv
+from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


@@ -53,6 +57,7 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -172,10 +177,11 @@ class OpenAIServingChat(OpenAIServingBase):
                 ]
             else:
                 tools = [item.function.model_dump() for item in request.tools]
-
-
-
-
+            if self.tool_call_parser:
+                parser = FunctionCallParser(request.tools, self.tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )

         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -537,7 +543,11 @@ class OpenAIServingChat(OpenAIServingBase):
                 yield f"data: {chunk.model_dump_json()}\n\n"

             # Handle tool calls
-            if
+            if (
+                request.tool_choice != "none"
+                and request.tools
+                and self.tool_call_parser
+            ):
                 async for chunk in self._process_tool_call_stream(
                     index,
                     delta,
@@ -727,10 +737,13 @@ class OpenAIServingChat(OpenAIServingBase):

         # Handle tool calls
         tool_calls = None
-        if
-
+        if (
+            request.tool_choice != "none"
+            and request.tools
+            and self.tool_call_parser
+        ):
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text, request.tools,
+                text, request.tools, finish_reason
             )

         choice_data = ChatCompletionResponseChoice(
@@ -824,11 +837,10 @@ class OpenAIServingChat(OpenAIServingBase):
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
     ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
@@ -838,7 +850,10 @@ class OpenAIServingChat(OpenAIServingBase):
             tool_calls = []
             for call_info in call_info_list:
                 # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
-                if
+                if (
+                    self.tool_call_parser == "kimi_k2"
+                    and call_info.name is not None
+                ):
                     tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
                 else:
                     tool_id = f"call_{uuid.uuid4().hex[:24]}"
@@ -933,7 +948,7 @@ class OpenAIServingChat(OpenAIServingBase):
             if index not in parser_dict:
                 parser_dict[index] = FunctionCallParser(
                     tools=request.tools,
-                    tool_call_parser=self.
+                    tool_call_parser=self.tool_call_parser,
                 )
             parser = parser_dict[index]

@@ -962,7 +977,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 # Tool call ID should be generated only once per tool call
                 if call_item.name:
                     # First chunk: include ID and function name
-                    if self.
+                    if self.tool_call_parser == "kimi_k2":
                         # Align with Kimi-K2 format: functions.{name}:{index}
                         tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
                     else:
sglang/srt/entrypoints/openai/serving_completions.py
CHANGED
@@ -1,11 +1,12 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

-from sglang.srt.code_completion_parser import generate_completion_prompt_from_request
 from sglang.srt.entrypoints.openai.protocol import (
     CompletionRequest,
     CompletionResponse,
@@ -21,10 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.
-
+from sglang.srt.parser.code_completion_parser import (
+    generate_completion_prompt_from_request,
+)
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
