sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +54 -37
- sglang/bench_one_batch_server.py +340 -34
- sglang/bench_serving.py +340 -159
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +9 -2
- sglang/profiler.py +20 -3
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +309 -0
- sglang/srt/configs/load_config.py +33 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +284 -118
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +576 -0
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +6 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +26 -15
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +268 -98
- sglang/srt/disaggregation/decode.py +172 -39
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +203 -555
- sglang/srt/disaggregation/nixl/conn.py +217 -63
- sglang/srt/disaggregation/prefill.py +113 -270
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +203 -97
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +85 -65
- sglang/srt/entrypoints/grpc_server.py +632 -305
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +169 -17
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +327 -34
- sglang/srt/entrypoints/openai/serving_base.py +74 -8
- sglang/srt/entrypoints/openai/serving_chat.py +202 -118
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +20 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +47 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +323 -0
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +21 -16
- sglang/srt/function_call/glm4_moe_detector.py +4 -8
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +61 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +98 -7
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/grpc_request_manager.py +915 -0
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
- sglang/srt/layers/activation.py +11 -7
- sglang/srt/layers/attention/aiter_backend.py +17 -18
- sglang/srt/layers/attention/ascend_backend.py +125 -10
- sglang/srt/layers/attention/attention_registry.py +226 -0
- sglang/srt/layers/attention/base_attn_backend.py +32 -4
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +52 -15
- sglang/srt/layers/attention/flashinfer_backend.py +357 -212
- sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
- sglang/srt/layers/attention/flashmla_backend.py +9 -7
- sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
- sglang/srt/layers/attention/mamba/mamba.py +514 -1
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +23 -0
- sglang/srt/layers/attention/nsa_backend.py +1201 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +249 -42
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
- sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +61 -3
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +19 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +28 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +47 -15
- sglang/srt/layers/linear.py +30 -5
- sglang/srt/layers/logits_processor.py +161 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
- sglang/srt/layers/moe/ep_moe/layer.py +243 -448
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
- sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +27 -1
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +86 -20
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +43 -15
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +141 -81
- sglang/srt/layers/quantization/mxfp4.py +17 -34
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -24
- sglang/srt/layers/quantization/w8a8_int8.py +45 -27
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +750 -46
- sglang/srt/layers/sampler.py +84 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +23 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +9 -4
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +33 -7
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +41 -17
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +83 -152
- sglang/srt/managers/data_parallel_controller.py +156 -87
- sglang/srt/managers/detokenizer_manager.py +51 -24
- sglang/srt/managers/io_struct.py +223 -129
- sglang/srt/managers/mm_utils.py +49 -10
- sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +130 -0
- sglang/srt/managers/schedule_batch.py +340 -529
- sglang/srt/managers/schedule_policy.py +158 -18
- sglang/srt/managers/scheduler.py +665 -620
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
- sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
- sglang/srt/managers/tokenizer_manager.py +462 -226
- sglang/srt/managers/tp_worker.py +217 -156
- sglang/srt/managers/utils.py +79 -47
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +42 -28
- sglang/srt/mem_cache/base_prefix_cache.py +3 -3
- sglang/srt/mem_cache/chunk_cache.py +20 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +38 -0
- sglang/srt/mem_cache/hicache_storage.py +44 -2
- sglang/srt/mem_cache/hiradix_cache.py +134 -34
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +602 -208
- sglang/srt/mem_cache/memory_pool_host.py +134 -183
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +263 -78
- sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +115 -58
- sglang/srt/metrics/collector.py +113 -120
- sglang/srt/metrics/func_timer.py +3 -8
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +81 -36
- sglang/srt/model_executor/forward_batch_info.py +40 -50
- sglang/srt/model_executor/model_runner.py +507 -319
- sglang/srt/model_executor/npu_graph_runner.py +11 -5
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +438 -37
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +200 -27
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +40 -56
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +25 -4
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +793 -235
- sglang/srt/models/dots_ocr.py +171 -0
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +570 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -3
- sglang/srt/models/glm4_moe.py +17 -40
- sglang/srt/models/glm4_moe_nextn.py +4 -4
- sglang/srt/models/glm4v.py +3 -2
- sglang/srt/models/glm4v_moe.py +6 -6
- sglang/srt/models/gpt_oss.py +12 -35
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +4 -2
- sglang/srt/models/llama.py +6 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +6 -23
- sglang/srt/models/longcat_flash_nextn.py +4 -15
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +27 -6
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +5 -5
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +70 -4
- sglang/srt/models/qwen2_vl.py +6 -3
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +50 -38
- sglang/srt/models/qwen3_next.py +43 -21
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +791 -0
- sglang/srt/models/qwen3_vl_moe.py +343 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +268 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +61 -0
- sglang/srt/multimodal/processors/base_processor.py +21 -9
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +2 -4
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +20 -10
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +83 -17
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +36 -23
- sglang/srt/sampling/sampling_params.py +75 -0
- sglang/srt/server_args.py +1300 -338
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +161 -0
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
- sglang/srt/speculative/eagle_info.py +786 -0
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +113 -1270
- sglang/srt/speculative/eagle_worker.py +120 -285
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/ngram_info.py +433 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +49 -0
- sglang/srt/speculative/spec_utils.py +641 -0
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +35 -18
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/{utils.py → utils/common.py} +583 -113
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +120 -11
- sglang/test/runners.py +3 -1
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +8 -2
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +3 -4
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +430 -0
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +93 -1
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +432 -16
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
- sglang/srt/entrypoints/grpc_request_manager.py +0 -580
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
|
@@ -7,8 +7,10 @@ import time
|
|
|
7
7
|
import uuid
|
|
8
8
|
from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
|
|
9
9
|
|
|
10
|
+
import orjson
|
|
10
11
|
from fastapi import Request
|
|
11
12
|
from fastapi.responses import ORJSONResponse, StreamingResponse
|
|
13
|
+
from jsonschema import Draft202012Validator, SchemaError
|
|
12
14
|
|
|
13
15
|
from sglang.srt.entrypoints.openai.protocol import (
|
|
14
16
|
ChatCompletionRequest,
|
|
@@ -25,6 +27,8 @@ from sglang.srt.entrypoints.openai.protocol import (
|
|
|
25
27
|
LogProbs,
|
|
26
28
|
MessageProcessingResult,
|
|
27
29
|
ToolCall,
|
|
30
|
+
ToolCallProcessingResult,
|
|
31
|
+
ToolChoice,
|
|
28
32
|
TopLogprob,
|
|
29
33
|
)
|
|
30
34
|
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
|
|
@@ -33,12 +37,14 @@ from sglang.srt.entrypoints.openai.utils import (
|
|
|
33
37
|
process_hidden_states_from_ret,
|
|
34
38
|
to_openai_style_logprobs,
|
|
35
39
|
)
|
|
40
|
+
from sglang.srt.function_call.core_types import ToolCallItem
|
|
36
41
|
from sglang.srt.function_call.function_call_parser import FunctionCallParser
|
|
42
|
+
from sglang.srt.function_call.json_array_parser import JsonArrayParser
|
|
43
|
+
from sglang.srt.function_call.utils import get_json_schema_constraint
|
|
37
44
|
from sglang.srt.managers.io_struct import GenerateReqInput
|
|
38
45
|
from sglang.srt.parser.conversation import generate_chat_conv
|
|
39
46
|
from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
|
|
40
47
|
from sglang.srt.parser.reasoning_parser import ReasoningParser
|
|
41
|
-
from sglang.utils import convert_json_schema_to_str
|
|
42
48
|
|
|
43
49
|
if TYPE_CHECKING:
|
|
44
50
|
from sglang.srt.managers.template_manager import TemplateManager
|
|
@@ -58,6 +64,16 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
58
64
|
super().__init__(tokenizer_manager)
|
|
59
65
|
self.template_manager = template_manager
|
|
60
66
|
self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
|
|
67
|
+
self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
|
|
68
|
+
|
|
69
|
+
# Get default sampling parameters from model's generation config
|
|
70
|
+
self.default_sampling_params = (
|
|
71
|
+
self.tokenizer_manager.model_config.get_default_sampling_params()
|
|
72
|
+
)
|
|
73
|
+
if self.default_sampling_params:
|
|
74
|
+
logger.info(
|
|
75
|
+
f"Using default chat sampling params from model generation config: {self.default_sampling_params}",
|
|
76
|
+
)
|
|
61
77
|
|
|
62
78
|
def _request_id_prefix(self) -> str:
|
|
63
79
|
return "chatcmpl-"
|
|
@@ -74,6 +90,23 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
74
90
|
):
|
|
75
91
|
return "Tools cannot be empty if tool choice is set to required."
|
|
76
92
|
|
|
93
|
+
if request.tool_choice is not None and not isinstance(request.tool_choice, str):
|
|
94
|
+
if not request.tools:
|
|
95
|
+
return "Tools cannot be empty if tool choice is set to a specific tool."
|
|
96
|
+
tool_name = request.tool_choice.function.name
|
|
97
|
+
tool_exists = any(tool.function.name == tool_name for tool in request.tools)
|
|
98
|
+
if not tool_exists:
|
|
99
|
+
return f"Tool '{tool_name}' not found in tools list."
|
|
100
|
+
|
|
101
|
+
# Validate tool definitions
|
|
102
|
+
for i, tool in enumerate(request.tools or []):
|
|
103
|
+
if tool.function.parameters is None:
|
|
104
|
+
continue
|
|
105
|
+
try:
|
|
106
|
+
Draft202012Validator.check_schema(tool.function.parameters)
|
|
107
|
+
except SchemaError as e:
|
|
108
|
+
return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
|
|
109
|
+
|
|
77
110
|
max_output_tokens = request.max_completion_tokens or request.max_tokens
|
|
78
111
|
server_context_length = self.tokenizer_manager.server_args.context_length
|
|
79
112
|
if (
|
|
@@ -113,10 +146,10 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
113
146
|
processed_messages = self._process_messages(request, is_multimodal)
|
|
114
147
|
|
|
115
148
|
# Build sampling parameters
|
|
116
|
-
sampling_params =
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
processed_messages.tool_call_constraint,
|
|
149
|
+
sampling_params = request.to_sampling_params(
|
|
150
|
+
stop=processed_messages.stop,
|
|
151
|
+
model_generation_config=self.default_sampling_params,
|
|
152
|
+
tool_call_constraint=processed_messages.tool_call_constraint,
|
|
120
153
|
)
|
|
121
154
|
|
|
122
155
|
# Handle single vs multiple requests
|
|
@@ -128,8 +161,19 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
128
161
|
else:
|
|
129
162
|
prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
|
|
130
163
|
|
|
131
|
-
# Extract
|
|
132
|
-
|
|
164
|
+
# Extract custom labels from raw request headers
|
|
165
|
+
custom_labels = self.extract_custom_labels(raw_request)
|
|
166
|
+
|
|
167
|
+
# Resolve LoRA adapter from model parameter or explicit lora_path
|
|
168
|
+
lora_path = self._resolve_lora_path(request.model, request.lora_path)
|
|
169
|
+
if lora_path:
|
|
170
|
+
first_adapter = (
|
|
171
|
+
lora_path
|
|
172
|
+
if isinstance(lora_path, str)
|
|
173
|
+
else next((a for a in lora_path if a), None)
|
|
174
|
+
)
|
|
175
|
+
if first_adapter:
|
|
176
|
+
self._validate_lora_enabled(first_adapter)
|
|
133
177
|
|
|
134
178
|
adapted_request = GenerateReqInput(
|
|
135
179
|
**prompt_kwargs,
|
|
@@ -143,13 +187,16 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
143
187
|
stream=request.stream,
|
|
144
188
|
return_text_in_logprobs=True,
|
|
145
189
|
modalities=processed_messages.modalities,
|
|
146
|
-
lora_path=
|
|
190
|
+
lora_path=lora_path,
|
|
147
191
|
bootstrap_host=request.bootstrap_host,
|
|
148
192
|
bootstrap_port=request.bootstrap_port,
|
|
149
193
|
bootstrap_room=request.bootstrap_room,
|
|
150
194
|
return_hidden_states=request.return_hidden_states,
|
|
151
195
|
rid=request.rid,
|
|
152
|
-
|
|
196
|
+
extra_key=self._compute_extra_key(request),
|
|
197
|
+
priority=request.priority,
|
|
198
|
+
custom_labels=custom_labels,
|
|
199
|
+
custom_logit_processor=request.custom_logit_processor,
|
|
153
200
|
)
|
|
154
201
|
|
|
155
202
|
return adapted_request, request
|
|
@@ -187,6 +234,14 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
187
234
|
tool_call_constraint = parser.get_structure_constraint(
|
|
188
235
|
request.tool_choice
|
|
189
236
|
)
|
|
237
|
+
# Handle JSON schema constraint directly for required or named tool choice
|
|
238
|
+
if request.tool_choice == "required" or isinstance(
|
|
239
|
+
request.tool_choice, ToolChoice
|
|
240
|
+
):
|
|
241
|
+
json_schema = get_json_schema_constraint(
|
|
242
|
+
request.tools, request.tool_choice
|
|
243
|
+
)
|
|
244
|
+
tool_call_constraint = ("json_schema", json_schema)
|
|
190
245
|
|
|
191
246
|
# Use chat template
|
|
192
247
|
if self.template_manager.chat_template_name is None:
|
|
@@ -243,7 +298,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
243
298
|
if "arguments" in item["function"] and isinstance(
|
|
244
299
|
item["function"]["arguments"], str
|
|
245
300
|
):
|
|
246
|
-
item["function"]["arguments"] =
|
|
301
|
+
item["function"]["arguments"] = orjson.loads(
|
|
247
302
|
item["function"]["arguments"]
|
|
248
303
|
)
|
|
249
304
|
|
|
@@ -376,68 +431,6 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
376
431
|
stop=stop,
|
|
377
432
|
)
|
|
378
433
|
|
|
379
|
-
def _build_sampling_params(
|
|
380
|
-
self,
|
|
381
|
-
request: ChatCompletionRequest,
|
|
382
|
-
stop: List[str],
|
|
383
|
-
tool_call_constraint: Optional[Any],
|
|
384
|
-
) -> Dict[str, Any]:
|
|
385
|
-
"""Build sampling parameters for the request"""
|
|
386
|
-
|
|
387
|
-
sampling_params = {
|
|
388
|
-
"temperature": request.temperature,
|
|
389
|
-
"max_new_tokens": request.max_tokens or request.max_completion_tokens,
|
|
390
|
-
"min_new_tokens": request.min_tokens,
|
|
391
|
-
"stop": stop,
|
|
392
|
-
"stop_token_ids": request.stop_token_ids,
|
|
393
|
-
"top_p": request.top_p,
|
|
394
|
-
"top_k": request.top_k,
|
|
395
|
-
"min_p": request.min_p,
|
|
396
|
-
"presence_penalty": request.presence_penalty,
|
|
397
|
-
"frequency_penalty": request.frequency_penalty,
|
|
398
|
-
"repetition_penalty": request.repetition_penalty,
|
|
399
|
-
"regex": request.regex,
|
|
400
|
-
"ebnf": request.ebnf,
|
|
401
|
-
"n": request.n,
|
|
402
|
-
"no_stop_trim": request.no_stop_trim,
|
|
403
|
-
"ignore_eos": request.ignore_eos,
|
|
404
|
-
"skip_special_tokens": request.skip_special_tokens,
|
|
405
|
-
"logit_bias": request.logit_bias,
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
if request.response_format and request.response_format.type == "json_schema":
|
|
409
|
-
sampling_params["json_schema"] = convert_json_schema_to_str(
|
|
410
|
-
request.response_format.json_schema.schema_
|
|
411
|
-
)
|
|
412
|
-
elif request.response_format and request.response_format.type == "json_object":
|
|
413
|
-
sampling_params["json_schema"] = '{"type": "object"}'
|
|
414
|
-
elif (
|
|
415
|
-
request.response_format and request.response_format.type == "structural_tag"
|
|
416
|
-
):
|
|
417
|
-
sampling_params["structural_tag"] = convert_json_schema_to_str(
|
|
418
|
-
request.response_format.model_dump(by_alias=True)
|
|
419
|
-
)
|
|
420
|
-
|
|
421
|
-
# Check if there are already existing output constraints
|
|
422
|
-
has_existing_constraints = (
|
|
423
|
-
sampling_params.get("regex")
|
|
424
|
-
or sampling_params.get("ebnf")
|
|
425
|
-
or sampling_params.get("structural_tag")
|
|
426
|
-
or sampling_params.get("json_schema")
|
|
427
|
-
)
|
|
428
|
-
|
|
429
|
-
if tool_call_constraint and has_existing_constraints:
|
|
430
|
-
logger.warning("Constrained decoding is not compatible with tool calls.")
|
|
431
|
-
elif tool_call_constraint:
|
|
432
|
-
constraint_type, constraint_value = tool_call_constraint
|
|
433
|
-
if constraint_type == "structural_tag":
|
|
434
|
-
sampling_params[constraint_type] = convert_json_schema_to_str(
|
|
435
|
-
constraint_value.model_dump(by_alias=True)
|
|
436
|
-
)
|
|
437
|
-
else:
|
|
438
|
-
sampling_params[constraint_type] = constraint_value
|
|
439
|
-
return sampling_params
|
|
440
|
-
|
|
441
434
|
async def _handle_streaming_request(
|
|
442
435
|
self,
|
|
443
436
|
adapted_request: GenerateReqInput,
|
|
@@ -526,10 +519,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
526
519
|
stream_buffers[index] = stream_buffer + delta
|
|
527
520
|
|
|
528
521
|
# Handle reasoning content
|
|
529
|
-
if
|
|
530
|
-
self.tokenizer_manager.server_args.reasoning_parser
|
|
531
|
-
and request.separate_reasoning
|
|
532
|
-
):
|
|
522
|
+
if self.reasoning_parser and request.separate_reasoning:
|
|
533
523
|
reasoning_text, delta = self._process_reasoning_stream(
|
|
534
524
|
index, delta, reasoning_parser_dict, content, request
|
|
535
525
|
)
|
|
@@ -719,7 +709,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
719
709
|
|
|
720
710
|
# Handle reasoning content
|
|
721
711
|
reasoning_text = None
|
|
722
|
-
reasoning_parser = self.
|
|
712
|
+
reasoning_parser = self.reasoning_parser
|
|
723
713
|
if reasoning_parser and request.separate_reasoning:
|
|
724
714
|
is_force_reasoning = (
|
|
725
715
|
self.template_manager.force_reasoning
|
|
@@ -747,8 +737,13 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
747
737
|
and request.tools
|
|
748
738
|
and self.tool_call_parser
|
|
749
739
|
):
|
|
740
|
+
history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
|
|
750
741
|
tool_calls, text, finish_reason = self._process_tool_calls(
|
|
751
|
-
text,
|
|
742
|
+
text,
|
|
743
|
+
request.tools,
|
|
744
|
+
finish_reason,
|
|
745
|
+
request.tool_choice,
|
|
746
|
+
history_tool_calls_cnt,
|
|
752
747
|
)
|
|
753
748
|
|
|
754
749
|
choice_data = ChatCompletionResponseChoice(
|
|
@@ -838,13 +833,76 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
838
833
|
token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
|
|
839
834
|
return ChoiceLogprobs(content=token_logprobs)
|
|
840
835
|
|
|
836
|
+
def _process_tool_call_id(
|
|
837
|
+
self,
|
|
838
|
+
call_item: ToolCallItem,
|
|
839
|
+
history_tool_calls_cnt: int,
|
|
840
|
+
) -> str:
|
|
841
|
+
"""Process for generating a new and unique `tool_call_id`"""
|
|
842
|
+
if self.tool_call_parser != "kimi_k2":
|
|
843
|
+
# A simple uuid is sufficient for all models except for Kimi-K2.
|
|
844
|
+
tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
|
|
845
|
+
return tool_call_id
|
|
846
|
+
else:
|
|
847
|
+
# Align with Kimi-K2 format: functions.{name}:{index}
|
|
848
|
+
# Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
|
|
849
|
+
# Therefore, the index must be corrected by using `history_tool_calls_cnt + call_item.tool_index` to ensure globally unique and properly ordered.
|
|
850
|
+
tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
|
|
851
|
+
logger.debug(
|
|
852
|
+
f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
|
|
853
|
+
)
|
|
854
|
+
return tool_call_id
|
|
855
|
+
|
|
841
856
|
def _process_tool_calls(
|
|
842
857
|
self,
|
|
843
858
|
text: str,
|
|
844
859
|
tools: List[Any],
|
|
845
860
|
finish_reason: Dict[str, Any],
|
|
846
|
-
|
|
861
|
+
tool_choice: Optional[Union[str, ToolChoice]] = None,
|
|
862
|
+
history_tool_calls_cnt: int = 0,
|
|
863
|
+
) -> ToolCallProcessingResult:
|
|
847
864
|
"""Process tool calls in the response"""
|
|
865
|
+
|
|
866
|
+
# Handle required or named tool choice
|
|
867
|
+
if tool_choice == "required" or (
|
|
868
|
+
isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
|
|
869
|
+
):
|
|
870
|
+
# Set finish reason to tool_calls since we're processing tool calls
|
|
871
|
+
if finish_reason["type"] == "stop":
|
|
872
|
+
finish_reason["type"] = "tool_calls"
|
|
873
|
+
finish_reason["matched"] = None
|
|
874
|
+
try:
|
|
875
|
+
# For required tool choice, we expect a JSON array of tool calls
|
|
876
|
+
tool_call_data = orjson.loads(text)
|
|
877
|
+
tool_calls = []
|
|
878
|
+
for i, tool in enumerate(tool_call_data):
|
|
879
|
+
# Create a ToolCallItem from the JSON data
|
|
880
|
+
call_info = ToolCallItem(
|
|
881
|
+
tool_index=i, # Use the loop index as tool_index
|
|
882
|
+
name=tool["name"],
|
|
883
|
+
parameters=json.dumps(tool["parameters"], ensure_ascii=False),
|
|
884
|
+
)
|
|
885
|
+
tool_id = self._process_tool_call_id(
|
|
886
|
+
call_info, history_tool_calls_cnt
|
|
887
|
+
)
|
|
888
|
+
tool_calls.append(
|
|
889
|
+
ToolCall(
|
|
890
|
+
id=tool_id,
|
|
891
|
+
index=i,
|
|
892
|
+
function=FunctionResponse(
|
|
893
|
+
name=tool["name"],
|
|
894
|
+
arguments=json.dumps(
|
|
895
|
+
tool["parameters"], ensure_ascii=False
|
|
896
|
+
),
|
|
897
|
+
),
|
|
898
|
+
)
|
|
899
|
+
)
|
|
900
|
+
return ToolCallProcessingResult(tool_calls, "", finish_reason)
|
|
901
|
+
except json.JSONDecodeError as e:
|
|
902
|
+
logger.error(f"Tool call parsing error: {e}")
|
|
903
|
+
return ToolCallProcessingResult(None, text, finish_reason)
|
|
904
|
+
|
|
905
|
+
# Use parser since output is not constrained by JSON schema
|
|
848
906
|
parser = FunctionCallParser(tools, self.tool_call_parser)
|
|
849
907
|
if parser.has_tool_call(text):
|
|
850
908
|
if finish_reason["type"] == "stop":
|
|
@@ -854,15 +912,9 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
854
912
|
text, call_info_list = parser.parse_non_stream(text)
|
|
855
913
|
tool_calls = []
|
|
856
914
|
for call_info in call_info_list:
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
and call_info.name is not None
|
|
861
|
-
):
|
|
862
|
-
tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
|
|
863
|
-
else:
|
|
864
|
-
tool_id = f"call_{uuid.uuid4().hex[:24]}"
|
|
865
|
-
|
|
915
|
+
tool_id = self._process_tool_call_id(
|
|
916
|
+
call_info, history_tool_calls_cnt
|
|
917
|
+
)
|
|
866
918
|
tool_calls.append(
|
|
867
919
|
ToolCall(
|
|
868
920
|
id=tool_id,
|
|
@@ -872,13 +924,13 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
872
924
|
),
|
|
873
925
|
)
|
|
874
926
|
)
|
|
875
|
-
return tool_calls, text, finish_reason
|
|
927
|
+
return ToolCallProcessingResult(tool_calls, text, finish_reason)
|
|
876
928
|
except Exception as e:
|
|
877
929
|
logger.error(f"Tool call parsing error: {e}")
|
|
878
930
|
# Return error but don't fail the whole request
|
|
879
|
-
return None, text, finish_reason
|
|
931
|
+
return ToolCallProcessingResult(None, text, finish_reason)
|
|
880
932
|
|
|
881
|
-
return None, text, finish_reason
|
|
933
|
+
return ToolCallProcessingResult(None, text, finish_reason)
|
|
882
934
|
|
|
883
935
|
def _process_streaming_logprobs(
|
|
884
936
|
self, content: Dict[str, Any], n_prev_token: int
|
|
@@ -911,13 +963,33 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
911
963
|
or self._get_enable_thinking_from_request(request)
|
|
912
964
|
)
|
|
913
965
|
reasoning_parser_dict[index] = ReasoningParser(
|
|
914
|
-
self.
|
|
966
|
+
self.reasoning_parser,
|
|
915
967
|
request.stream_reasoning,
|
|
916
968
|
is_force_reasoning,
|
|
917
969
|
)
|
|
918
970
|
reasoning_parser = reasoning_parser_dict[index]
|
|
919
971
|
return reasoning_parser.parse_stream_chunk(delta)
|
|
920
972
|
|
|
973
|
+
def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
|
|
974
|
+
"""Counts the number of tool calls in the request's message history.
|
|
975
|
+
|
|
976
|
+
NOTE: This method is only useful for models that include self-increasing
|
|
977
|
+
history tool call idx in tool calls id, such as kimi-k2
|
|
978
|
+
|
|
979
|
+
Args:
|
|
980
|
+
request: The chat completion request object.
|
|
981
|
+
|
|
982
|
+
Returns:
|
|
983
|
+
The total number of tool calls in the history, or 0 if not applicable.
|
|
984
|
+
"""
|
|
985
|
+
messages = getattr(request, "messages", [])
|
|
986
|
+
idx = 0
|
|
987
|
+
for msg in messages:
|
|
988
|
+
if msg.role == "assistant":
|
|
989
|
+
tool_calls = getattr(msg, "tool_calls", None)
|
|
990
|
+
idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa
|
|
991
|
+
return idx
|
|
992
|
+
|
|
921
993
|
def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
|
|
922
994
|
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.
|
|
923
995
|
|
|
@@ -931,11 +1003,11 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
931
1003
|
"""
|
|
932
1004
|
if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
|
|
933
1005
|
# For Qwen3 models, `enable_thinking` is supported.
|
|
934
|
-
if
|
|
935
|
-
return request.chat_template_kwargs.get("enable_thinking")
|
|
1006
|
+
if self.reasoning_parser in ["qwen3", "glm45"]:
|
|
1007
|
+
return request.chat_template_kwargs.get("enable_thinking", False)
|
|
936
1008
|
# For DeepSeek-V3.1 models, `thinking` is supported.
|
|
937
|
-
elif
|
|
938
|
-
return request.chat_template_kwargs.get("thinking")
|
|
1009
|
+
elif self.reasoning_parser in ["deepseek-v3"]:
|
|
1010
|
+
return request.chat_template_kwargs.get("thinking", False)
|
|
939
1011
|
else:
|
|
940
1012
|
return False
|
|
941
1013
|
return False
|
|
@@ -951,13 +1023,25 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
951
1023
|
):
|
|
952
1024
|
"""Process tool calls in streaming response"""
|
|
953
1025
|
if index not in parser_dict:
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
)
|
|
1026
|
+
# Use JSON detector directly for required or named tool choice
|
|
1027
|
+
if request.tool_choice == "required" or isinstance(
|
|
1028
|
+
request.tool_choice, ToolChoice
|
|
1029
|
+
):
|
|
1030
|
+
parser_dict[index] = JsonArrayParser()
|
|
1031
|
+
else:
|
|
1032
|
+
parser_dict[index] = FunctionCallParser(
|
|
1033
|
+
tools=request.tools,
|
|
1034
|
+
tool_call_parser=self.tool_call_parser,
|
|
1035
|
+
)
|
|
1036
|
+
|
|
958
1037
|
parser = parser_dict[index]
|
|
959
1038
|
|
|
960
|
-
|
|
1039
|
+
# Handle both FunctionCallParser and JsonArrayParser
|
|
1040
|
+
if isinstance(parser, JsonArrayParser):
|
|
1041
|
+
result = parser.parse_streaming_increment(delta, request.tools)
|
|
1042
|
+
normal_text, calls = result.normal_text, result.calls
|
|
1043
|
+
else:
|
|
1044
|
+
normal_text, calls = parser.parse_stream_chunk(delta)
|
|
961
1045
|
|
|
962
1046
|
# Yield normal text
|
|
963
1047
|
if normal_text:
|
|
@@ -975,6 +1059,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
975
1059
|
yield f"data: {chunk.model_dump_json()}\n\n"
|
|
976
1060
|
|
|
977
1061
|
# Yield tool calls
|
|
1062
|
+
history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
|
|
978
1063
|
for call_item in calls:
|
|
979
1064
|
# Mark that this choice has tool calls
|
|
980
1065
|
has_tool_calls[index] = True
|
|
@@ -982,11 +1067,9 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
982
1067
|
# Tool call ID should be generated only once per tool call
|
|
983
1068
|
if call_item.name:
|
|
984
1069
|
# First chunk: include ID and function name
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
else:
|
|
989
|
-
tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
|
|
1070
|
+
tool_call_id = self._process_tool_call_id(
|
|
1071
|
+
call_item, history_tool_calls_cnt
|
|
1072
|
+
)
|
|
990
1073
|
function_name = call_item.name
|
|
991
1074
|
else:
|
|
992
1075
|
# Subsequent chunks: null ID and name for argument deltas
|
|
@@ -1017,7 +1100,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
1017
1100
|
|
|
1018
1101
|
def _check_for_unstreamed_tool_args(
|
|
1019
1102
|
self,
|
|
1020
|
-
parser: FunctionCallParser,
|
|
1103
|
+
parser: Union[FunctionCallParser, JsonArrayParser],
|
|
1021
1104
|
content: Dict[str, Any],
|
|
1022
1105
|
request: ChatCompletionRequest,
|
|
1023
1106
|
index: int,
|
|
@@ -1027,30 +1110,31 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
|
1027
1110
|
when generation finishes. This ensures tool calls are properly completed
|
|
1028
1111
|
even if the model generates the final arguments in the last chunk.
|
|
1029
1112
|
"""
|
|
1030
|
-
#
|
|
1113
|
+
# Get the detector - either from FunctionCallParser or directly if json detector
|
|
1114
|
+
detector = parser.detector if hasattr(parser, "detector") else parser
|
|
1115
|
+
|
|
1116
|
+
# Only check if we have tool calls and the detector has tracked data
|
|
1031
1117
|
if (
|
|
1032
|
-
not hasattr(
|
|
1033
|
-
or not
|
|
1118
|
+
not hasattr(detector, "prev_tool_call_arr")
|
|
1119
|
+
or not detector.prev_tool_call_arr
|
|
1034
1120
|
):
|
|
1035
1121
|
return None
|
|
1036
1122
|
|
|
1037
1123
|
if (
|
|
1038
|
-
not hasattr(
|
|
1039
|
-
or not
|
|
1124
|
+
not hasattr(detector, "streamed_args_for_tool")
|
|
1125
|
+
or not detector.streamed_args_for_tool
|
|
1040
1126
|
):
|
|
1041
1127
|
return None
|
|
1042
1128
|
|
|
1043
1129
|
# Get the last tool call that was being processed
|
|
1044
|
-
tool_index = len(
|
|
1045
|
-
if tool_index < 0 or tool_index >= len(
|
|
1130
|
+
tool_index = len(detector.prev_tool_call_arr) - 1
|
|
1131
|
+
if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
|
|
1046
1132
|
return None
|
|
1047
1133
|
|
|
1048
1134
|
# Get expected vs actual arguments
|
|
1049
|
-
expected_args =
|
|
1050
|
-
"arguments", {}
|
|
1051
|
-
)
|
|
1135
|
+
expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
|
|
1052
1136
|
expected_call = json.dumps(expected_args, ensure_ascii=False)
|
|
1053
|
-
actual_call =
|
|
1137
|
+
actual_call = detector.streamed_args_for_tool[tool_index]
|
|
1054
1138
|
|
|
1055
1139
|
# Check if there are remaining arguments to send
|
|
1056
1140
|
remaining_call = (
|