sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/openai/serving_chat.py

@@ -1,12 +1,15 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from jsonschema import Draft202012Validator, SchemaError

 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -23,6 +26,8 @@ from sglang.srt.entrypoints.openai.protocol import (
     LogProbs,
     MessageProcessingResult,
     ToolCall,
+    ToolCallProcessingResult,
+    ToolChoice,
     TopLogprob,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
@@ -31,14 +36,18 @@ from sglang.srt.entrypoints.openai.utils import (
     process_hidden_states_from_ret,
     to_openai_style_logprobs,
 )
+from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.function_call.json_array_parser import JsonArrayParser
+from sglang.srt.function_call.utils import get_json_schema_constraint
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
 from sglang.srt.parser.reasoning_parser import ReasoningParser
-
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager

 logger = logging.getLogger(__name__)

@@ -53,6 +62,17 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+
+        # Get default sampling parameters from model's generation config
+        self.default_sampling_params = (
+            self.tokenizer_manager.model_config.get_default_sampling_params()
+        )
+        if self.default_sampling_params:
+            logger.info(
+                f"Using default chat sampling params from model generation config: {self.default_sampling_params}",
+            )

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
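
Note on the constructor change above: `default_sampling_params` is read once from the model's generation config and later passed to `request.to_sampling_params(...)` (see the hunk at line 145 below). A minimal sketch of plausible merge semantics; the helper name, the merge rule, and the example defaults are illustrative assumptions, not sglang API:

def merge_sampling_params(model_defaults, request_params):
    # Hypothetical merge: generation-config defaults fill in only the fields
    # the request left unset; explicit request values always win.
    merged = dict(model_defaults or {})
    merged.update({k: v for k, v in request_params.items() if v is not None})
    return merged

# e.g. a generation_config.json shipping {"temperature": 0.6, "top_p": 0.95}
print(merge_sampling_params(
    {"temperature": 0.6, "top_p": 0.95},
    {"temperature": 1.0, "max_new_tokens": 128},
))
# -> {'temperature': 1.0, 'top_p': 0.95, 'max_new_tokens': 128}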
@@ -69,6 +89,23 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."

+        if request.tool_choice is not None and not isinstance(request.tool_choice, str):
+            if not request.tools:
+                return "Tools cannot be empty if tool choice is set to a specific tool."
+            tool_name = request.tool_choice.function.name
+            tool_exists = any(tool.function.name == tool_name for tool in request.tools)
+            if not tool_exists:
+                return f"Tool '{tool_name}' not found in tools list."
+
+        # Validate tool definitions
+        for i, tool in enumerate(request.tools or []):
+            if tool.function.parameters is None:
+                continue
+            try:
+                Draft202012Validator.check_schema(tool.function.parameters)
+            except SchemaError as e:
+                return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
+
         max_output_tokens = request.max_completion_tokens or request.max_tokens
         server_context_length = self.tokenizer_manager.server_args.context_length
         if (
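
The validation added above uses the `jsonschema` package's published API: `Draft202012Validator.check_schema` raises `SchemaError` when a tool's `parameters` block is not itself a valid JSON Schema. A standalone sketch of the same check (the helper name is ours):

from jsonschema import Draft202012Validator, SchemaError

def check_tool_parameters(parameters: dict):
    # Returns an error message if `parameters` is not a valid JSON Schema.
    try:
        Draft202012Validator.check_schema(parameters)
    except SchemaError as e:
        return f"invalid 'parameters' schema: {e.message}"
    return None

print(check_tool_parameters({"type": "object", "properties": {"city": {"type": "string"}}}))  # None
print(check_tool_parameters({"type": 42}))  # reports a SchemaError for the bogus "type"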
@@ -91,6 +128,7 @@ class OpenAIServingChat(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
         reasoning_effort = (
             request.chat_template_kwargs.pop("reasoning_effort", None)
@@ -107,10 +145,10 @@ class OpenAIServingChat(OpenAIServingBase):
         processed_messages = self._process_messages(request, is_multimodal)

         # Build sampling parameters
-        sampling_params = self._build_sampling_params(
-            request,
-            processed_messages.stop,
-            processed_messages.tool_call_constraint,
+        sampling_params = request.to_sampling_params(
+            stop=processed_messages.stop,
+            model_generation_config=self.default_sampling_params,
+            tool_call_constraint=processed_messages.tool_call_constraint,
         )

         # Handle single vs multiple requests
@@ -122,6 +160,9 @@ class OpenAIServingChat(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}

+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             image_data=processed_messages.image_data,
@@ -140,6 +181,9 @@ class OpenAIServingChat(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )

         return adapted_request, request
@@ -172,10 +216,19 @@ class OpenAIServingChat(OpenAIServingBase):
             ]
         else:
             tools = [item.function.model_dump() for item in request.tools]
-
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
-            parser = FunctionCallParser(request.tools, tool_call_parser)
-            tool_call_constraint = parser.get_structure_constraint(request.tool_choice)
+            if self.tool_call_parser:
+                parser = FunctionCallParser(request.tools, self.tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )
+                # Handle JSON schema constraint directly for required or named tool choice
+                if request.tool_choice == "required" or isinstance(
+                    request.tool_choice, ToolChoice
+                ):
+                    json_schema = get_json_schema_constraint(
+                        request.tools, request.tool_choice
+                    )
+                    tool_call_constraint = ("json_schema", json_schema)

         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -365,68 +418,6 @@ class OpenAIServingChat(OpenAIServingBase):
             stop=stop,
         )

-    def _build_sampling_params(
-        self,
-        request: ChatCompletionRequest,
-        stop: List[str],
-        tool_call_constraint: Optional[Any],
-    ) -> Dict[str, Any]:
-        """Build sampling parameters for the request"""
-
-        sampling_params = {
-            "temperature": request.temperature,
-            "max_new_tokens": request.max_tokens or request.max_completion_tokens,
-            "min_new_tokens": request.min_tokens,
-            "stop": stop,
-            "stop_token_ids": request.stop_token_ids,
-            "top_p": request.top_p,
-            "top_k": request.top_k,
-            "min_p": request.min_p,
-            "presence_penalty": request.presence_penalty,
-            "frequency_penalty": request.frequency_penalty,
-            "repetition_penalty": request.repetition_penalty,
-            "regex": request.regex,
-            "ebnf": request.ebnf,
-            "n": request.n,
-            "no_stop_trim": request.no_stop_trim,
-            "ignore_eos": request.ignore_eos,
-            "skip_special_tokens": request.skip_special_tokens,
-            "logit_bias": request.logit_bias,
-        }
-
-        if request.response_format and request.response_format.type == "json_schema":
-            sampling_params["json_schema"] = convert_json_schema_to_str(
-                request.response_format.json_schema.schema_
-            )
-        elif request.response_format and request.response_format.type == "json_object":
-            sampling_params["json_schema"] = '{"type": "object"}'
-        elif (
-            request.response_format and request.response_format.type == "structural_tag"
-        ):
-            sampling_params["structural_tag"] = convert_json_schema_to_str(
-                request.response_format.model_dump(by_alias=True)
-            )
-
-        # Check if there are already existing output constraints
-        has_existing_constraints = (
-            sampling_params.get("regex")
-            or sampling_params.get("ebnf")
-            or sampling_params.get("structural_tag")
-            or sampling_params.get("json_schema")
-        )
-
-        if tool_call_constraint and has_existing_constraints:
-            logger.warning("Constrained decoding is not compatible with tool calls.")
-        elif tool_call_constraint:
-            constraint_type, constraint_value = tool_call_constraint
-            if constraint_type == "structural_tag":
-                sampling_params[constraint_type] = convert_json_schema_to_str(
-                    constraint_value.model_dump(by_alias=True)
-                )
-            else:
-                sampling_params[constraint_type] = constraint_value
-        return sampling_params
-
     async def _handle_streaming_request(
         self,
         adapted_request: GenerateReqInput,
@@ -515,10 +506,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 stream_buffers[index] = stream_buffer + delta

                 # Handle reasoning content
-                if (
-                    self.tokenizer_manager.server_args.reasoning_parser
-                    and request.separate_reasoning
-                ):
+                if self.reasoning_parser and request.separate_reasoning:
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
                     )
@@ -537,7 +525,11 @@ class OpenAIServingChat(OpenAIServingBase):
                         yield f"data: {chunk.model_dump_json()}\n\n"

                 # Handle tool calls
-                if request.tool_choice != "none" and request.tools:
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and self.tool_call_parser
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -704,7 +696,7 @@ class OpenAIServingChat(OpenAIServingBase):

         # Handle reasoning content
         reasoning_text = None
-        reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+        reasoning_parser = self.reasoning_parser
         if reasoning_parser and request.separate_reasoning:
             is_force_reasoning = (
                 self.template_manager.force_reasoning
@@ -727,10 +719,18 @@ class OpenAIServingChat(OpenAIServingBase):

         # Handle tool calls
         tool_calls = None
-        if request.tool_choice != "none" and request.tools:
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        if (
+            request.tool_choice != "none"
+            and request.tools
+            and self.tool_call_parser
+        ):
+            history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text, request.tools, tool_call_parser, finish_reason
+                text,
+                request.tools,
+                finish_reason,
+                request.tool_choice,
+                history_tool_calls_cnt,
             )

         choice_data = ChatCompletionResponseChoice(
@@ -820,15 +820,77 @@ class OpenAIServingChat(OpenAIServingBase):
         token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
         return ChoiceLogprobs(content=token_logprobs)

+    def _process_tool_call_id(
+        self,
+        call_item: ToolCallItem,
+        history_tool_calls_cnt: int,
+    ) -> str:
+        """Process for generating a new and unique `tool_call_id`"""
+        if self.tool_call_parser != "kimi_k2":
+            # A simple uuid is sufficient for all models except for Kimi-K2.
+            tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+            return tool_call_id
+        else:
+            # Align with Kimi-K2 format: functions.{name}:{index}
+            # Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
+            # Therefore, the index must be corrected by using `history_tool_calls_cnt + call_item.tool_index` to ensure globally unique and properly ordered.
+            tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
+            logger.debug(
+                f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
+            )
+            return tool_call_id
+
     def _process_tool_calls(
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
-    ):
+        tool_choice: Optional[Union[str, ToolChoice]] = None,
+        history_tool_calls_cnt: int = 0,
+    ) -> ToolCallProcessingResult:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+
+        # Handle required or named tool choice
+        if tool_choice == "required" or (
+            isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
+        ):
+            # Set finish reason to tool_calls since we're processing tool calls
+            if finish_reason["type"] == "stop":
+                finish_reason["type"] = "tool_calls"
+                finish_reason["matched"] = None
+            try:
+                # For required tool choice, we expect a JSON array of tool calls
+                tool_call_data = json.loads(text)
+                tool_calls = []
+                for i, tool in enumerate(tool_call_data):
+                    # Create a ToolCallItem from the JSON data
+                    call_info = ToolCallItem(
+                        tool_index=i,  # Use the loop index as tool_index
+                        name=tool["name"],
+                        parameters=json.dumps(tool["parameters"], ensure_ascii=False),
+                    )
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
+                    tool_calls.append(
+                        ToolCall(
+                            id=tool_id,
+                            index=i,
+                            function=FunctionResponse(
+                                name=tool["name"],
+                                arguments=json.dumps(
+                                    tool["parameters"], ensure_ascii=False
+                                ),
+                            ),
+                        )
+                    )
+                return ToolCallProcessingResult(tool_calls, "", finish_reason)
+            except json.JSONDecodeError as e:
+                logger.error(f"Tool call parsing error: {e}")
+                return ToolCallProcessingResult(None, text, finish_reason)
+
+        # Use parser since output is not constrained by JSON schema
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
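
The id scheme introduced by `_process_tool_call_id` above is easy to reproduce in isolation; this sketch (our helper, not sglang API) shows why the history offset matters for Kimi-K2's ordinal ids:

import uuid

def make_tool_call_id(parser: str, name: str, tool_index: int, history_cnt: int) -> str:
    if parser != "kimi_k2":
        return f"call_{uuid.uuid4().hex[:24]}"
    # Kimi-K2 ids are ordinal across the whole conversation, so the
    # message-local tool_index is offset by the historical call count.
    return f"functions.{name}:{history_cnt + tool_index}"

# Two calls already sit in the history, so the next one gets index 2:
print(make_tool_call_id("kimi_k2", "get_weather", 0, 2))  # functions.get_weather:2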
@@ -837,12 +899,9 @@ class OpenAIServingChat(OpenAIServingBase):
                 text, call_info_list = parser.parse_non_stream(text)
                 tool_calls = []
                 for call_info in call_info_list:
-                    # Use Kimi-K2 style ids (functions.{name}:{index}) when applicable
-                    if tool_call_parser == "kimi_k2" and call_info.name:
-                        tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
-                    else:
-                        tool_id = f"call_{uuid.uuid4().hex[:24]}"
-
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
                     tool_calls.append(
                         ToolCall(
                             id=tool_id,
@@ -852,13 +911,13 @@ class OpenAIServingChat(OpenAIServingBase):
                             ),
                         )
                     )
-                return tool_calls, text, finish_reason
+                return ToolCallProcessingResult(tool_calls, text, finish_reason)
             except Exception as e:
                 logger.error(f"Tool call parsing error: {e}")
                 # Return error but don't fail the whole request
-                return None, text, finish_reason
+                return ToolCallProcessingResult(None, text, finish_reason)

-        return None, text, finish_reason
+        return ToolCallProcessingResult(None, text, finish_reason)

     def _process_streaming_logprobs(
         self, content: Dict[str, Any], n_prev_token: int
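
One consequence of the `tool_choice == "required"` branch above: the model output is no longer free-form text but a JSON array (constrained through the json_schema path), so parsing reduces to a plain `json.loads`. An illustration of the expected payload shape, following the field names used in the diff:

import json

text = '[{"name": "get_weather", "parameters": {"city": "Paris"}}]'
for i, tool in enumerate(json.loads(text)):
    arguments = json.dumps(tool["parameters"], ensure_ascii=False)
    print(i, tool["name"], arguments)  # 0 get_weather {"city": "Paris"}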
@@ -891,13 +950,33 @@ class OpenAIServingChat(OpenAIServingBase):
                 or self._get_enable_thinking_from_request(request)
             )
             reasoning_parser_dict[index] = ReasoningParser(
-                self.tokenizer_manager.server_args.reasoning_parser,
+                self.reasoning_parser,
                 request.stream_reasoning,
                 is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)

+    def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
+        """Counts the number of tool calls in the request's message history.
+
+        NOTE: This method is only useful for models that include self-increasing
+        history tool call idx in tool calls id, such as kimi-k2
+
+        Args:
+            request: The chat completion request object.
+
+        Returns:
+            The total number of tool calls in the history, or 0 if not applicable.
+        """
+        messages = getattr(request, "messages", [])
+        idx = 0
+        for msg in messages:
+            if msg.role == "assistant":
+                tool_calls = getattr(msg, "tool_calls", None)
+                idx += len(list(tool_calls)) if tool_calls is not None else 0  # noqa
+        return idx
+
     def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
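
A standalone sketch of what `_get_history_tool_calls_cnt` computes, using plain dicts in place of the pydantic message objects the real method receives:

messages = [
    {"role": "user", "content": "Weather in two cities, please."},
    {"role": "assistant", "tool_calls": [
        {"id": "functions.get_weather:0"},
        {"id": "functions.get_weather:1"},
    ]},
    {"role": "tool", "content": "sunny / rainy"},
]

history_cnt = sum(
    len(m.get("tool_calls") or [])
    for m in messages
    if m["role"] == "assistant"
)
print(history_cnt)  # 2 -> the next Kimi-K2 id would be functions.{name}:2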
@@ -911,11 +990,11 @@ class OpenAIServingChat(OpenAIServingBase):
         """
         if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
             # For Qwen3 models, `enable_thinking` is supported.
-            if self.tokenizer_manager.server_args.reasoning_parser in ["qwen3", "glm45"]:
-                return request.chat_template_kwargs.get("enable_thinking")
+            if self.reasoning_parser in ["qwen3", "glm45"]:
+                return request.chat_template_kwargs.get("enable_thinking", False)
             # For DeepSeek-V3.1 models, `thinking` is supported.
-            elif self.tokenizer_manager.server_args.reasoning_parser in ["deepseek-v3"]:
-                return request.chat_template_kwargs.get("thinking")
+            elif self.reasoning_parser in ["deepseek-v3"]:
+                return request.chat_template_kwargs.get("thinking", False)
             else:
                 return False
         return False
@@ -931,13 +1010,25 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         """Process tool calls in streaming response"""
         if index not in parser_dict:
-            parser_dict[index] = FunctionCallParser(
-                tools=request.tools,
-                tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
-            )
+            # Use JSON detector directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                parser_dict[index] = JsonArrayParser()
+            else:
+                parser_dict[index] = FunctionCallParser(
+                    tools=request.tools,
+                    tool_call_parser=self.tool_call_parser,
+                )
+
         parser = parser_dict[index]

-        normal_text, calls = parser.parse_stream_chunk(delta)
+        # Handle both FunctionCallParser and JsonArrayParser
+        if isinstance(parser, JsonArrayParser):
+            result = parser.parse_streaming_increment(delta, request.tools)
+            normal_text, calls = result.normal_text, result.calls
+        else:
+            normal_text, calls = parser.parse_stream_chunk(delta)

         # Yield normal text
         if normal_text:
@@ -955,6 +1046,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 yield f"data: {chunk.model_dump_json()}\n\n"

         # Yield tool calls
+        history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
         for call_item in calls:
             # Mark that this choice has tool calls
             has_tool_calls[index] = True
@@ -962,11 +1054,9 @@ class OpenAIServingChat(OpenAIServingBase):
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
-                # Use Kimi-K2 style ids (functions.{name}:{index}) when applicable
-                if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
-                    tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
-                else:
-                    tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+                tool_call_id = self._process_tool_call_id(
+                    call_item, history_tool_calls_cnt
+                )
                 function_name = call_item.name
             else:
                 # Subsequent chunks: null ID and name for argument deltas
@@ -997,7 +1087,7 @@ class OpenAIServingChat(OpenAIServingBase):

     def _check_for_unstreamed_tool_args(
         self,
-        parser: FunctionCallParser,
+        parser: Union[FunctionCallParser, JsonArrayParser],
         content: Dict[str, Any],
         request: ChatCompletionRequest,
         index: int,
@@ -1007,30 +1097,31 @@ class OpenAIServingChat(OpenAIServingBase):
         when generation finishes. This ensures tool calls are properly completed
         even if the model generates the final arguments in the last chunk.
         """
-        # Only check if we have tool calls and the parser has tracked data
+        # Get the detector - either from FunctionCallParser or directly if json detector
+        detector = parser.detector if hasattr(parser, "detector") else parser
+
+        # Only check if we have tool calls and the detector has tracked data
         if (
-            not hasattr(parser.detector, "prev_tool_call_arr")
-            or not parser.detector.prev_tool_call_arr
+            not hasattr(detector, "prev_tool_call_arr")
+            or not detector.prev_tool_call_arr
         ):
             return None

         if (
-            not hasattr(parser.detector, "streamed_args_for_tool")
-            or not parser.detector.streamed_args_for_tool
+            not hasattr(detector, "streamed_args_for_tool")
+            or not detector.streamed_args_for_tool
         ):
             return None

         # Get the last tool call that was being processed
-        tool_index = len(parser.detector.prev_tool_call_arr) - 1
-        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+        tool_index = len(detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
             return None

         # Get expected vs actual arguments
-        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
-            "arguments", {}
-        )
+        expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
         expected_call = json.dumps(expected_args, ensure_ascii=False)
-        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+        actual_call = detector.streamed_args_for_tool[tool_index]

         # Check if there are remaining arguments to send
         remaining_call = (
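
The `detector = parser.detector if hasattr(parser, "detector") else parser` line above is plain duck typing: `FunctionCallParser` wraps a detector, while `JsonArrayParser` tracks `prev_tool_call_arr` and `streamed_args_for_tool` itself. A toy sketch of the dispatch (class names are stand-ins, not sglang types):

class WrappedParser:
    # Stands in for FunctionCallParser, which exposes a .detector attribute.
    def __init__(self, detector):
        self.detector = detector

class BareDetector:
    # Stands in for JsonArrayParser, which tracks streaming state directly.
    prev_tool_call_arr = [{"arguments": {"city": "Paris"}}]
    streamed_args_for_tool = ['{"city": "Par']

for parser in (WrappedParser(BareDetector()), BareDetector()):
    detector = parser.detector if hasattr(parser, "detector") else parser
    print(type(detector).__name__)  # BareDetector, both times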

sglang/srt/entrypoints/openai/serving_completions.py

@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -20,13 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.code_completion_parser import (
     generate_completion_prompt_from_request,
 )
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)

@@ -55,6 +59,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, CompletionRequest]:
         """Convert OpenAI completion request to internal format"""
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -85,6 +90,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": prompt}

+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -99,6 +107,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )

         return adapted_request, request

sglang/srt/entrypoints/openai/serving_embedding.py

@@ -1,4 +1,6 @@
-from typing import Any, Dict, List, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse
@@ -13,10 +15,12 @@ from sglang.srt.entrypoints.openai.protocol import (
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.managers.io_struct import EmbeddingReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_embedding_convs

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+

 class OpenAIServingEmbedding(OpenAIServingBase):
     """Handler for v1/embeddings requests"""
@@ -70,6 +74,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: EmbeddingRequest,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
         """Convert OpenAI embedding request to internal format"""
         prompt = request.input
@@ -120,6 +125,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
         adapted_request = EmbeddingReqInput(
             **prompt_kwargs,
             rid=request.rid,
+            priority=request.priority,
         )

         return adapted_request, request

sglang/srt/entrypoints/openai/serving_rerank.py

@@ -45,7 +45,9 @@ class OpenAIServingRerank(OpenAIServingBase):
         return None

     def _convert_to_internal_request(
-        self, request: V1RerankReqInput
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
         """Convert OpenAI rerank request to internal embedding format"""
         # Create pairs of [query, document] for each document