sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/entrypoints/openai/serving_chat.py
+++ b/sglang/srt/entrypoints/openai/serving_chat.py
@@ -1,12 +1,15 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from jsonschema import Draft202012Validator, SchemaError
 
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -23,6 +26,8 @@ from sglang.srt.entrypoints.openai.protocol import (
     LogProbs,
     MessageProcessingResult,
     ToolCall,
+    ToolCallProcessingResult,
+    ToolChoice,
     TopLogprob,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
@@ -31,15 +36,20 @@ from sglang.srt.entrypoints.openai.utils import (
     process_hidden_states_from_ret,
     to_openai_style_logprobs,
 )
+from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.function_call.json_array_parser import JsonArrayParser
+from sglang.srt.function_call.utils import get_json_schema_constraint
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 
@@ -53,6 +63,8 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -69,6 +81,23 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."
 
+        if request.tool_choice is not None and not isinstance(request.tool_choice, str):
+            if not request.tools:
+                return "Tools cannot be empty if tool choice is set to a specific tool."
+            tool_name = request.tool_choice.function.name
+            tool_exists = any(tool.function.name == tool_name for tool in request.tools)
+            if not tool_exists:
+                return f"Tool '{tool_name}' not found in tools list."
+
+        # Validate tool definitions
+        for i, tool in enumerate(request.tools or []):
+            if tool.function.parameters is None:
+                continue
+            try:
+                Draft202012Validator.check_schema(tool.function.parameters)
+            except SchemaError as e:
+                return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
+
         max_output_tokens = request.max_completion_tokens or request.max_tokens
         server_context_length = self.tokenizer_manager.server_args.context_length
         if (
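The new tool validation leans on `jsonschema`'s Draft 2020-12 meta-schema check: each tool's `parameters` block is validated before generation starts, so a malformed schema fails the request up front instead of inside the grammar backend. A minimal standalone sketch of that check (same `jsonschema` API the diff imports; the example schemas are illustrative):

```python
from jsonschema import Draft202012Validator, SchemaError

tools_parameters = [
    {"type": "object", "properties": {"city": {"type": "string"}}},  # valid
    {"type": "objekt"},  # invalid: "objekt" is not a JSON Schema type
]

for i, params in enumerate(tools_parameters):
    try:
        # check_schema validates against the Draft 2020-12 meta-schema
        Draft202012Validator.check_schema(params)
        print(f"Tool {i}: parameters schema OK")
    except SchemaError as e:
        print(f"Tool {i} function has invalid 'parameters' schema: {e.message}")
```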
@@ -91,6 +120,7 @@ class OpenAIServingChat(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
         reasoning_effort = (
             request.chat_template_kwargs.pop("reasoning_effort", None)
@@ -122,6 +152,9 @@ class OpenAIServingChat(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
 
+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             image_data=processed_messages.image_data,
@@ -140,6 +173,9 @@ class OpenAIServingChat(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )
 
         return adapted_request, request
@@ -172,10 +208,19 @@ class OpenAIServingChat(OpenAIServingBase):
             ]
         else:
             tools = [item.function.model_dump() for item in request.tools]
-        parser = FunctionCallParser(
-            request.tools, self.tokenizer_manager.server_args.tool_call_parser
-        )
-        tool_call_constraint = parser.get_structure_constraint(request.tool_choice)
+        if self.tool_call_parser:
+            parser = FunctionCallParser(request.tools, self.tool_call_parser)
+            tool_call_constraint = parser.get_structure_constraint(
+                request.tool_choice
+            )
+            # Handle JSON schema constraint directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                json_schema = get_json_schema_constraint(
+                    request.tools, request.tool_choice
+                )
+                tool_call_constraint = ("json_schema", json_schema)
 
         # Use chat template
         if self.template_manager.chat_template_name is None:
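For `tool_choice="required"` or a named tool, decoding is now constrained by a JSON schema from `get_json_schema_constraint` rather than by the model-specific tool grammar. A hedged sketch of the kind of array schema such a constraint plausibly takes (the exact shape `get_json_schema_constraint` emits may differ; the tool list is hypothetical):

```python
import json

# Hypothetical tool list; only the schema shape is the point here.
tools = [
    {
        "name": "get_weather",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    }
]

# An array-of-tool-calls schema: the sampler must emit at least one
# {"name": ..., "parameters": ...} object naming a declared tool.
constraint = {
    "type": "array",
    "minItems": 1,
    "items": {
        "type": "object",
        "properties": {
            "name": {"enum": [t["name"] for t in tools]},
            "parameters": tools[0]["parameters"],
        },
        "required": ["name", "parameters"],
    },
}
print(json.dumps(constraint, indent=2))
```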
@@ -423,6 +468,10 @@ class OpenAIServingChat(OpenAIServingBase):
             sampling_params[constraint_type] = convert_json_schema_to_str(
                 constraint_value.model_dump(by_alias=True)
             )
+        elif constraint_type == "json_schema":
+            sampling_params[constraint_type] = convert_json_schema_to_str(
+                constraint_value
+            )
         else:
             sampling_params[constraint_type] = constraint_value
         return sampling_params
@@ -515,10 +564,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 stream_buffers[index] = stream_buffer + delta
 
                 # Handle reasoning content
-                if (
-                    self.tokenizer_manager.server_args.reasoning_parser
-                    and request.separate_reasoning
-                ):
+                if self.reasoning_parser and request.separate_reasoning:
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
                     )
@@ -537,7 +583,11 @@ class OpenAIServingChat(OpenAIServingBase):
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
                 # Handle tool calls
-                if request.tool_choice != "none" and request.tools:
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and self.tool_call_parser
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -704,7 +754,7 @@ class OpenAIServingChat(OpenAIServingBase):
 
         # Handle reasoning content
         reasoning_text = None
-        reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+        reasoning_parser = self.reasoning_parser
         if reasoning_parser and request.separate_reasoning:
             is_force_reasoning = (
                 self.template_manager.force_reasoning
@@ -727,10 +777,18 @@ class OpenAIServingChat(OpenAIServingBase):
 
         # Handle tool calls
         tool_calls = None
-        if request.tool_choice != "none" and request.tools:
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        if (
+            request.tool_choice != "none"
+            and request.tools
+            and self.tool_call_parser
+        ):
+            history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text, request.tools, tool_call_parser, finish_reason
+                text,
+                request.tools,
+                finish_reason,
+                request.tool_choice,
+                history_tool_calls_cnt,
             )
 
         choice_data = ChatCompletionResponseChoice(
@@ -820,15 +878,77 @@ class OpenAIServingChat(OpenAIServingBase):
         token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
         return ChoiceLogprobs(content=token_logprobs)
 
+    def _process_tool_call_id(
+        self,
+        call_item: ToolCallItem,
+        history_tool_calls_cnt: int,
+    ) -> str:
+        """Process for generating a new and unique `tool_call_id`"""
+        if self.tool_call_parser != "kimi_k2":
+            # A simple uuid is sufficient for all models except for Kimi-K2.
+            tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+            return tool_call_id
+        else:
+            # Align with Kimi-K2 format: functions.{name}:{index}
+            # Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
+            # Therefore, the index must be corrected by using `history_tool_calls_cnt + call_item.tool_index` to ensure globally unique and properly ordered.
+            tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
+            logger.debug(
+                f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
+            )
+            return tool_call_id
+
     def _process_tool_calls(
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
-    ):
+        tool_choice: Optional[Union[str, ToolChoice]] = None,
+        history_tool_calls_cnt: int = 0,
+    ) -> ToolCallProcessingResult:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+
+        # Handle required or named tool choice
+        if tool_choice == "required" or (
+            isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
+        ):
+            # Set finish reason to tool_calls since we're processing tool calls
+            if finish_reason["type"] == "stop":
+                finish_reason["type"] = "tool_calls"
+                finish_reason["matched"] = None
+            try:
+                # For required tool choice, we expect a JSON array of tool calls
+                tool_call_data = json.loads(text)
+                tool_calls = []
+                for i, tool in enumerate(tool_call_data):
+                    # Create a ToolCallItem from the JSON data
+                    call_info = ToolCallItem(
+                        tool_index=i,  # Use the loop index as tool_index
+                        name=tool["name"],
+                        parameters=json.dumps(tool["parameters"], ensure_ascii=False),
+                    )
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
+                    tool_calls.append(
+                        ToolCall(
+                            id=tool_id,
+                            index=i,
+                            function=FunctionResponse(
+                                name=tool["name"],
+                                arguments=json.dumps(
+                                    tool["parameters"], ensure_ascii=False
+                                ),
+                            ),
+                        )
+                    )
+                return ToolCallProcessingResult(tool_calls, "", finish_reason)
+            except json.JSONDecodeError as e:
+                logger.error(f"Tool call parsing error: {e}")
+                return ToolCallProcessingResult(None, text, finish_reason)
+
+        # Use parser since output is not constrained by JSON schema
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
@@ -837,12 +957,9 @@ class OpenAIServingChat(OpenAIServingBase):
                 text, call_info_list = parser.parse_non_stream(text)
                 tool_calls = []
                 for call_info in call_info_list:
-                    # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
-                    if tool_call_parser == "kimi_k2":
-                        tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
-                    else:
-                        tool_id = f"call_{uuid.uuid4().hex[:24]}"
-
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
                     tool_calls.append(
                         ToolCall(
                             id=tool_id,
@@ -852,13 +969,13 @@ class OpenAIServingChat(OpenAIServingBase):
                             ),
                         )
                     )
-                return tool_calls, text, finish_reason
+                return ToolCallProcessingResult(tool_calls, text, finish_reason)
             except Exception as e:
                 logger.error(f"Tool call parsing error: {e}")
                 # Return error but don't fail the whole request
-                return None, text, finish_reason
+                return ToolCallProcessingResult(None, text, finish_reason)
 
-        return None, text, finish_reason
+        return ToolCallProcessingResult(None, text, finish_reason)
 
     def _process_streaming_logprobs(
         self, content: Dict[str, Any], n_prev_token: int
@@ -891,13 +1008,33 @@ class OpenAIServingChat(OpenAIServingBase):
                 or self._get_enable_thinking_from_request(request)
             )
             reasoning_parser_dict[index] = ReasoningParser(
-                self.tokenizer_manager.server_args.reasoning_parser,
+                self.reasoning_parser,
                 request.stream_reasoning,
                 is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)
 
+    def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
+        """Counts the number of tool calls in the request's message history.
+
+        NOTE: This method is only useful for models that include self-increasing
+        history tool call idx in tool calls id, such as kimi-k2
+
+        Args:
+            request: The chat completion request object.
+
+        Returns:
+            The total number of tool calls in the history, or 0 if not applicable.
+        """
+        messages = getattr(request, "messages", [])
+        idx = 0
+        for msg in messages:
+            if msg.role == "assistant":
+                tool_calls = getattr(msg, "tool_calls", None)
+                idx += len(list(tool_calls)) if tool_calls is not None else 0  # noqa
+        return idx
+
     def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
 
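`_get_history_tool_calls_cnt` exists so that Kimi-K2 ids keep increasing across turns: the count of tool calls already present in the assistant history becomes the base offset for new indices. The same arithmetic on plain dicts (the real method walks pydantic message objects, but the counting is identical):

```python
messages = [
    {"role": "user", "content": "Weather in Paris and Tokyo?"},
    {"role": "assistant", "tool_calls": [
        {"id": "functions.get_weather:0"},
        {"id": "functions.get_weather:1"},
    ]},
    {"role": "tool", "content": "..."},
    {"role": "assistant", "content": "Paris is sunny, Tokyo is rainy."},
]

idx = 0
for msg in messages:
    if msg["role"] == "assistant":
        idx += len(msg.get("tool_calls") or [])

print(idx)  # 2 -> the next kimi_k2 tool call id ends in :2
```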
@@ -911,11 +1048,11 @@ class OpenAIServingChat(OpenAIServingBase):
         """
         if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
             # For Qwen3 models, `enable_thinking` is supported.
-            if self.tokenizer_manager.server_args.reasoning_parser in ["qwen3", "glm45"]:
-                return request.chat_template_kwargs.get("enable_thinking")
+            if self.reasoning_parser in ["qwen3", "glm45"]:
+                return request.chat_template_kwargs.get("enable_thinking", False)
             # For DeepSeek-V3.1 models, `thinking` is supported.
-            elif self.tokenizer_manager.server_args.reasoning_parser in ["deepseek-v3"]:
-                return request.chat_template_kwargs.get("thinking")
+            elif self.reasoning_parser in ["deepseek-v3"]:
+                return request.chat_template_kwargs.get("thinking", False)
             else:
                 return False
         return False
@@ -931,13 +1068,25 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         """Process tool calls in streaming response"""
         if index not in parser_dict:
-            parser_dict[index] = FunctionCallParser(
-                tools=request.tools,
-                tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
-            )
+            # Use JSON detector directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                parser_dict[index] = JsonArrayParser()
+            else:
+                parser_dict[index] = FunctionCallParser(
+                    tools=request.tools,
+                    tool_call_parser=self.tool_call_parser,
+                )
+
         parser = parser_dict[index]
 
-        normal_text, calls = parser.parse_stream_chunk(delta)
+        # Handle both FunctionCallParser and JsonArrayParser
+        if isinstance(parser, JsonArrayParser):
+            result = parser.parse_streaming_increment(delta, request.tools)
+            normal_text, calls = result.normal_text, result.calls
+        else:
+            normal_text, calls = parser.parse_stream_chunk(delta)
 
         # Yield normal text
         if normal_text:
@@ -955,6 +1104,7 @@ class OpenAIServingChat(OpenAIServingBase):
             yield f"data: {chunk.model_dump_json()}\n\n"
 
         # Yield tool calls
+        history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
         for call_item in calls:
             # Mark that this choice has tool calls
             has_tool_calls[index] = True
@@ -962,11 +1112,9 @@ class OpenAIServingChat(OpenAIServingBase):
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
-                # For Kimi-K2, align tool_call_id with the model format
-                if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
-                    tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
-                else:
-                    tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+                tool_call_id = self._process_tool_call_id(
+                    call_item, history_tool_calls_cnt
+                )
                 function_name = call_item.name
             else:
                 # Subsequent chunks: null ID and name for argument deltas
@@ -997,7 +1145,7 @@ class OpenAIServingChat(OpenAIServingBase):
 
     def _check_for_unstreamed_tool_args(
         self,
-        parser: FunctionCallParser,
+        parser: Union[FunctionCallParser, JsonArrayParser],
         content: Dict[str, Any],
         request: ChatCompletionRequest,
         index: int,
@@ -1007,30 +1155,31 @@ class OpenAIServingChat(OpenAIServingBase):
         when generation finishes. This ensures tool calls are properly completed
         even if the model generates the final arguments in the last chunk.
         """
-        # Only check if we have tool calls and the parser has tracked data
+        # Get the detector - either from FunctionCallParser or directly if json detector
+        detector = parser.detector if hasattr(parser, "detector") else parser
+
+        # Only check if we have tool calls and the detector has tracked data
         if (
-            not hasattr(parser.detector, "prev_tool_call_arr")
-            or not parser.detector.prev_tool_call_arr
+            not hasattr(detector, "prev_tool_call_arr")
+            or not detector.prev_tool_call_arr
         ):
             return None
 
         if (
-            not hasattr(parser.detector, "streamed_args_for_tool")
-            or not parser.detector.streamed_args_for_tool
+            not hasattr(detector, "streamed_args_for_tool")
+            or not detector.streamed_args_for_tool
         ):
             return None
 
         # Get the last tool call that was being processed
-        tool_index = len(parser.detector.prev_tool_call_arr) - 1
-        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+        tool_index = len(detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
             return None
 
         # Get expected vs actual arguments
-        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
-            "arguments", {}
-        )
+        expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
         expected_call = json.dumps(expected_args, ensure_ascii=False)
-        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+        actual_call = detector.streamed_args_for_tool[tool_index]
 
         # Check if there are remaining arguments to send
         remaining_call = (
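`_check_for_unstreamed_tool_args` now duck-types its parser argument: a `FunctionCallParser` exposes its tracking state through `.detector`, while a `JsonArrayParser` carries `prev_tool_call_arr` and `streamed_args_for_tool` itself. A toy illustration of that one-line dispatch (stub classes, not the real sglang types):

```python
class StubDetector:
    prev_tool_call_arr = [{"name": "get_weather", "arguments": {"city": "Paris"}}]
    streamed_args_for_tool = ['{"city": "Pa']  # argument text streamed so far

class StubFunctionCallParser:
    detector = StubDetector()  # wraps a detector

for parser in (StubFunctionCallParser(), StubDetector()):
    # same line the diff adds:
    detector = parser.detector if hasattr(parser, "detector") else parser
    print(type(parser).__name__, "->", detector.streamed_args_for_tool)
```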
--- a/sglang/srt/entrypoints/openai/serving_completions.py
+++ b/sglang/srt/entrypoints/openai/serving_completions.py
@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -20,13 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.code_completion_parser import (
     generate_completion_prompt_from_request,
 )
 from sglang.utils import convert_json_schema_to_str
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 
@@ -55,6 +59,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, CompletionRequest]:
         """Convert OpenAI completion request to internal format"""
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -85,6 +90,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": prompt}
 
+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -99,6 +107,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )
 
         return adapted_request, request
--- a/sglang/srt/entrypoints/openai/serving_embedding.py
+++ b/sglang/srt/entrypoints/openai/serving_embedding.py
@@ -1,4 +1,6 @@
-from typing import Any, Dict, List, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse
@@ -13,10 +15,12 @@ from sglang.srt.entrypoints.openai.protocol import (
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.managers.io_struct import EmbeddingReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_embedding_convs
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 
 class OpenAIServingEmbedding(OpenAIServingBase):
     """Handler for v1/embeddings requests"""
@@ -70,6 +74,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: EmbeddingRequest,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
         """Convert OpenAI embedding request to internal format"""
         prompt = request.input
@@ -120,6 +125,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
         adapted_request = EmbeddingReqInput(
             **prompt_kwargs,
             rid=request.rid,
+            priority=request.priority,
         )
 
         return adapted_request, request
--- a/sglang/srt/entrypoints/openai/serving_rerank.py
+++ b/sglang/srt/entrypoints/openai/serving_rerank.py
@@ -45,7 +45,9 @@ class OpenAIServingRerank(OpenAIServingBase):
         return None
 
     def _convert_to_internal_request(
-        self, request: V1RerankReqInput
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
         """Convert OpenAI rerank request to internal embedding format"""
         # Create pairs of [query, document] for each document
--- a/sglang/srt/entrypoints/openai/serving_responses.py
+++ b/sglang/srt/entrypoints/openai/serving_responses.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Adapted from vLLM's OpenAIServingResponses
 """Handler for /v1/responses requests"""
+from __future__ import annotations
 
 import asyncio
 import copy
@@ -9,7 +10,7 @@ import logging
 import time
 from contextlib import AsyncExitStack
 from http import HTTPStatus
-from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union
 
 import jinja2
 import openai.types.responses as openai_responses_types
@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
 from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import random_uuid
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 
@@ -120,6 +123,39 @@ class OpenAIServingResponses(OpenAIServingChat):
 
         self.background_tasks: dict[str, asyncio.Task] = {}
 
+    # error helpers dedicated for v1/responses
+    def create_error_response(
+        self,
+        message: str,
+        err_type: str = "invalid_request_error",
+        status_code: int = 400,
+        param: Optional[str] = None,
+    ) -> ORJSONResponse:
+        nested_error = {
+            "message": message,
+            "type": err_type,
+            "param": param,
+            "code": status_code,
+        }
+        return ORJSONResponse(content={"error": nested_error}, status_code=status_code)
+
+    def create_streaming_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+    ) -> str:
+        return json.dumps(
+            {
+                "error": {
+                    "message": message,
+                    "type": err_type,
+                    "param": None,
+                    "code": status_code,
+                }
+            }
+        )
+
     def _request_id_prefix(self) -> str:
         return "resp_"
 
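Unlike the flat error body of `OpenAIServingBase`, these /v1/responses helpers nest the fields under an `"error"` key, matching the OpenAI Responses error shape. A plain-dict sketch of the payload (the real method returns it wrapped in an `ORJSONResponse`):

```python
import json

def nested_error(message, err_type="invalid_request_error", status_code=400, param=None):
    # Same nesting the new create_error_response builds
    return {
        "error": {
            "message": message,
            "type": err_type,
            "param": param,
            "code": status_code,
        }
    }

print(json.dumps(nested_error("model not found", param="model"), indent=2))
```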
@@ -242,6 +278,7 @@ class OpenAIServingResponses(OpenAIServingChat):
             sampling_params=sampling_params,
             stream=request.stream,
             rid=request.request_id,
+            extra_key=self._compute_extra_key(request),
             background=request.background,
         )
 
@@ -830,6 +867,13 @@ class OpenAIServingResponses(OpenAIServingChat):
 
         async for ctx in result_generator:
 
+            # Only process context objects that implement the `is_expecting_start()` method,
+            # which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
+            # Contexts without this method are skipped, as they do not represent a new turn
+            # or are not compatible with per-turn handling in the /v1/responses endpoint.
+            if not hasattr(ctx, "is_expecting_start"):
+                continue
+
             if ctx.is_expecting_start():
                 current_output_index += 1
                 sent_output_item_added = False
@@ -1247,6 +1291,7 @@ class OpenAIServingResponses(OpenAIServingChat):
             sampling_params=sampling_params,
             stream=adapted_request.stream,
             rid=request_id,
+            extra_key=adapted_request.extra_key,
             return_logprob=adapted_request.return_logprob,
             logprob_start_len=adapted_request.logprob_start_len,
             top_logprobs_num=adapted_request.top_logprobs_num,
--- a/sglang/srt/entrypoints/openai/serving_score.py
+++ b/sglang/srt/entrypoints/openai/serving_score.py
@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ScoringRequest,
+        raw_request: Request = None,
     ) -> tuple[ScoringRequest, ScoringRequest]:
         """Convert OpenAI scoring request to internal format"""
         # For scoring, we pass the request directly as the tokenizer_manager