sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
```diff
--- a/sglang/srt/entrypoints/http_server.py
+++ b/sglang/srt/entrypoints/http_server.py
@@ -27,9 +27,9 @@ import tempfile
 import threading
 import time
 from http import HTTPStatus
-from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
 
-import
+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
 
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -47,11 +47,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -74,9 +70,11 @@ from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
     ConfigureLoggingReq,
+    DestroyWeightsUpdateGroupReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
     InitWeightsUpdateGroupReqInput,
     LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
@@ -84,6 +82,7 @@ from sglang.srt.managers.io_struct import (
     ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SendWeightsToRemoteInstanceReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
@@ -95,9 +94,10 @@ from sglang.srt.managers.io_struct import (
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.multi_tokenizer_mixin import (
-
-
+    MultiTokenizerRouter,
+    TokenizerWorker,
     get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
     read_from_shared_memory,
     write_data_for_multi_tokenizer,
 )
@@ -127,7 +127,7 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 # Store global states
 @dataclasses.dataclass
 class _GlobalState:
-    tokenizer_manager: TokenizerManager
+    tokenizer_manager: Union[TokenizerManager, MultiTokenizerRouter, TokenizerWorker]
     template_manager: TemplateManager
     scheduler_info: Dict
 
@@ -140,21 +140,6 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state
 
 
-# Function to set up all middlewares for multi-tokenizer compatibility
-def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
-    """Setup all middlewares for both single and multi-process modes"""
-    worker_pid = os.getpid()
-
-    if api_key:
-        add_api_key_middleware(app, api_key)
-        logger.info(f"Worker {worker_pid} added API key middleware")
-
-    if enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-
-
 async def init_multi_tokenizer() -> ServerArgs:
     """Read args information from shm and init tokenizer manager for current process"""
     pid = os.getpid()
@@ -162,18 +147,22 @@ async def init_multi_tokenizer() -> ServerArgs:
     logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
 
     # Read configuration from shared memory
-
-
-
-
-
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
 
     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
     )
 
     # Launch multi-tokenizer manager process
-    tokenizer_manager =
+    tokenizer_manager = TokenizerWorker(server_args, port_args)
     template_manager = TemplateManager()
     template_manager.initialize_templates(
         tokenizer_manager=tokenizer_manager,
@@ -192,18 +181,29 @@ async def init_multi_tokenizer() -> ServerArgs:
             scheduler_info=scheduler_info,
         )
     )
+
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
+            trace_set_thread_info(thread_label)
+
     return server_args
 
 
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-
-    if server_args is None:
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
         # Initialize multi-tokenizer support for worker processes
-        fast_api_app.server_args = await init_multi_tokenizer()
-
-
-        )
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+            logger.info(f"Worker {worker_pid} added prometheus middleware")
     fast_api_app.warmup_thread = threading.Thread(
         target=_wait_and_warmup,
         args=(
@@ -299,7 +299,23 @@ app.add_middleware(
 
 @app.exception_handler(HTTPException)
 async def validation_exception_handler(request: Request, exc: HTTPException):
-    """Enrich HTTP exception with status code and other details
+    """Enrich HTTP exception with status code and other details.
+
+    For /v1/responses, emit OpenAI-style nested error envelope:
+    {"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
+    """
+    # adjust fmt for responses api
+    if request.url.path.startswith("/v1/responses"):
+        nested_error = {
+            "message": exc.detail,
+            "type": HTTPStatus(exc.status_code).phrase,
+            "param": None,
+            "code": exc.status_code,
+        }
+        return ORJSONResponse(
+            content={"error": nested_error}, status_code=exc.status_code
+        )
+
     error = ErrorResponse(
         object="error",
         message=exc.detail,
@@ -312,7 +328,10 @@ async def validation_exception_handler(request: Request, exc: HTTPException):
 # Custom exception handlers to change validation error status codes
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
-    """Override FastAPI's default 422 validation error with 400
+    """Override FastAPI's default 422 validation error with 400.
+
+    For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
+    """
     exc_str = str(exc)
     errors_str = str(exc.errors())
 
@@ -321,6 +340,16 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
     else:
         message = exc_str
 
+    if request.url.path.startswith("/v1/responses"):
+        # adapt specially, for v1/responses API only (notice the error key is different)
+        nested_error = {
+            "message": message,
+            "type": HTTPStatus.BAD_REQUEST.phrase,
+            "param": None,
+            "code": HTTPStatus.BAD_REQUEST.value,
+        }
+        return ORJSONResponse(status_code=400, content={"error": nested_error})
+
     err = ErrorResponse(
         message=message,
         type=HTTPStatus.BAD_REQUEST.phrase,
```
```diff
@@ -681,6 +710,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     )
 
 
+@app.post("/init_weights_send_group_for_remote_instance")
+async def init_weights_send_group_for_remote_instance(
+    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/send_weights_to_remote_instance")
+async def send_weights_to_remote_instance(
+    obj: SendWeightsToRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.send_weights_to_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.post("/init_weights_update_group")
 async def init_weights_update_group(
     obj: InitWeightsUpdateGroupReqInput, request: Request
@@ -696,6 +757,20 @@ async def init_weights_update_group(
         return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
 
 
+@app.post("/destroy_weights_update_group")
+async def destroy_weights_update_group(
+    obj: DestroyWeightsUpdateGroupReqInput, request: Request
+):
+    """Destroy the parameter update group."""
+    success, message = (
+        await _global_state.tokenizer_manager.destroy_weights_update_group(obj, request)
+    )
+    content = {"success": success, "message": message}
+    return ORJSONResponse(
+        content, status_code=200 if success else HTTPStatus.BAD_REQUEST
+    )
+
+
 @app.post("/update_weights_from_tensor")
 async def update_weights_from_tensor(
     obj: UpdateWeightsFromTensorReqInput, request: Request
```
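All three new endpoints reuse the `{"success": ..., "message": ...}` convention of the existing weight-update routes. A rough usage sketch; the request payloads here are placeholders, since the real schemas are the `*ReqInput` classes in `sglang/srt/managers/io_struct.py`:

```python
# Hedged sketch of driving the new weight-sync endpoints with `requests`.
import requests

BASE = "http://localhost:30000"  # assumed server address


def post(path: str, payload: dict) -> dict:
    """POST and surface the {"success", "message"} convention used above."""
    body = requests.post(f"{BASE}{path}", json=payload).json()
    if not body["success"]:
        raise RuntimeError(f"{path} failed: {body['message']}")
    return body


# Payload contents are placeholders; consult the *ReqInput dataclasses in
# sglang/srt/managers/io_struct.py for the real fields.
post("/init_weights_send_group_for_remote_instance", {})
post("/send_weights_to_remote_instance", {})
post("/destroy_weights_update_group", {})
```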
```diff
@@ -1168,7 +1243,6 @@ def launch_server(
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
     if server_args.tokenizer_worker_num > 1:
-        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
         port_args = PortArgs.init_new(server_args)
         port_args.tokenizer_worker_ipc_name = (
             f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -1177,11 +1251,16 @@ def launch_server(
             server_args=server_args, port_args=port_args
         )
     else:
-        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
         tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args,
         )
 
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = "Tokenizer"
+            trace_set_thread_info(thread_label)
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1191,12 +1270,10 @@ def launch_server(
     )
 
     if server_args.tokenizer_worker_num > 1:
-
-
-
-
-            scheduler_info,
-        )
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
+            server_args,
+            scheduler_info,
         )
     else:
         # Add api key authorization
@@ -1233,6 +1310,9 @@ def launch_server(
                 "level": "INFO",
                 "propagate": False,
             }
+
+        monkey_patch_uvicorn_multiprocessing()
+
         uvicorn.run(
             "sglang.srt.entrypoints.http_server:app",
             host=server_args.host,
@@ -1243,6 +1323,7 @@ def launch_server(
             workers=server_args.tokenizer_worker_num,
         )
     else:
+        app.is_single_tokenizer_mode = True
         uvicorn.run(
             app,
             host=server_args.host,
@@ -1253,10 +1334,8 @@ def launch_server(
         )
     finally:
         if server_args.tokenizer_worker_num > 1:
-
-
-            scheduler_info_shm.unlink()
-            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
         else:
             warmup_thread.join()
 
@@ -1405,13 +1484,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())
 
-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()
```
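Taken together, the `launch_server` changes move multi-tokenizer startup onto a single shared-memory blob (`write_data_for_multi_tokenizer` / `read_from_shared_memory`) and patch uvicorn's multiprocessing before spawning workers. A hedged launch sketch; the `--tokenizer-worker-num` flag spelling is an assumption derived from the `tokenizer_worker_num` server arg (check `sglang/srt/server_args.py`), and the model is illustrative:

```python
# Hedged sketch: launching the HTTP server with multiple tokenizer workers so
# the multi-tokenizer path above is exercised.
import subprocess

subprocess.run(
    [
        "python", "-m", "sglang.launch_server",
        "--model-path", "Qwen/Qwen2.5-7B-Instruct",  # illustrative model
        "--tokenizer-worker-num", "2",  # assumed flag spelling
    ],
    check=True,
)
```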
```diff
--- a/sglang/srt/entrypoints/openai/protocol.py
+++ b/sglang/srt/entrypoints/openai/protocol.py
@@ -16,12 +16,14 @@
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, TypeAlias, Union
+from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union
 
 from openai.types.responses import (
     ResponseFunctionToolCall,
     ResponseInputItemParam,
     ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
     ResponseReasoningItem,
 )
 from openai.types.responses.response import ToolChoice
@@ -228,6 +230,15 @@ class CompletionRequest(BaseModel):
 
     # For request id
     rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # For custom metric labels
+    custom_labels: Optional[Dict[str, str]] = None
 
     @field_validator("max_tokens")
     @classmethod
```
```diff
@@ -334,7 +345,7 @@ class FunctionResponse(BaseModel):
     """Function response."""
 
     name: Optional[str] = None
-    arguments: Optional[str] = None
+    arguments: Optional[str | Dict[str, Any]] = None
 
 
 class ToolCall(BaseModel):
@@ -383,7 +394,7 @@ class Function(BaseModel):
     """Function descriptions."""
 
     description: Optional[str] = Field(default=None, examples=[None])
-    name:
+    name: str
     parameters: Optional[object] = None
     strict: bool = False
 
@@ -447,7 +458,7 @@ class ChatCompletionRequest(BaseModel):
         description="Constrains effort on reasoning for reasoning models. "
         "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
         "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models.",
+        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
     )
 
     @model_validator(mode="before")
@@ -540,6 +551,12 @@ class ChatCompletionRequest(BaseModel):
 
     # For request id
     rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
 
     # For PD disaggregation
     bootstrap_host: Optional[Union[List[str], str]] = None
@@ -641,6 +658,8 @@ class EmbeddingRequest(BaseModel):
 
     # The request id.
     rid: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
 
 
 class EmbeddingObject(BaseModel):
@@ -769,6 +788,13 @@ class ResponsesRequest(BaseModel):
         description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
     )
     priority: int = Field(default=0, description="Request priority")
+    extra_key: Optional[str] = Field(
+        default=None,
+        description="Extra key for classifying the request (e.g. cache_salt)",
+    )
+    cache_salt: Optional[str] = Field(
+        default=None, description="Cache salt for request caching"
+    )
 
     # SGLang-specific sampling parameters
     frequency_penalty: float = 0.0
```
```diff
@@ -857,6 +883,26 @@ class ResponsesResponse(BaseModel):
     tool_choice: str = "auto"
     tools: List[ResponseTool] = Field(default_factory=list)
 
+    # OpenAI compatibility fields. not all are used at the moment.
+    # Recommend checking https://platform.openai.com/docs/api-reference/responses
+    error: Optional[dict] = None
+    incomplete_details: Optional[dict] = None  # TODO(v) support this input
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[dict] = (
+        # Unused. No model supports this. For GPT-oss, system prompt sets
+        # the field, not server args.
+        None  # {"effort": Optional[str], "summary": Optional[str]}
+    )
+    store: Optional[bool] = None
+    temperature: Optional[float] = None
+    text: Optional[dict] = None  # e.g. {"format": {"type": "text"}}
+    top_p: Optional[float] = None
+    truncation: Optional[str] = None
+    user: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+
     @classmethod
     def from_request(
         cls,
@@ -871,6 +917,41 @@ class ResponsesResponse(BaseModel):
         usage: Optional[UsageInfo],
     ) -> "ResponsesResponse":
         """Create a response from a request."""
+
+        # Determine if the output is plain text only to set text.format
+        def _is_text_only(
+            items: List[
+                Union[
+                    ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
+                ]
+            ]
+        ) -> bool:
+            if not items:
+                return False
+            for it in items:
+                # tool call -> not pure text.
+                if isinstance(it, ResponseReasoningItem) or isinstance(
+                    it, ResponseFunctionToolCall
+                ):
+                    return False
+                try:
+                    if isinstance(it, ResponseOutputText):
+                        continue
+                    elif isinstance(it, ResponseOutputMessage):
+                        if not it.content:
+                            continue
+                        for c in it.content:
+                            if not isinstance(c, ResponseOutputText):
+                                return False
+                    else:
+                        # Unknown type, not considered text-only
+                        return False
+                except AttributeError:
+                    return False
+            return True
+
+        text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
+
         return cls(
             id=request.request_id,
             created_at=created_time,
```
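A stand-in sketch of the `_is_text_only` rule above, with toy classes in place of the openai response types: reasoning items and tool calls disqualify the output, and a message qualifies only if every content part is output text. Only in the text-only case does `text` become `{"format": {"type": "text"}}`:

```python
# Simplified stand-in sketch; the toy classes are assumptions, not the real
# openai.types.responses models.
from dataclasses import dataclass


@dataclass
class ToyText:       # stands in for ResponseOutputText
    text: str


@dataclass
class ToyMessage:    # stands in for ResponseOutputMessage
    content: list


@dataclass
class ToyToolCall:   # stands in for ResponseFunctionToolCall
    name: str


def is_text_only(items: list) -> bool:
    if not items:
        return False
    for it in items:
        if isinstance(it, ToyToolCall):       # tool call -> not pure text
            return False
        if isinstance(it, ToyMessage):
            if any(not isinstance(c, ToyText) for c in it.content):
                return False
        elif not isinstance(it, ToyText):     # unknown type -> not text-only
            return False
    return True


assert is_text_only([ToyMessage(content=[ToyText("hi")])])
assert not is_text_only([ToyMessage(content=[ToyText("hi")]), ToyToolCall("f")])
```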
```diff
@@ -881,6 +962,23 @@ class ResponsesResponse(BaseModel):
             parallel_tool_calls=request.parallel_tool_calls or True,
             tool_choice=request.tool_choice,
             tools=request.tools,
+            # fields for parity with v1/responses
+            error=None,
+            incomplete_details=None,
+            instructions=request.instructions,
+            max_output_tokens=request.max_output_tokens,
+            previous_response_id=request.previous_response_id,  # TODO(v): ensure this is propagated if retrieved from store
+            reasoning={
+                "effort": request.reasoning.effort if request.reasoning else None,
+                "summary": None,  # unused
+            },
+            store=request.store,
+            temperature=request.temperature,
+            text=text_format,  # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
+            top_p=request.top_p,
+            truncation=request.truncation,
+            user=request.user,
+            metadata=request.metadata or {},
         )
 
 
@@ -919,6 +1017,16 @@ class MessageProcessingResult:
     tool_call_constraint: Optional[Any] = None
 
 
+class ToolCallProcessingResult(NamedTuple):
+    """Result of processing tool calls in a response."""
+
+    tool_calls: Optional[
+        List[Any]
+    ]  # List of ToolCall objects or None if parsing failed
+    remaining_text: str  # Text remaining after parsing tool calls
+    finish_reason: Dict[str, Any]  # Updated finish reason dictionary
+
+
 class ResponseReasoningTextContent(BaseModel):
     text: str
     type: Literal["reasoning_text"] = "reasoning_text"
```
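Because `ToolCallProcessingResult` is a `NamedTuple`, call sites can unpack it positionally or read it by attribute; a small illustrative sketch:

```python
# Illustrative: NamedTuple results unpack positionally at call sites while
# still offering attribute access.
from typing import Any, Dict, List, NamedTuple, Optional


class ToolCallProcessingResult(NamedTuple):
    tool_calls: Optional[List[Any]]
    remaining_text: str
    finish_reason: Dict[str, Any]


result = ToolCallProcessingResult(None, "plain text", {"type": "stop"})
tool_calls, remaining_text, finish_reason = result  # tuple-style unpacking
assert result.remaining_text == "plain text"        # named access also works
```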
```diff
--- a/sglang/srt/entrypoints/openai/serving_base.py
+++ b/sglang/srt/entrypoints/openai/serving_base.py
@@ -1,15 +1,20 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
 logger = logging.getLogger(__name__)
 
@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):
 
     def __init__(self, tokenizer_manager: TokenizerManager):
        self.tokenizer_manager = tokenizer_manager
+        self.allowed_custom_labels = (
+            set(
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
+            )
+            if isinstance(self.tokenizer_manager.server_args, ServerArgs)
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
+            else None
+        )
 
     async def handle_request(
         self, request: OpenAIServingRequest, raw_request: Request
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):
 
         # Convert to internal format
         adapted_request, processed_request = self._convert_to_internal_request(
-            request
+            request, raw_request
        )
 
         # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
@@ -49,6 +62,12 @@ class OpenAIServingBase(ABC):
             return self.create_error_response(
                 message=e.detail, err_type=str(e.status_code), status_code=e.status_code
             )
+        except ValueError as e:
+            return self.create_error_response(
+                message=str(e),
+                err_type="BadRequest",
+                status_code=400,
+            )
         except Exception as e:
             logger.exception(f"Error in request: {e}")
             return self.create_error_response(
```
```diff
@@ -73,10 +92,24 @@ class OpenAIServingBase(ABC):
 
         return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
 
+    def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Compute the final extra_key by concatenating cache_salt and extra_key if both are provided."""
+        parts = []
+        for key in ["cache_salt", "extra_key"]:
+            value = getattr(request, key, None)
+            if value:
+                if not isinstance(value, str):
+                    raise TypeError(
+                        f"Value of {key} must be a string, but got {type(value).__name__}"
+                    )
+                parts.append(value)
+        return "".join(parts) if parts else None
+
     @abstractmethod
     def _convert_to_internal_request(
         self,
         request: OpenAIServingRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
         """Convert OpenAI request to internal format"""
         pass
```
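Note the fixed order in `_compute_extra_key`: `cache_salt` always precedes `extra_key` in the concatenated result. A behavior sketch, pulled out of the class for clarity:

```python
# Behavior sketch of _compute_extra_key (simplified, free function).
from typing import Optional


def compute_extra_key(cache_salt: Optional[str], extra_key: Optional[str]) -> Optional[str]:
    parts = []
    for name, value in (("cache_salt", cache_salt), ("extra_key", extra_key)):
        if value:
            if not isinstance(value, str):
                raise TypeError(f"Value of {name} must be a string, but got {type(value).__name__}")
            parts.append(value)
    return "".join(parts) if parts else None


assert compute_extra_key("salt-", "user-1") == "salt-user-1"  # salt comes first
assert compute_extra_key(None, "user-1") == "user-1"
assert compute_extra_key(None, None) is None
```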
```diff
@@ -150,3 +183,32 @@ class OpenAIServingBase(ABC):
             code=status_code,
         )
         return json.dumps({"error": error.model_dump()})
+
+    def extract_custom_labels(self, raw_request):
+        if (
+            not self.allowed_custom_labels
+            or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        ):
+            return None
+
+        custom_labels = None
+        header = (
+            self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        )
+        try:
+            raw_labels = (
+                json.loads(raw_request.headers.get(header))
+                if raw_request and raw_request.headers.get(header)
+                else None
+            )
+        except json.JSONDecodeError as e:
+            logger.exception(f"Error in request: {e}")
+            raw_labels = None
+
+        if isinstance(raw_labels, dict):
+            custom_labels = {
+                label: value
+                for label, value in raw_labels.items()
+                if label in self.allowed_custom_labels
+            }
+        return custom_labels
```
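Clients attach custom metric labels as a JSON object in a configurable request header, and `extract_custom_labels` keeps only the labels in the server-side allowlist. A sketch; the header name `x-custom-labels` is an assumption, since the real name comes from the `tokenizer_metrics_custom_labels_header` server arg:

```python
# Sketch: attaching custom metric labels via a request header. Labels not in
# tokenizer_metrics_allowed_custom_labels are silently dropped server-side.
import json
import requests

headers = {"x-custom-labels": json.dumps({"team": "search", "env": "staging"})}
requests.post(
    "http://localhost:30000/v1/completions",  # assumed server address
    json={"model": "default", "prompt": "hi"},
    headers=headers,
)
```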