sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +192 -113
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +132 -57
- sglang/srt/entrypoints/openai/protocol.py +115 -7
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +207 -58
- sglang/srt/entrypoints/openai/serving_completions.py +17 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +49 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +106 -82
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +53 -7
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +225 -57
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +78 -49
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +215 -314
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +147 -19
- sglang/srt/managers/scheduler.py +501 -304
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +321 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +15 -21
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +58 -34
- sglang/srt/mem_cache/hiradix_cache.py +227 -80
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -223
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +519 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +55 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +98 -57
- sglang/srt/model_executor/model_runner.py +433 -158
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +833 -152
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +14 -5
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +124 -14
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +26 -5
- sglang/srt/models/qwen3_moe.py +71 -12
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +10 -3
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1030 -254
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +253 -136
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +445 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +22 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -27,7 +27,9 @@ import tempfile
|
|
27
27
|
import threading
|
28
28
|
import time
|
29
29
|
from http import HTTPStatus
|
30
|
-
from typing import Any, AsyncIterator, Callable, Dict, List, Optional
|
30
|
+
from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
|
31
|
+
|
32
|
+
from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
|
31
33
|
|
32
34
|
# Fix a bug of Python threading
|
33
35
|
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
|
@@ -45,11 +47,7 @@ from fastapi.exceptions import RequestValidationError
|
|
45
47
|
from fastapi.middleware.cors import CORSMiddleware
|
46
48
|
from fastapi.responses import ORJSONResponse, Response, StreamingResponse
|
47
49
|
|
48
|
-
from sglang.srt.disaggregation.utils import (
|
49
|
-
FAKE_BOOTSTRAP_HOST,
|
50
|
-
DisaggregationMode,
|
51
|
-
register_disaggregation_server,
|
52
|
-
)
|
50
|
+
from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
|
53
51
|
from sglang.srt.entrypoints.engine import _launch_subprocesses
|
54
52
|
from sglang.srt.entrypoints.openai.protocol import (
|
55
53
|
ChatCompletionRequest,
|
@@ -72,9 +70,11 @@ from sglang.srt.managers.io_struct import (
|
|
72
70
|
AbortReq,
|
73
71
|
CloseSessionReqInput,
|
74
72
|
ConfigureLoggingReq,
|
73
|
+
DestroyWeightsUpdateGroupReqInput,
|
75
74
|
EmbeddingReqInput,
|
76
75
|
GenerateReqInput,
|
77
76
|
GetWeightsByNameReqInput,
|
77
|
+
InitWeightsSendGroupForRemoteInstanceReqInput,
|
78
78
|
InitWeightsUpdateGroupReqInput,
|
79
79
|
LoadLoRAAdapterReqInput,
|
80
80
|
OpenSessionReqInput,
|
@@ -82,6 +82,7 @@ from sglang.srt.managers.io_struct import (
|
|
82
82
|
ProfileReqInput,
|
83
83
|
ReleaseMemoryOccupationReqInput,
|
84
84
|
ResumeMemoryOccupationReqInput,
|
85
|
+
SendWeightsToRemoteInstanceReqInput,
|
85
86
|
SeparateReasoningReqInput,
|
86
87
|
SetInternalStateReq,
|
87
88
|
SlowDownReqInput,
|
@@ -93,16 +94,17 @@ from sglang.srt.managers.io_struct import (
|
|
93
94
|
VertexGenerateReqInput,
|
94
95
|
)
|
95
96
|
from sglang.srt.managers.multi_tokenizer_mixin import (
|
96
|
-
|
97
|
-
|
97
|
+
MultiTokenizerRouter,
|
98
|
+
TokenizerWorker,
|
98
99
|
get_main_process_id,
|
100
|
+
monkey_patch_uvicorn_multiprocessing,
|
99
101
|
read_from_shared_memory,
|
100
102
|
write_data_for_multi_tokenizer,
|
101
103
|
)
|
102
104
|
from sglang.srt.managers.template_manager import TemplateManager
|
103
105
|
from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
|
104
106
|
from sglang.srt.metrics.func_timer import enable_func_timer
|
105
|
-
from sglang.srt.reasoning_parser import ReasoningParser
|
107
|
+
from sglang.srt.parser.reasoning_parser import ReasoningParser
|
106
108
|
from sglang.srt.server_args import PortArgs, ServerArgs
|
107
109
|
from sglang.srt.utils import (
|
108
110
|
add_api_key_middleware,
|
@@ -125,7 +127,7 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
|
|
125
127
|
# Store global states
|
126
128
|
@dataclasses.dataclass
|
127
129
|
class _GlobalState:
|
128
|
-
tokenizer_manager: TokenizerManager
|
130
|
+
tokenizer_manager: Union[TokenizerManager, MultiTokenizerRouter, TokenizerWorker]
|
129
131
|
template_manager: TemplateManager
|
130
132
|
scheduler_info: Dict
|
131
133
|
|
@@ -138,21 +140,6 @@ def set_global_state(global_state: _GlobalState):
|
|
138
140
|
_global_state = global_state
|
139
141
|
|
140
142
|
|
141
|
-
# Function to set up all middlewares for multi-tokenizer compatibility
|
142
|
-
def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
|
143
|
-
"""Setup all middlewares for both single and multi-process modes"""
|
144
|
-
worker_pid = os.getpid()
|
145
|
-
|
146
|
-
if api_key:
|
147
|
-
add_api_key_middleware(app, api_key)
|
148
|
-
logger.info(f"Worker {worker_pid} added API key middleware")
|
149
|
-
|
150
|
-
if enable_metrics:
|
151
|
-
add_prometheus_middleware(app)
|
152
|
-
enable_func_timer()
|
153
|
-
logger.info(f"Worker {worker_pid} added prometheus middleware")
|
154
|
-
|
155
|
-
|
156
143
|
async def init_multi_tokenizer() -> ServerArgs:
|
157
144
|
"""Read args information from shm and init tokenizer manager for current process"""
|
158
145
|
pid = os.getpid()
|
@@ -160,18 +147,22 @@ async def init_multi_tokenizer() -> ServerArgs:
|
|
160
147
|
logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
|
161
148
|
|
162
149
|
# Read configuration from shared memory
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
150
|
+
port_args, server_args, scheduler_info = read_from_shared_memory(
|
151
|
+
f"multi_tokenizer_args_{main_pid}"
|
152
|
+
)
|
153
|
+
server_args: ServerArgs
|
154
|
+
|
155
|
+
# API key authentication is not supported in multi-tokenizer mode
|
156
|
+
assert (
|
157
|
+
server_args.api_key is None
|
158
|
+
), "API key is not supported in multi-tokenizer mode"
|
168
159
|
|
169
160
|
port_args.tokenizer_ipc_name = (
|
170
161
|
f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
|
171
162
|
)
|
172
163
|
|
173
164
|
# Launch multi-tokenizer manager process
|
174
|
-
tokenizer_manager =
|
165
|
+
tokenizer_manager = TokenizerWorker(server_args, port_args)
|
175
166
|
template_manager = TemplateManager()
|
176
167
|
template_manager.initialize_templates(
|
177
168
|
tokenizer_manager=tokenizer_manager,
|
@@ -190,18 +181,29 @@ async def init_multi_tokenizer() -> ServerArgs:
|
|
190
181
|
scheduler_info=scheduler_info,
|
191
182
|
)
|
192
183
|
)
|
184
|
+
|
185
|
+
if server_args.enable_trace:
|
186
|
+
process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
|
187
|
+
if server_args.disaggregation_mode == "null":
|
188
|
+
thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
|
189
|
+
trace_set_thread_info(thread_label)
|
190
|
+
|
193
191
|
return server_args
|
194
192
|
|
195
193
|
|
196
194
|
@asynccontextmanager
|
197
195
|
async def lifespan(fast_api_app: FastAPI):
|
198
|
-
|
199
|
-
if server_args is None:
|
196
|
+
if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
|
200
197
|
# Initialize multi-tokenizer support for worker processes
|
201
|
-
fast_api_app.server_args = await init_multi_tokenizer()
|
202
|
-
|
203
|
-
|
204
|
-
)
|
198
|
+
fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
|
199
|
+
|
200
|
+
# only metrics middleware is supported in multi-tokenizer mode
|
201
|
+
worker_pid = os.getpid()
|
202
|
+
if fast_api_app.server_args.enable_metrics:
|
203
|
+
add_prometheus_middleware(app)
|
204
|
+
enable_func_timer()
|
205
|
+
|
206
|
+
logger.info(f"Worker {worker_pid} added prometheus middleware")
|
205
207
|
fast_api_app.warmup_thread = threading.Thread(
|
206
208
|
target=_wait_and_warmup,
|
207
209
|
args=(
|
@@ -297,7 +299,23 @@ app.add_middleware(
|
|
297
299
|
|
298
300
|
@app.exception_handler(HTTPException)
|
299
301
|
async def validation_exception_handler(request: Request, exc: HTTPException):
|
300
|
-
"""Enrich HTTP exception with status code and other details"""
|
302
|
+
"""Enrich HTTP exception with status code and other details.
|
303
|
+
|
304
|
+
For /v1/responses, emit OpenAI-style nested error envelope:
|
305
|
+
{"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
|
306
|
+
"""
|
307
|
+
# adjust fmt for responses api
|
308
|
+
if request.url.path.startswith("/v1/responses"):
|
309
|
+
nested_error = {
|
310
|
+
"message": exc.detail,
|
311
|
+
"type": HTTPStatus(exc.status_code).phrase,
|
312
|
+
"param": None,
|
313
|
+
"code": exc.status_code,
|
314
|
+
}
|
315
|
+
return ORJSONResponse(
|
316
|
+
content={"error": nested_error}, status_code=exc.status_code
|
317
|
+
)
|
318
|
+
|
301
319
|
error = ErrorResponse(
|
302
320
|
object="error",
|
303
321
|
message=exc.detail,
|
@@ -310,7 +328,10 @@ async def validation_exception_handler(request: Request, exc: HTTPException):
|
|
310
328
|
# Custom exception handlers to change validation error status codes
|
311
329
|
@app.exception_handler(RequestValidationError)
|
312
330
|
async def validation_exception_handler(request: Request, exc: RequestValidationError):
|
313
|
-
"""Override FastAPI's default 422 validation error with 400"""
|
331
|
+
"""Override FastAPI's default 422 validation error with 400.
|
332
|
+
|
333
|
+
For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
|
334
|
+
"""
|
314
335
|
exc_str = str(exc)
|
315
336
|
errors_str = str(exc.errors())
|
316
337
|
|
@@ -319,6 +340,16 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
|
|
319
340
|
else:
|
320
341
|
message = exc_str
|
321
342
|
|
343
|
+
if request.url.path.startswith("/v1/responses"):
|
344
|
+
# adapt specially, for v1/responses API only (notice the error key is different)
|
345
|
+
nested_error = {
|
346
|
+
"message": message,
|
347
|
+
"type": HTTPStatus.BAD_REQUEST.phrase,
|
348
|
+
"param": None,
|
349
|
+
"code": HTTPStatus.BAD_REQUEST.value,
|
350
|
+
}
|
351
|
+
return ORJSONResponse(status_code=400, content={"error": nested_error})
|
352
|
+
|
322
353
|
err = ErrorResponse(
|
323
354
|
message=message,
|
324
355
|
type=HTTPStatus.BAD_REQUEST.phrase,
|
@@ -679,6 +710,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
|
|
679
710
|
)
|
680
711
|
|
681
712
|
|
713
|
+
@app.post("/init_weights_send_group_for_remote_instance")
|
714
|
+
async def init_weights_send_group_for_remote_instance(
|
715
|
+
obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
|
716
|
+
):
|
717
|
+
success, message = (
|
718
|
+
await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
|
719
|
+
obj, request
|
720
|
+
)
|
721
|
+
)
|
722
|
+
content = {"success": success, "message": message}
|
723
|
+
if success:
|
724
|
+
return ORJSONResponse(content, status_code=200)
|
725
|
+
else:
|
726
|
+
return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
|
727
|
+
|
728
|
+
|
729
|
+
@app.post("/send_weights_to_remote_instance")
|
730
|
+
async def send_weights_to_remote_instance(
|
731
|
+
obj: SendWeightsToRemoteInstanceReqInput, request: Request
|
732
|
+
):
|
733
|
+
success, message = (
|
734
|
+
await _global_state.tokenizer_manager.send_weights_to_remote_instance(
|
735
|
+
obj, request
|
736
|
+
)
|
737
|
+
)
|
738
|
+
content = {"success": success, "message": message}
|
739
|
+
if success:
|
740
|
+
return ORJSONResponse(content, status_code=200)
|
741
|
+
else:
|
742
|
+
return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
|
743
|
+
|
744
|
+
|
682
745
|
@app.post("/init_weights_update_group")
|
683
746
|
async def init_weights_update_group(
|
684
747
|
obj: InitWeightsUpdateGroupReqInput, request: Request
|
@@ -694,6 +757,20 @@ async def init_weights_update_group(
|
|
694
757
|
return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
|
695
758
|
|
696
759
|
|
760
|
+
@app.post("/destroy_weights_update_group")
|
761
|
+
async def destroy_weights_update_group(
|
762
|
+
obj: DestroyWeightsUpdateGroupReqInput, request: Request
|
763
|
+
):
|
764
|
+
"""Destroy the parameter update group."""
|
765
|
+
success, message = (
|
766
|
+
await _global_state.tokenizer_manager.destroy_weights_update_group(obj, request)
|
767
|
+
)
|
768
|
+
content = {"success": success, "message": message}
|
769
|
+
return ORJSONResponse(
|
770
|
+
content, status_code=200 if success else HTTPStatus.BAD_REQUEST
|
771
|
+
)
|
772
|
+
|
773
|
+
|
697
774
|
@app.post("/update_weights_from_tensor")
|
698
775
|
async def update_weights_from_tensor(
|
699
776
|
obj: UpdateWeightsFromTensorReqInput, request: Request
|
@@ -1178,6 +1255,12 @@ def launch_server(
|
|
1178
1255
|
server_args=server_args,
|
1179
1256
|
)
|
1180
1257
|
|
1258
|
+
if server_args.enable_trace:
|
1259
|
+
process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
|
1260
|
+
if server_args.disaggregation_mode == "null":
|
1261
|
+
thread_label = "Tokenizer"
|
1262
|
+
trace_set_thread_info(thread_label)
|
1263
|
+
|
1181
1264
|
set_global_state(
|
1182
1265
|
_GlobalState(
|
1183
1266
|
tokenizer_manager=tokenizer_manager,
|
@@ -1187,12 +1270,10 @@ def launch_server(
|
|
1187
1270
|
)
|
1188
1271
|
|
1189
1272
|
if server_args.tokenizer_worker_num > 1:
|
1190
|
-
|
1191
|
-
|
1192
|
-
|
1193
|
-
|
1194
|
-
scheduler_info,
|
1195
|
-
)
|
1273
|
+
multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
|
1274
|
+
port_args,
|
1275
|
+
server_args,
|
1276
|
+
scheduler_info,
|
1196
1277
|
)
|
1197
1278
|
else:
|
1198
1279
|
# Add api key authorization
|
@@ -1229,6 +1310,9 @@ def launch_server(
|
|
1229
1310
|
"level": "INFO",
|
1230
1311
|
"propagate": False,
|
1231
1312
|
}
|
1313
|
+
|
1314
|
+
monkey_patch_uvicorn_multiprocessing()
|
1315
|
+
|
1232
1316
|
uvicorn.run(
|
1233
1317
|
"sglang.srt.entrypoints.http_server:app",
|
1234
1318
|
host=server_args.host,
|
@@ -1239,6 +1323,7 @@ def launch_server(
|
|
1239
1323
|
workers=server_args.tokenizer_worker_num,
|
1240
1324
|
)
|
1241
1325
|
else:
|
1326
|
+
app.is_single_tokenizer_mode = True
|
1242
1327
|
uvicorn.run(
|
1243
1328
|
app,
|
1244
1329
|
host=server_args.host,
|
@@ -1249,10 +1334,8 @@ def launch_server(
|
|
1249
1334
|
)
|
1250
1335
|
finally:
|
1251
1336
|
if server_args.tokenizer_worker_num > 1:
|
1252
|
-
|
1253
|
-
|
1254
|
-
scheduler_info_shm.unlink()
|
1255
|
-
_global_state.tokenizer_manager.clear_tokenizer_mapping()
|
1337
|
+
multi_tokenizer_args_shm.unlink()
|
1338
|
+
_global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
|
1256
1339
|
else:
|
1257
1340
|
warmup_thread.join()
|
1258
1341
|
|
@@ -1401,13 +1484,5 @@ def _wait_and_warmup(
|
|
1401
1484
|
if server_args.debug_tensor_dump_input_file:
|
1402
1485
|
kill_process_tree(os.getpid())
|
1403
1486
|
|
1404
|
-
if server_args.pdlb_url is not None:
|
1405
|
-
register_disaggregation_server(
|
1406
|
-
server_args.disaggregation_mode,
|
1407
|
-
server_args.port,
|
1408
|
-
server_args.disaggregation_bootstrap_port,
|
1409
|
-
server_args.pdlb_url,
|
1410
|
-
)
|
1411
|
-
|
1412
1487
|
if launch_callback is not None:
|
1413
1488
|
launch_callback()
|
@@ -16,12 +16,14 @@
|
|
16
16
|
import time
|
17
17
|
import uuid
|
18
18
|
from dataclasses import dataclass
|
19
|
-
from typing import Any, Dict, List, Optional, TypeAlias, Union
|
19
|
+
from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union
|
20
20
|
|
21
21
|
from openai.types.responses import (
|
22
22
|
ResponseFunctionToolCall,
|
23
23
|
ResponseInputItemParam,
|
24
24
|
ResponseOutputItem,
|
25
|
+
ResponseOutputMessage,
|
26
|
+
ResponseOutputText,
|
25
27
|
ResponseReasoningItem,
|
26
28
|
)
|
27
29
|
from openai.types.responses.response import ToolChoice
|
@@ -228,6 +230,15 @@ class CompletionRequest(BaseModel):
|
|
228
230
|
|
229
231
|
# For request id
|
230
232
|
rid: Optional[Union[List[str], str]] = None
|
233
|
+
# Extra key for classifying the request (e.g. cache_salt)
|
234
|
+
extra_key: Optional[Union[List[str], str]] = None
|
235
|
+
# Cache salt for request caching
|
236
|
+
cache_salt: Optional[Union[List[str], str]] = None
|
237
|
+
# Priority for the request
|
238
|
+
priority: Optional[int] = None
|
239
|
+
|
240
|
+
# For custom metric labels
|
241
|
+
custom_labels: Optional[Dict[str, str]] = None
|
231
242
|
|
232
243
|
@field_validator("max_tokens")
|
233
244
|
@classmethod
|
@@ -334,7 +345,7 @@ class FunctionResponse(BaseModel):
|
|
334
345
|
"""Function response."""
|
335
346
|
|
336
347
|
name: Optional[str] = None
|
337
|
-
arguments: Optional[str] = None
|
348
|
+
arguments: Optional[str | Dict[str, Any]] = None
|
338
349
|
|
339
350
|
|
340
351
|
class ToolCall(BaseModel):
|
@@ -383,7 +394,7 @@ class Function(BaseModel):
|
|
383
394
|
"""Function descriptions."""
|
384
395
|
|
385
396
|
description: Optional[str] = Field(default=None, examples=[None])
|
386
|
-
name:
|
397
|
+
name: str
|
387
398
|
parameters: Optional[object] = None
|
388
399
|
strict: bool = False
|
389
400
|
|
@@ -447,7 +458,7 @@ class ChatCompletionRequest(BaseModel):
|
|
447
458
|
description="Constrains effort on reasoning for reasoning models. "
|
448
459
|
"'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
|
449
460
|
"result in faster responses and fewer tokens used on reasoning in a response. "
|
450
|
-
"Currently only supported for OpenAI models.",
|
461
|
+
"Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
|
451
462
|
)
|
452
463
|
|
453
464
|
@model_validator(mode="before")
|
@@ -540,11 +551,17 @@ class ChatCompletionRequest(BaseModel):
|
|
540
551
|
|
541
552
|
# For request id
|
542
553
|
rid: Optional[Union[List[str], str]] = None
|
554
|
+
# Extra key for classifying the request (e.g. cache_salt)
|
555
|
+
extra_key: Optional[Union[List[str], str]] = None
|
556
|
+
# Cache salt for request caching
|
557
|
+
cache_salt: Optional[Union[List[str], str]] = None
|
558
|
+
# Priority for the request
|
559
|
+
priority: Optional[int] = None
|
543
560
|
|
544
561
|
# For PD disaggregation
|
545
|
-
bootstrap_host: Optional[str] = None
|
546
|
-
bootstrap_port: Optional[int] = None
|
547
|
-
bootstrap_room: Optional[int] = None
|
562
|
+
bootstrap_host: Optional[Union[List[str], str]] = None
|
563
|
+
bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
|
564
|
+
bootstrap_room: Optional[Union[List[int], int]] = None
|
548
565
|
|
549
566
|
|
550
567
|
class ChatMessage(BaseModel):
|
@@ -641,6 +658,8 @@ class EmbeddingRequest(BaseModel):
|
|
641
658
|
|
642
659
|
# The request id.
|
643
660
|
rid: Optional[Union[List[str], str]] = None
|
661
|
+
# Priority for the request
|
662
|
+
priority: Optional[int] = None
|
644
663
|
|
645
664
|
|
646
665
|
class EmbeddingObject(BaseModel):
|
@@ -769,6 +788,13 @@ class ResponsesRequest(BaseModel):
|
|
769
788
|
description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
|
770
789
|
)
|
771
790
|
priority: int = Field(default=0, description="Request priority")
|
791
|
+
extra_key: Optional[str] = Field(
|
792
|
+
default=None,
|
793
|
+
description="Extra key for classifying the request (e.g. cache_salt)",
|
794
|
+
)
|
795
|
+
cache_salt: Optional[str] = Field(
|
796
|
+
default=None, description="Cache salt for request caching"
|
797
|
+
)
|
772
798
|
|
773
799
|
# SGLang-specific sampling parameters
|
774
800
|
frequency_penalty: float = 0.0
|
@@ -857,6 +883,26 @@ class ResponsesResponse(BaseModel):
|
|
857
883
|
tool_choice: str = "auto"
|
858
884
|
tools: List[ResponseTool] = Field(default_factory=list)
|
859
885
|
|
886
|
+
# OpenAI compatibility fields. not all are used at the moment.
|
887
|
+
# Recommend checking https://platform.openai.com/docs/api-reference/responses
|
888
|
+
error: Optional[dict] = None
|
889
|
+
incomplete_details: Optional[dict] = None # TODO(v) support this input
|
890
|
+
instructions: Optional[str] = None
|
891
|
+
max_output_tokens: Optional[int] = None
|
892
|
+
previous_response_id: Optional[str] = None
|
893
|
+
reasoning: Optional[dict] = (
|
894
|
+
# Unused. No model supports this. For GPT-oss, system prompt sets
|
895
|
+
# the field, not server args.
|
896
|
+
None # {"effort": Optional[str], "summary": Optional[str]}
|
897
|
+
)
|
898
|
+
store: Optional[bool] = None
|
899
|
+
temperature: Optional[float] = None
|
900
|
+
text: Optional[dict] = None # e.g. {"format": {"type": "text"}}
|
901
|
+
top_p: Optional[float] = None
|
902
|
+
truncation: Optional[str] = None
|
903
|
+
user: Optional[str] = None
|
904
|
+
metadata: Optional[Dict[str, Any]] = None
|
905
|
+
|
860
906
|
@classmethod
|
861
907
|
def from_request(
|
862
908
|
cls,
|
@@ -871,6 +917,41 @@ class ResponsesResponse(BaseModel):
|
|
871
917
|
usage: Optional[UsageInfo],
|
872
918
|
) -> "ResponsesResponse":
|
873
919
|
"""Create a response from a request."""
|
920
|
+
|
921
|
+
# Determine if the output is plain text only to set text.format
|
922
|
+
def _is_text_only(
|
923
|
+
items: List[
|
924
|
+
Union[
|
925
|
+
ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
|
926
|
+
]
|
927
|
+
]
|
928
|
+
) -> bool:
|
929
|
+
if not items:
|
930
|
+
return False
|
931
|
+
for it in items:
|
932
|
+
# tool call -> not pure text.
|
933
|
+
if isinstance(it, ResponseReasoningItem) or isinstance(
|
934
|
+
it, ResponseFunctionToolCall
|
935
|
+
):
|
936
|
+
return False
|
937
|
+
try:
|
938
|
+
if isinstance(it, ResponseOutputText):
|
939
|
+
continue
|
940
|
+
elif isinstance(it, ResponseOutputMessage):
|
941
|
+
if not it.content:
|
942
|
+
continue
|
943
|
+
for c in it.content:
|
944
|
+
if not isinstance(c, ResponseOutputText):
|
945
|
+
return False
|
946
|
+
else:
|
947
|
+
# Unknown type, not considered text-only
|
948
|
+
return False
|
949
|
+
except AttributeError:
|
950
|
+
return False
|
951
|
+
return True
|
952
|
+
|
953
|
+
text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
|
954
|
+
|
874
955
|
return cls(
|
875
956
|
id=request.request_id,
|
876
957
|
created_at=created_time,
|
@@ -881,6 +962,23 @@ class ResponsesResponse(BaseModel):
|
|
881
962
|
parallel_tool_calls=request.parallel_tool_calls or True,
|
882
963
|
tool_choice=request.tool_choice,
|
883
964
|
tools=request.tools,
|
965
|
+
# fields for parity with v1/responses
|
966
|
+
error=None,
|
967
|
+
incomplete_details=None,
|
968
|
+
instructions=request.instructions,
|
969
|
+
max_output_tokens=request.max_output_tokens,
|
970
|
+
previous_response_id=request.previous_response_id, # TODO(v): ensure this is propagated if retrieved from store
|
971
|
+
reasoning={
|
972
|
+
"effort": request.reasoning.effort if request.reasoning else None,
|
973
|
+
"summary": None, # unused
|
974
|
+
},
|
975
|
+
store=request.store,
|
976
|
+
temperature=request.temperature,
|
977
|
+
text=text_format, # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
|
978
|
+
top_p=request.top_p,
|
979
|
+
truncation=request.truncation,
|
980
|
+
user=request.user,
|
981
|
+
metadata=request.metadata or {},
|
884
982
|
)
|
885
983
|
|
886
984
|
|
@@ -919,6 +1017,16 @@ class MessageProcessingResult:
|
|
919
1017
|
tool_call_constraint: Optional[Any] = None
|
920
1018
|
|
921
1019
|
|
1020
|
+
class ToolCallProcessingResult(NamedTuple):
|
1021
|
+
"""Result of processing tool calls in a response."""
|
1022
|
+
|
1023
|
+
tool_calls: Optional[
|
1024
|
+
List[Any]
|
1025
|
+
] # List of ToolCall objects or None if parsing failed
|
1026
|
+
remaining_text: str # Text remaining after parsing tool calls
|
1027
|
+
finish_reason: Dict[str, Any] # Updated finish reason dictionary
|
1028
|
+
|
1029
|
+
|
922
1030
|
class ResponseReasoningTextContent(BaseModel):
|
923
1031
|
text: str
|
924
1032
|
type: Literal["reasoning_text"] = "reasoning_text"
|
@@ -1,15 +1,20 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import json
|
2
4
|
import logging
|
3
5
|
import uuid
|
4
6
|
from abc import ABC, abstractmethod
|
5
|
-
from typing import Any, Optional, Union
|
7
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
6
8
|
|
7
9
|
from fastapi import HTTPException, Request
|
8
10
|
from fastapi.responses import ORJSONResponse, StreamingResponse
|
9
11
|
|
10
12
|
from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
|
11
13
|
from sglang.srt.managers.io_struct import GenerateReqInput
|
12
|
-
from sglang.srt.
|
14
|
+
from sglang.srt.server_args import ServerArgs
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from sglang.srt.managers.tokenizer_manager import TokenizerManager
|
13
18
|
|
14
19
|
logger = logging.getLogger(__name__)
|
15
20
|
|
@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):
|
|
20
25
|
|
21
26
|
def __init__(self, tokenizer_manager: TokenizerManager):
|
22
27
|
self.tokenizer_manager = tokenizer_manager
|
28
|
+
self.allowed_custom_labels = (
|
29
|
+
set(
|
30
|
+
self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
|
31
|
+
)
|
32
|
+
if isinstance(self.tokenizer_manager.server_args, ServerArgs)
|
33
|
+
and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
|
34
|
+
else None
|
35
|
+
)
|
23
36
|
|
24
37
|
async def handle_request(
|
25
38
|
self, request: OpenAIServingRequest, raw_request: Request
|
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):
|
|
33
46
|
|
34
47
|
# Convert to internal format
|
35
48
|
adapted_request, processed_request = self._convert_to_internal_request(
|
36
|
-
request
|
49
|
+
request, raw_request
|
37
50
|
)
|
38
51
|
|
39
52
|
# Note(Xinyuan): raw_request below is only used for detecting the connection of the client
|
@@ -49,6 +62,12 @@ class OpenAIServingBase(ABC):
|
|
49
62
|
return self.create_error_response(
|
50
63
|
message=e.detail, err_type=str(e.status_code), status_code=e.status_code
|
51
64
|
)
|
65
|
+
except ValueError as e:
|
66
|
+
return self.create_error_response(
|
67
|
+
message=str(e),
|
68
|
+
err_type="BadRequest",
|
69
|
+
status_code=400,
|
70
|
+
)
|
52
71
|
except Exception as e:
|
53
72
|
logger.exception(f"Error in request: {e}")
|
54
73
|
return self.create_error_response(
|
@@ -73,10 +92,24 @@ class OpenAIServingBase(ABC):
|
|
73
92
|
|
74
93
|
return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
|
75
94
|
|
95
|
+
def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
|
96
|
+
"""Compute the final extra_key by concatenating cache_salt and extra_key if both are provided."""
|
97
|
+
parts = []
|
98
|
+
for key in ["cache_salt", "extra_key"]:
|
99
|
+
value = getattr(request, key, None)
|
100
|
+
if value:
|
101
|
+
if not isinstance(value, str):
|
102
|
+
raise TypeError(
|
103
|
+
f"Value of {key} must be a string, but got {type(value).__name__}"
|
104
|
+
)
|
105
|
+
parts.append(value)
|
106
|
+
return "".join(parts) if parts else None
|
107
|
+
|
76
108
|
@abstractmethod
|
77
109
|
def _convert_to_internal_request(
|
78
110
|
self,
|
79
111
|
request: OpenAIServingRequest,
|
112
|
+
raw_request: Request = None,
|
80
113
|
) -> tuple[GenerateReqInput, OpenAIServingRequest]:
|
81
114
|
"""Convert OpenAI request to internal format"""
|
82
115
|
pass
|
@@ -150,3 +183,32 @@ class OpenAIServingBase(ABC):
|
|
150
183
|
code=status_code,
|
151
184
|
)
|
152
185
|
return json.dumps({"error": error.model_dump()})
|
186
|
+
|
187
|
+
def extract_custom_labels(self, raw_request):
|
188
|
+
if (
|
189
|
+
not self.allowed_custom_labels
|
190
|
+
or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
|
191
|
+
):
|
192
|
+
return None
|
193
|
+
|
194
|
+
custom_labels = None
|
195
|
+
header = (
|
196
|
+
self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
|
197
|
+
)
|
198
|
+
try:
|
199
|
+
raw_labels = (
|
200
|
+
json.loads(raw_request.headers.get(header))
|
201
|
+
if raw_request and raw_request.headers.get(header)
|
202
|
+
else None
|
203
|
+
)
|
204
|
+
except json.JSONDecodeError as e:
|
205
|
+
logger.exception(f"Error in request: {e}")
|
206
|
+
raw_labels = None
|
207
|
+
|
208
|
+
if isinstance(raw_labels, dict):
|
209
|
+
custom_labels = {
|
210
|
+
label: value
|
211
|
+
for label, value in raw_labels.items()
|
212
|
+
if label in self.allowed_custom_labels
|
213
|
+
}
|
214
|
+
return custom_labels
|