sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/lang/interpreter.py +1 -1
  7. sglang/launch_server.py +14 -0
  8. sglang/profiler.py +2 -2
  9. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  10. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  11. sglang/srt/configs/__init__.py +8 -0
  12. sglang/srt/configs/device_config.py +3 -1
  13. sglang/srt/configs/dots_ocr.py +64 -0
  14. sglang/srt/configs/dots_vlm.py +139 -0
  15. sglang/srt/configs/falcon_h1.py +360 -0
  16. sglang/srt/configs/internvl.py +6 -0
  17. sglang/srt/configs/load_config.py +9 -0
  18. sglang/srt/configs/model_config.py +181 -82
  19. sglang/srt/configs/qwen3_next.py +326 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +71 -19
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +326 -53
  44. sglang/srt/disaggregation/prefill.py +36 -17
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +192 -113
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  52. sglang/srt/entrypoints/grpc_server.py +810 -0
  53. sglang/srt/entrypoints/http_server.py +132 -57
  54. sglang/srt/entrypoints/openai/protocol.py +115 -7
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +207 -58
  57. sglang/srt/entrypoints/openai/serving_completions.py +17 -4
  58. sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +49 -4
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/environ.py +285 -0
  63. sglang/srt/eplb/eplb_manager.py +2 -2
  64. sglang/srt/eplb/expert_distribution.py +26 -13
  65. sglang/srt/eplb/expert_location.py +38 -8
  66. sglang/srt/eplb/expert_location_updater.py +1 -1
  67. sglang/srt/function_call/base_format_detector.py +3 -6
  68. sglang/srt/function_call/ebnf_composer.py +11 -9
  69. sglang/srt/function_call/function_call_parser.py +9 -2
  70. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  71. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  72. sglang/srt/function_call/json_array_parser.py +63 -0
  73. sglang/srt/function_call/kimik2_detector.py +17 -4
  74. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  75. sglang/srt/function_call/utils.py +96 -5
  76. sglang/srt/grpc/__init__.py +1 -0
  77. sglang/srt/grpc/compile_proto.py +245 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  81. sglang/srt/layers/activation.py +143 -9
  82. sglang/srt/layers/attention/aiter_backend.py +106 -82
  83. sglang/srt/layers/attention/ascend_backend.py +115 -9
  84. sglang/srt/layers/attention/attention_registry.py +206 -0
  85. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  86. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  87. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  88. sglang/srt/layers/attention/fla/chunk.py +242 -0
  89. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  90. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  91. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  92. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  93. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  94. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  95. sglang/srt/layers/attention/fla/index.py +37 -0
  96. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  97. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  98. sglang/srt/layers/attention/fla/op.py +66 -0
  99. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  100. sglang/srt/layers/attention/fla/utils.py +331 -0
  101. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  102. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  103. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  104. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  105. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  106. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  107. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  108. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  109. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  111. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  112. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  113. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  114. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  115. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  121. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  122. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  123. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  124. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  125. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  126. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  127. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  128. sglang/srt/layers/attention/nsa/utils.py +24 -0
  129. sglang/srt/layers/attention/nsa_backend.py +887 -0
  130. sglang/srt/layers/attention/tbo_backend.py +6 -6
  131. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  132. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  133. sglang/srt/layers/attention/triton_backend.py +57 -7
  134. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  135. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  136. sglang/srt/layers/attention/vision.py +58 -0
  137. sglang/srt/layers/attention/wave_backend.py +4 -4
  138. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  139. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  140. sglang/srt/layers/communicator.py +53 -7
  141. sglang/srt/layers/dp_attention.py +41 -2
  142. sglang/srt/layers/elementwise.py +3 -1
  143. sglang/srt/layers/layernorm.py +34 -15
  144. sglang/srt/layers/linear.py +55 -7
  145. sglang/srt/layers/logits_processor.py +44 -12
  146. sglang/srt/layers/moe/__init__.py +2 -1
  147. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  148. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  149. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  150. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  151. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  167. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  169. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  170. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  171. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  172. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  173. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  174. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  175. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  176. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  177. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  178. sglang/srt/layers/moe/topk.py +30 -9
  179. sglang/srt/layers/moe/utils.py +22 -7
  180. sglang/srt/layers/parameter.py +23 -6
  181. sglang/srt/layers/quantization/awq.py +19 -7
  182. sglang/srt/layers/quantization/base_config.py +11 -6
  183. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  184. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  185. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  186. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  187. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  188. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  189. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  190. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  191. sglang/srt/layers/quantization/fp8.py +78 -49
  192. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  193. sglang/srt/layers/quantization/gptq.py +25 -17
  194. sglang/srt/layers/quantization/modelopt_quant.py +225 -57
  195. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  196. sglang/srt/layers/quantization/mxfp4.py +77 -42
  197. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  199. sglang/srt/layers/quantization/quark/utils.py +97 -0
  200. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  201. sglang/srt/layers/quantization/unquant.py +135 -47
  202. sglang/srt/layers/quantization/w4afp8.py +26 -17
  203. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  204. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  205. sglang/srt/layers/rocm_linear_utils.py +44 -0
  206. sglang/srt/layers/rotary_embedding.py +78 -49
  207. sglang/srt/layers/sampler.py +213 -21
  208. sglang/srt/layers/utils.py +23 -0
  209. sglang/srt/lora/backend/base_backend.py +50 -8
  210. sglang/srt/lora/backend/chunked_backend.py +348 -0
  211. sglang/srt/lora/backend/triton_backend.py +99 -5
  212. sglang/srt/lora/layers.py +32 -0
  213. sglang/srt/lora/lora.py +8 -3
  214. sglang/srt/lora/lora_manager.py +52 -118
  215. sglang/srt/lora/mem_pool.py +25 -11
  216. sglang/srt/lora/triton_ops/__init__.py +4 -0
  217. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  219. sglang/srt/lora/utils.py +22 -11
  220. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  221. sglang/srt/managers/cache_controller.py +215 -314
  222. sglang/srt/managers/data_parallel_controller.py +115 -80
  223. sglang/srt/managers/detokenizer_manager.py +19 -15
  224. sglang/srt/managers/disagg_service.py +46 -0
  225. sglang/srt/managers/io_struct.py +340 -109
  226. sglang/srt/managers/mm_utils.py +44 -6
  227. sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
  228. sglang/srt/managers/multimodal_processor.py +1 -2
  229. sglang/srt/managers/overlap_utils.py +53 -0
  230. sglang/srt/managers/schedule_batch.py +240 -138
  231. sglang/srt/managers/schedule_policy.py +147 -19
  232. sglang/srt/managers/scheduler.py +501 -304
  233. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  234. sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
  235. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  236. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  237. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  238. sglang/srt/managers/template_manager.py +3 -3
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +321 -632
  241. sglang/srt/managers/tp_worker.py +81 -22
  242. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  243. sglang/srt/managers/utils.py +1 -45
  244. sglang/srt/mem_cache/allocator.py +15 -21
  245. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  246. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  247. sglang/srt/mem_cache/chunk_cache.py +8 -1
  248. sglang/srt/mem_cache/evict_policy.py +23 -0
  249. sglang/srt/mem_cache/hicache_storage.py +58 -34
  250. sglang/srt/mem_cache/hiradix_cache.py +227 -80
  251. sglang/srt/mem_cache/memory_pool.py +535 -58
  252. sglang/srt/mem_cache/memory_pool_host.py +239 -223
  253. sglang/srt/mem_cache/radix_cache.py +222 -73
  254. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  255. sglang/srt/mem_cache/storage/__init__.py +10 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  257. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  258. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  259. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  260. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  261. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  262. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  263. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
  264. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  265. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  266. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
  267. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  268. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  269. sglang/srt/metrics/collector.py +519 -132
  270. sglang/srt/metrics/func_timer.py +2 -7
  271. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  272. sglang/srt/metrics/utils.py +55 -0
  273. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  274. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  275. sglang/srt/model_executor/forward_batch_info.py +98 -57
  276. sglang/srt/model_executor/model_runner.py +433 -158
  277. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  278. sglang/srt/model_loader/__init__.py +9 -3
  279. sglang/srt/model_loader/loader.py +133 -5
  280. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  281. sglang/srt/model_loader/weight_utils.py +158 -3
  282. sglang/srt/models/apertus.py +686 -0
  283. sglang/srt/models/bailing_moe.py +820 -217
  284. sglang/srt/models/bailing_moe_nextn.py +168 -0
  285. sglang/srt/models/deepseek_nextn.py +6 -1
  286. sglang/srt/models/deepseek_v2.py +833 -152
  287. sglang/srt/models/dots_ocr.py +173 -0
  288. sglang/srt/models/dots_vlm.py +174 -0
  289. sglang/srt/models/dots_vlm_vit.py +337 -0
  290. sglang/srt/models/ernie4.py +1 -1
  291. sglang/srt/models/falcon_h1.py +576 -0
  292. sglang/srt/models/gemma3_causal.py +0 -2
  293. sglang/srt/models/gemma3_mm.py +1 -1
  294. sglang/srt/models/gemma3n_mm.py +2 -2
  295. sglang/srt/models/glm4_moe.py +14 -5
  296. sglang/srt/models/glm4_moe_nextn.py +2 -2
  297. sglang/srt/models/glm4v.py +5 -3
  298. sglang/srt/models/glm4v_moe.py +4 -1
  299. sglang/srt/models/gpt_oss.py +8 -31
  300. sglang/srt/models/internvl.py +28 -0
  301. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  302. sglang/srt/models/llama.py +4 -0
  303. sglang/srt/models/llama4.py +9 -0
  304. sglang/srt/models/llama_eagle3.py +13 -0
  305. sglang/srt/models/longcat_flash.py +3 -3
  306. sglang/srt/models/longcat_flash_nextn.py +1 -1
  307. sglang/srt/models/minicpmv.py +165 -3
  308. sglang/srt/models/mllama4.py +40 -4
  309. sglang/srt/models/opt.py +637 -0
  310. sglang/srt/models/qwen2_5_vl.py +29 -5
  311. sglang/srt/models/qwen2_audio.py +1 -1
  312. sglang/srt/models/qwen2_moe.py +124 -14
  313. sglang/srt/models/qwen2_vl.py +1 -1
  314. sglang/srt/models/qwen3.py +26 -5
  315. sglang/srt/models/qwen3_moe.py +71 -12
  316. sglang/srt/models/qwen3_next.py +1069 -0
  317. sglang/srt/models/qwen3_next_mtp.py +112 -0
  318. sglang/srt/models/qwen3_vl.py +787 -0
  319. sglang/srt/models/qwen3_vl_moe.py +471 -0
  320. sglang/srt/models/registry.py +15 -3
  321. sglang/srt/models/sarashina2_vision.py +269 -0
  322. sglang/srt/models/solar.py +505 -0
  323. sglang/srt/models/starcoder2.py +357 -0
  324. sglang/srt/models/step3_vl.py +1 -1
  325. sglang/srt/models/torch_native_llama.py +10 -3
  326. sglang/srt/models/utils.py +51 -0
  327. sglang/srt/multimodal/processors/base_processor.py +15 -7
  328. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  329. sglang/srt/multimodal/processors/glm4v.py +9 -9
  330. sglang/srt/multimodal/processors/internvl.py +153 -129
  331. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  332. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  333. sglang/srt/offloader.py +27 -3
  334. sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
  335. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  336. sglang/srt/sampling/sampling_batch_info.py +38 -17
  337. sglang/srt/sampling/sampling_params.py +7 -0
  338. sglang/srt/server_args.py +1030 -254
  339. sglang/srt/server_args_config_parser.py +146 -0
  340. sglang/srt/single_batch_overlap.py +151 -0
  341. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  342. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  343. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  344. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  345. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  346. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  347. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  348. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  349. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  350. sglang/srt/speculative/eagle_worker.py +253 -136
  351. sglang/srt/speculative/ngram_utils.py +428 -0
  352. sglang/srt/speculative/ngram_worker.py +245 -0
  353. sglang/srt/speculative/spec_info.py +52 -0
  354. sglang/srt/speculative/spec_utils.py +606 -0
  355. sglang/srt/speculative/standalone_worker.py +109 -0
  356. sglang/srt/torch_memory_saver_adapter.py +5 -7
  357. sglang/srt/tracing/trace.py +578 -0
  358. sglang/srt/two_batch_overlap.py +8 -5
  359. sglang/srt/utils/__init__.py +2 -0
  360. sglang/srt/{utils.py → utils/common.py} +445 -77
  361. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  362. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  363. sglang/srt/utils/rpd_utils.py +452 -0
  364. sglang/srt/utils/slow_rank_detector.py +71 -0
  365. sglang/srt/warmup.py +8 -4
  366. sglang/srt/weight_sync/utils.py +2 -2
  367. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  368. sglang/test/few_shot_gsm8k.py +1 -0
  369. sglang/test/get_logits_ut.py +57 -0
  370. sglang/test/run_eval.py +79 -11
  371. sglang/test/runners.py +5 -1
  372. sglang/test/simple_eval_common.py +5 -2
  373. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  374. sglang/test/test_block_fp8.py +2 -2
  375. sglang/test/test_cutlass_moe.py +24 -6
  376. sglang/test/test_deterministic.py +297 -0
  377. sglang/test/test_disaggregation_utils.py +77 -0
  378. sglang/test/test_fp4_moe.py +370 -1
  379. sglang/test/test_programs.py +1 -1
  380. sglang/test/test_utils.py +383 -5
  381. sglang/utils.py +22 -1
  382. sglang/version.py +1 -1
  383. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  384. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
  385. sglang/srt/disaggregation/launch_lb.py +0 -118
  386. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  387. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  388. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  389. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  390. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  391. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  392. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  393. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  394. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  395. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py

@@ -27,7 +27,9 @@ import tempfile
 import threading
 import time
 from http import HTTPStatus
-from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
+
+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info

 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -45,11 +47,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse

-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -72,9 +70,11 @@ from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
     ConfigureLoggingReq,
+    DestroyWeightsUpdateGroupReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
     InitWeightsUpdateGroupReqInput,
     LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
@@ -82,6 +82,7 @@ from sglang.srt.managers.io_struct import (
     ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SendWeightsToRemoteInstanceReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
@@ -93,16 +94,17 @@ from sglang.srt.managers.io_struct import (
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.multi_tokenizer_mixin import (
-    MultiTokenizerManager,
-    deserialize_data,
+    MultiTokenizerRouter,
+    TokenizerWorker,
     get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
     read_from_shared_memory,
     write_data_for_multi_tokenizer,
 )
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
@@ -125,7 +127,7 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 # Store global states
 @dataclasses.dataclass
 class _GlobalState:
-    tokenizer_manager: TokenizerManager
+    tokenizer_manager: Union[TokenizerManager, MultiTokenizerRouter, TokenizerWorker]
     template_manager: TemplateManager
     scheduler_info: Dict

@@ -138,21 +140,6 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


-# Function to set up all middlewares for multi-tokenizer compatibility
-def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
-    """Setup all middlewares for both single and multi-process modes"""
-    worker_pid = os.getpid()
-
-    if api_key:
-        add_api_key_middleware(app, api_key)
-        logger.info(f"Worker {worker_pid} added API key middleware")
-
-    if enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-
-
 async def init_multi_tokenizer() -> ServerArgs:
     """Read args information from shm and init tokenizer manager for current process"""
     pid = os.getpid()
@@ -160,18 +147,22 @@ async def init_multi_tokenizer() -> ServerArgs:
     logger.info(f"current worker_id: {pid}, main processID: {main_pid}")

     # Read configuration from shared memory
-    port_args_data = read_from_shared_memory(f"port_args_{main_pid}")
-    server_args_data = read_from_shared_memory(f"server_args_{main_pid}")
-    scheduler_info_data = read_from_shared_memory(f"scheduler_info_{main_pid}")
-    port_args, server_args = deserialize_data(port_args_data, server_args_data)
-    scheduler_info = scheduler_info_data
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"

     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
     )

     # Launch multi-tokenizer manager process
-    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    tokenizer_manager = TokenizerWorker(server_args, port_args)
     template_manager = TemplateManager()
     template_manager.initialize_templates(
         tokenizer_manager=tokenizer_manager,
@@ -190,18 +181,29 @@ async def init_multi_tokenizer() -> ServerArgs:
             scheduler_info=scheduler_info,
         )
     )
+
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
+            trace_set_thread_info(thread_label)
+
     return server_args


 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-    server_args = getattr(fast_api_app, "server_args", None)
-    if server_args is None:
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
         # Initialize multi-tokenizer support for worker processes
-        fast_api_app.server_args = await init_multi_tokenizer()
-        setup_middlewares(
-            fast_api_app.server_args.api_key, fast_api_app.server_args.enable_metrics
-        )
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+            logger.info(f"Worker {worker_pid} added prometheus middleware")
     fast_api_app.warmup_thread = threading.Thread(
         target=_wait_and_warmup,
         args=(
@@ -297,7 +299,23 @@ app.add_middleware(

 @app.exception_handler(HTTPException)
 async def validation_exception_handler(request: Request, exc: HTTPException):
-    """Enrich HTTP exception with status code and other details"""
+    """Enrich HTTP exception with status code and other details.
+
+    For /v1/responses, emit OpenAI-style nested error envelope:
+    {"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
+    """
+    # adjust fmt for responses api
+    if request.url.path.startswith("/v1/responses"):
+        nested_error = {
+            "message": exc.detail,
+            "type": HTTPStatus(exc.status_code).phrase,
+            "param": None,
+            "code": exc.status_code,
+        }
+        return ORJSONResponse(
+            content={"error": nested_error}, status_code=exc.status_code
+        )
+
     error = ErrorResponse(
         object="error",
         message=exc.detail,
@@ -310,7 +328,10 @@ async def validation_exception_handler(request: Request, exc: HTTPException):
 # Custom exception handlers to change validation error status codes
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
-    """Override FastAPI's default 422 validation error with 400"""
+    """Override FastAPI's default 422 validation error with 400.
+
+    For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
+    """
     exc_str = str(exc)
     errors_str = str(exc.errors())

@@ -319,6 +340,16 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
     else:
         message = exc_str

+    if request.url.path.startswith("/v1/responses"):
+        # adapt specially, for v1/responses API only (notice the error key is different)
+        nested_error = {
+            "message": message,
+            "type": HTTPStatus.BAD_REQUEST.phrase,
+            "param": None,
+            "code": HTTPStatus.BAD_REQUEST.value,
+        }
+        return ORJSONResponse(status_code=400, content={"error": nested_error})
+
     err = ErrorResponse(
         message=message,
         type=HTTPStatus.BAD_REQUEST.phrase,
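Note: the net effect of the two handlers above is that the same failure now surfaces in two shapes depending on the route. A minimal sketch of both payloads, assuming an illustrative 400 with a placeholder message (the flat shape is inferred from the visible ErrorResponse fields and may carry additional keys):

    # /v1/responses: OpenAI-style nested envelope
    {"error": {"message": "Invalid request", "type": "Bad Request", "param": None, "code": 400}}

    # all other endpoints: legacy flat ErrorResponse, roughly
    {"object": "error", "message": "Invalid request", "type": "Bad Request", "code": 400}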
@@ -679,6 +710,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     )


+@app.post("/init_weights_send_group_for_remote_instance")
+async def init_weights_send_group_for_remote_instance(
+    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/send_weights_to_remote_instance")
+async def send_weights_to_remote_instance(
+    obj: SendWeightsToRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.send_weights_to_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.post("/init_weights_update_group")
 async def init_weights_update_group(
     obj: InitWeightsUpdateGroupReqInput, request: Request
@@ -694,6 +757,20 @@ async def init_weights_update_group(
         return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)


+@app.post("/destroy_weights_update_group")
+async def destroy_weights_update_group(
+    obj: DestroyWeightsUpdateGroupReqInput, request: Request
+):
+    """Destroy the parameter update group."""
+    success, message = (
+        await _global_state.tokenizer_manager.destroy_weights_update_group(obj, request)
+    )
+    content = {"success": success, "message": message}
+    return ORJSONResponse(
+        content, status_code=200 if success else HTTPStatus.BAD_REQUEST
+    )
+
+
 @app.post("/update_weights_from_tensor")
 async def update_weights_from_tensor(
     obj: UpdateWeightsFromTensorReqInput, request: Request
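Note: the weight-management endpoints added above all return the same `{"success", "message"}` body, with 200 on success and 400 otherwise. A sketch of calling one of them with `requests`; the host/port and the empty payload are placeholders, and the accepted fields are defined by the corresponding `*ReqInput` classes in `sglang/srt/managers/io_struct.py`:

    import requests

    # Hypothetical local server; the real payload fields come from
    # DestroyWeightsUpdateGroupReqInput and are elided here.
    resp = requests.post(
        "http://localhost:30000/destroy_weights_update_group", json={}
    )
    # Per the handler above: 200 with {'success': True, 'message': '...'}
    # on success, 400 with the same shape otherwise.
    print(resp.status_code, resp.json())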
@@ -1178,6 +1255,12 @@ def launch_server(
         server_args=server_args,
     )

+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = "Tokenizer"
+            trace_set_thread_info(thread_label)
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1187,12 +1270,10 @@ def launch_server(
     )

     if server_args.tokenizer_worker_num > 1:
-        port_args_shm, server_args_shm, scheduler_info_shm = (
-            write_data_for_multi_tokenizer(
-                port_args,
-                server_args,
-                scheduler_info,
-            )
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
+            server_args,
+            scheduler_info,
         )
     else:
         # Add api key authorization
@@ -1229,6 +1310,9 @@ def launch_server(
                 "level": "INFO",
                 "propagate": False,
             }
+
+            monkey_patch_uvicorn_multiprocessing()
+
             uvicorn.run(
                 "sglang.srt.entrypoints.http_server:app",
                 host=server_args.host,
@@ -1239,6 +1323,7 @@ def launch_server(
                 workers=server_args.tokenizer_worker_num,
             )
         else:
+            app.is_single_tokenizer_mode = True
             uvicorn.run(
                 app,
                 host=server_args.host,
@@ -1249,10 +1334,8 @@ def launch_server(
             )
     finally:
         if server_args.tokenizer_worker_num > 1:
-            port_args_shm.unlink()
-            server_args_shm.unlink()
-            scheduler_info_shm.unlink()
-            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
         else:
             warmup_thread.join()

@@ -1401,13 +1484,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())

-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()
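Note: both tracing blocks added in this file gate on `server_args.enable_trace` and read the collector endpoint from `server_args.oltp_traces_endpoint`. Assuming the usual dashed spelling of these server args (unverified here; check `sglang/srt/server_args.py` in this release), enabling them would look something like:

    # Flag names inferred from the server-arg attribute names above.
    python -m sglang.launch_server --model-path <model> --enable-trace --oltp-traces-endpoint <host:port>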
sglang/srt/entrypoints/openai/protocol.py

@@ -16,12 +16,14 @@
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, TypeAlias, Union
+from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union

 from openai.types.responses import (
     ResponseFunctionToolCall,
     ResponseInputItemParam,
     ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
     ResponseReasoningItem,
 )
 from openai.types.responses.response import ToolChoice
@@ -228,6 +230,15 @@ class CompletionRequest(BaseModel):

     # For request id
     rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # For custom metric labels
+    custom_labels: Optional[Dict[str, str]] = None

     @field_validator("max_tokens")
     @classmethod
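Note: a sketch of a completion payload exercising the new request fields; model and prompt values are placeholders. Per `_compute_extra_key` in serving_base.py (later in this diff), `cache_salt` and `extra_key` are concatenated downstream into a single cache-classification key, and `custom_labels` feeds the tokenizer metrics:

    payload = {
        "model": "my-model",                   # placeholder
        "prompt": "Hello",
        "cache_salt": "tenant-a",              # salt for request caching
        "extra_key": "exp-1",                  # extra classification key
        "priority": 1,                         # request priority
        "custom_labels": {"team": "search"},   # custom metric labels
    }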
@@ -334,7 +345,7 @@ class FunctionResponse(BaseModel):
     """Function response."""

     name: Optional[str] = None
-    arguments: Optional[str] = None
+    arguments: Optional[str | Dict[str, Any]] = None


 class ToolCall(BaseModel):
@@ -383,7 +394,7 @@ class Function(BaseModel):
     """Function descriptions."""

     description: Optional[str] = Field(default=None, examples=[None])
-    name: Optional[str] = None
+    name: str
     parameters: Optional[object] = None
     strict: bool = False

@@ -447,7 +458,7 @@ class ChatCompletionRequest(BaseModel):
         description="Constrains effort on reasoning for reasoning models. "
         "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
         "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models.",
+        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
     )

     @model_validator(mode="before")
@@ -540,11 +551,17 @@ class ChatCompletionRequest(BaseModel):

     # For request id
     rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None

     # For PD disaggregation
-    bootstrap_host: Optional[str] = None
-    bootstrap_port: Optional[int] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None


 class ChatMessage(BaseModel):
@@ -641,6 +658,8 @@ class EmbeddingRequest(BaseModel):

     # The request id.
     rid: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None


 class EmbeddingObject(BaseModel):
@@ -769,6 +788,13 @@ class ResponsesRequest(BaseModel):
         description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
     )
     priority: int = Field(default=0, description="Request priority")
+    extra_key: Optional[str] = Field(
+        default=None,
+        description="Extra key for classifying the request (e.g. cache_salt)",
+    )
+    cache_salt: Optional[str] = Field(
+        default=None, description="Cache salt for request caching"
+    )

     # SGLang-specific sampling parameters
     frequency_penalty: float = 0.0
@@ -857,6 +883,26 @@ class ResponsesResponse(BaseModel):
     tool_choice: str = "auto"
     tools: List[ResponseTool] = Field(default_factory=list)

+    # OpenAI compatibility fields. not all are used at the moment.
+    # Recommend checking https://platform.openai.com/docs/api-reference/responses
+    error: Optional[dict] = None
+    incomplete_details: Optional[dict] = None  # TODO(v) support this input
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[dict] = (
+        # Unused. No model supports this. For GPT-oss, system prompt sets
+        # the field, not server args.
+        None  # {"effort": Optional[str], "summary": Optional[str]}
+    )
+    store: Optional[bool] = None
+    temperature: Optional[float] = None
+    text: Optional[dict] = None  # e.g. {"format": {"type": "text"}}
+    top_p: Optional[float] = None
+    truncation: Optional[str] = None
+    user: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+
     @classmethod
     def from_request(
         cls,
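Note: with the parity fields above, a serialized ResponsesResponse always carries the OpenAI-documented keys, most of them null until supported. An abbreviated, illustrative serialization (all values are placeholders, assuming a text-only output):

    {
        "id": "resp_abc123",                   # placeholder id
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "max_output_tokens": None,
        "previous_response_id": None,
        "reasoning": {"effort": None, "summary": None},
        "store": None,
        "temperature": 0.7,                    # echoed from the request
        "text": {"format": {"type": "text"}},  # set only when output is text-only
        "top_p": 1.0,
        "truncation": None,
        "user": None,
        "metadata": {},
    }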
@@ -871,6 +917,41 @@ class ResponsesResponse(BaseModel):
         usage: Optional[UsageInfo],
     ) -> "ResponsesResponse":
         """Create a response from a request."""
+
+        # Determine if the output is plain text only to set text.format
+        def _is_text_only(
+            items: List[
+                Union[
+                    ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
+                ]
+            ]
+        ) -> bool:
+            if not items:
+                return False
+            for it in items:
+                # tool call -> not pure text.
+                if isinstance(it, ResponseReasoningItem) or isinstance(
+                    it, ResponseFunctionToolCall
+                ):
+                    return False
+                try:
+                    if isinstance(it, ResponseOutputText):
+                        continue
+                    elif isinstance(it, ResponseOutputMessage):
+                        if not it.content:
+                            continue
+                        for c in it.content:
+                            if not isinstance(c, ResponseOutputText):
+                                return False
+                    else:
+                        # Unknown type, not considered text-only
+                        return False
+                except AttributeError:
+                    return False
+            return True
+
+        text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
+
         return cls(
             id=request.request_id,
             created_at=created_time,
@@ -881,6 +962,23 @@ class ResponsesResponse(BaseModel):
             parallel_tool_calls=request.parallel_tool_calls or True,
             tool_choice=request.tool_choice,
             tools=request.tools,
+            # fields for parity with v1/responses
+            error=None,
+            incomplete_details=None,
+            instructions=request.instructions,
+            max_output_tokens=request.max_output_tokens,
+            previous_response_id=request.previous_response_id,  # TODO(v): ensure this is propagated if retrieved from store
+            reasoning={
+                "effort": request.reasoning.effort if request.reasoning else None,
+                "summary": None,  # unused
+            },
+            store=request.store,
+            temperature=request.temperature,
+            text=text_format,  # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
+            top_p=request.top_p,
+            truncation=request.truncation,
+            user=request.user,
+            metadata=request.metadata or {},
         )


@@ -919,6 +1017,16 @@ class MessageProcessingResult:
     tool_call_constraint: Optional[Any] = None


+class ToolCallProcessingResult(NamedTuple):
+    """Result of processing tool calls in a response."""
+
+    tool_calls: Optional[
+        List[Any]
+    ]  # List of ToolCall objects or None if parsing failed
+    remaining_text: str  # Text remaining after parsing tool calls
+    finish_reason: Dict[str, Any]  # Updated finish reason dictionary
+
+
 class ResponseReasoningTextContent(BaseModel):
     text: str
     type: Literal["reasoning_text"] = "reasoning_text"
sglang/srt/entrypoints/openai/serving_base.py

@@ -1,15 +1,20 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union

 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager

 logger = logging.getLogger(__name__)

@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):

     def __init__(self, tokenizer_manager: TokenizerManager):
         self.tokenizer_manager = tokenizer_manager
+        self.allowed_custom_labels = (
+            set(
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
+            )
+            if isinstance(self.tokenizer_manager.server_args, ServerArgs)
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
+            else None
+        )

     async def handle_request(
         self, request: OpenAIServingRequest, raw_request: Request
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):

             # Convert to internal format
             adapted_request, processed_request = self._convert_to_internal_request(
-                request
+                request, raw_request
             )

             # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
@@ -49,6 +62,12 @@ class OpenAIServingBase(ABC):
             return self.create_error_response(
                 message=e.detail, err_type=str(e.status_code), status_code=e.status_code
             )
+        except ValueError as e:
+            return self.create_error_response(
+                message=str(e),
+                err_type="BadRequest",
+                status_code=400,
+            )
         except Exception as e:
             logger.exception(f"Error in request: {e}")
             return self.create_error_response(
@@ -73,10 +92,24 @@ class OpenAIServingBase(ABC):

         return f"{self._request_id_prefix()}{uuid.uuid4().hex}"

+    def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Compute the final extra_key by concatenating cache_salt and extra_key if both are provided."""
+        parts = []
+        for key in ["cache_salt", "extra_key"]:
+            value = getattr(request, key, None)
+            if value:
+                if not isinstance(value, str):
+                    raise TypeError(
+                        f"Value of {key} must be a string, but got {type(value).__name__}"
+                    )
+                parts.append(value)
+        return "".join(parts) if parts else None
+
     @abstractmethod
     def _convert_to_internal_request(
         self,
         request: OpenAIServingRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
         """Convert OpenAI request to internal format"""
@@ -150,3 +183,32 @@ class OpenAIServingBase(ABC):
             code=status_code,
         )
         return json.dumps({"error": error.model_dump()})
+
+    def extract_custom_labels(self, raw_request):
+        if (
+            not self.allowed_custom_labels
+            or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        ):
+            return None
+
+        custom_labels = None
+        header = (
+            self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        )
+        try:
+            raw_labels = (
+                json.loads(raw_request.headers.get(header))
+                if raw_request and raw_request.headers.get(header)
+                else None
+            )
+        except json.JSONDecodeError as e:
+            logger.exception(f"Error in request: {e}")
+            raw_labels = None
+
+        if isinstance(raw_labels, dict):
+            custom_labels = {
+                label: value
+                for label, value in raw_labels.items()
+                if label in self.allowed_custom_labels
+            }
+        return custom_labels
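Note: assuming a server configured with a metrics custom-labels header via the `tokenizer_metrics_custom_labels_header` / `tokenizer_metrics_allowed_custom_labels` server args referenced above, a client attaches labels as a JSON-encoded header and `extract_custom_labels` keeps only the allow-listed keys. A sketch (the header name "x-custom-labels" and all values are illustrative):

    import json
    import requests

    # "x-custom-labels" is a hypothetical header name; the real one is whatever
    # tokenizer_metrics_custom_labels_header was set to at launch.
    labels = {"team": "search", "env": "prod", "other": "silently dropped"}
    requests.post(
        "http://localhost:30000/v1/chat/completions",
        headers={"x-custom-labels": json.dumps(labels)},
        json={"model": "my-model", "messages": [{"role": "user", "content": "hi"}]},
    )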