sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -27,9 +27,9 @@ import tempfile
27
27
  import threading
28
28
  import time
29
29
  from http import HTTPStatus
30
- from typing import Any, AsyncIterator, Callable, Dict, List, Optional
30
+ from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
31
31
 
32
- import setproctitle
32
+ from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
33
33
 
34
34
  # Fix a bug of Python threading
35
35
  setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -47,11 +47,7 @@ from fastapi.exceptions import RequestValidationError
47
47
  from fastapi.middleware.cors import CORSMiddleware
48
48
  from fastapi.responses import ORJSONResponse, Response, StreamingResponse
49
49
 
50
- from sglang.srt.disaggregation.utils import (
51
- FAKE_BOOTSTRAP_HOST,
52
- DisaggregationMode,
53
- register_disaggregation_server,
54
- )
50
+ from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
55
51
  from sglang.srt.entrypoints.engine import _launch_subprocesses
56
52
  from sglang.srt.entrypoints.openai.protocol import (
57
53
  ChatCompletionRequest,
@@ -74,9 +70,11 @@ from sglang.srt.managers.io_struct import (
74
70
  AbortReq,
75
71
  CloseSessionReqInput,
76
72
  ConfigureLoggingReq,
73
+ DestroyWeightsUpdateGroupReqInput,
77
74
  EmbeddingReqInput,
78
75
  GenerateReqInput,
79
76
  GetWeightsByNameReqInput,
77
+ InitWeightsSendGroupForRemoteInstanceReqInput,
80
78
  InitWeightsUpdateGroupReqInput,
81
79
  LoadLoRAAdapterReqInput,
82
80
  OpenSessionReqInput,
@@ -84,6 +82,7 @@ from sglang.srt.managers.io_struct import (
84
82
  ProfileReqInput,
85
83
  ReleaseMemoryOccupationReqInput,
86
84
  ResumeMemoryOccupationReqInput,
85
+ SendWeightsToRemoteInstanceReqInput,
87
86
  SeparateReasoningReqInput,
88
87
  SetInternalStateReq,
89
88
  SlowDownReqInput,
@@ -95,9 +94,10 @@ from sglang.srt.managers.io_struct import (
95
94
  VertexGenerateReqInput,
96
95
  )
97
96
  from sglang.srt.managers.multi_tokenizer_mixin import (
98
- MultiTokenizerManager,
99
- deserialize_data,
97
+ MultiTokenizerRouter,
98
+ TokenizerWorker,
100
99
  get_main_process_id,
100
+ monkey_patch_uvicorn_multiprocessing,
101
101
  read_from_shared_memory,
102
102
  write_data_for_multi_tokenizer,
103
103
  )
@@ -127,7 +127,7 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
127
127
  # Store global states
128
128
  @dataclasses.dataclass
129
129
  class _GlobalState:
130
- tokenizer_manager: TokenizerManager
130
+ tokenizer_manager: Union[TokenizerManager, MultiTokenizerRouter, TokenizerWorker]
131
131
  template_manager: TemplateManager
132
132
  scheduler_info: Dict
133
133
 
@@ -140,21 +140,6 @@ def set_global_state(global_state: _GlobalState):
140
140
  _global_state = global_state
141
141
 
142
142
 
143
- # Function to set up all middlewares for multi-tokenizer compatibility
144
- def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
145
- """Setup all middlewares for both single and multi-process modes"""
146
- worker_pid = os.getpid()
147
-
148
- if api_key:
149
- add_api_key_middleware(app, api_key)
150
- logger.info(f"Worker {worker_pid} added API key middleware")
151
-
152
- if enable_metrics:
153
- add_prometheus_middleware(app)
154
- enable_func_timer()
155
- logger.info(f"Worker {worker_pid} added prometheus middleware")
156
-
157
-
158
143
  async def init_multi_tokenizer() -> ServerArgs:
159
144
  """Read args information from shm and init tokenizer manager for current process"""
160
145
  pid = os.getpid()
@@ -162,18 +147,22 @@ async def init_multi_tokenizer() -> ServerArgs:
162
147
  logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
163
148
 
164
149
  # Read configuration from shared memory
165
- port_args_data = read_from_shared_memory(f"port_args_{main_pid}")
166
- server_args_data = read_from_shared_memory(f"server_args_{main_pid}")
167
- scheduler_info_data = read_from_shared_memory(f"scheduler_info_{main_pid}")
168
- port_args, server_args = deserialize_data(port_args_data, server_args_data)
169
- scheduler_info = scheduler_info_data
150
+ port_args, server_args, scheduler_info = read_from_shared_memory(
151
+ f"multi_tokenizer_args_{main_pid}"
152
+ )
153
+ server_args: ServerArgs
154
+
155
+ # API key authentication is not supported in multi-tokenizer mode
156
+ assert (
157
+ server_args.api_key is None
158
+ ), "API key is not supported in multi-tokenizer mode"
170
159
 
171
160
  port_args.tokenizer_ipc_name = (
172
161
  f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
173
162
  )
174
163
 
175
164
  # Launch multi-tokenizer manager process
176
- tokenizer_manager = MultiTokenizerManager(server_args, port_args)
165
+ tokenizer_manager = TokenizerWorker(server_args, port_args)
177
166
  template_manager = TemplateManager()
178
167
  template_manager.initialize_templates(
179
168
  tokenizer_manager=tokenizer_manager,
@@ -192,18 +181,29 @@ async def init_multi_tokenizer() -> ServerArgs:
192
181
  scheduler_info=scheduler_info,
193
182
  )
194
183
  )
184
+
185
+ if server_args.enable_trace:
186
+ process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
187
+ if server_args.disaggregation_mode == "null":
188
+ thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
189
+ trace_set_thread_info(thread_label)
190
+
195
191
  return server_args
196
192
 
197
193
 
198
194
  @asynccontextmanager
199
195
  async def lifespan(fast_api_app: FastAPI):
200
- server_args = getattr(fast_api_app, "server_args", None)
201
- if server_args is None:
196
+ if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
202
197
  # Initialize multi-tokenizer support for worker processes
203
- fast_api_app.server_args = await init_multi_tokenizer()
204
- setup_middlewares(
205
- fast_api_app.server_args.api_key, fast_api_app.server_args.enable_metrics
206
- )
198
+ fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
199
+
200
+ # only metrics middleware is supported in multi-tokenizer mode
201
+ worker_pid = os.getpid()
202
+ if fast_api_app.server_args.enable_metrics:
203
+ add_prometheus_middleware(app)
204
+ enable_func_timer()
205
+
206
+ logger.info(f"Worker {worker_pid} added prometheus middleware")
207
207
  fast_api_app.warmup_thread = threading.Thread(
208
208
  target=_wait_and_warmup,
209
209
  args=(
@@ -299,7 +299,23 @@ app.add_middleware(
299
299
 
300
300
  @app.exception_handler(HTTPException)
301
301
  async def validation_exception_handler(request: Request, exc: HTTPException):
302
- """Enrich HTTP exception with status code and other details"""
302
+ """Enrich HTTP exception with status code and other details.
303
+
304
+ For /v1/responses, emit OpenAI-style nested error envelope:
305
+ {"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
306
+ """
307
+ # adjust fmt for responses api
308
+ if request.url.path.startswith("/v1/responses"):
309
+ nested_error = {
310
+ "message": exc.detail,
311
+ "type": HTTPStatus(exc.status_code).phrase,
312
+ "param": None,
313
+ "code": exc.status_code,
314
+ }
315
+ return ORJSONResponse(
316
+ content={"error": nested_error}, status_code=exc.status_code
317
+ )
318
+
303
319
  error = ErrorResponse(
304
320
  object="error",
305
321
  message=exc.detail,
@@ -312,7 +328,10 @@ async def validation_exception_handler(request: Request, exc: HTTPException):
312
328
  # Custom exception handlers to change validation error status codes
313
329
  @app.exception_handler(RequestValidationError)
314
330
  async def validation_exception_handler(request: Request, exc: RequestValidationError):
315
- """Override FastAPI's default 422 validation error with 400"""
331
+ """Override FastAPI's default 422 validation error with 400.
332
+
333
+ For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
334
+ """
316
335
  exc_str = str(exc)
317
336
  errors_str = str(exc.errors())
318
337
 
@@ -321,6 +340,16 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
321
340
  else:
322
341
  message = exc_str
323
342
 
343
+ if request.url.path.startswith("/v1/responses"):
344
+ # adapt specially, for v1/responses API only (notice the error key is different)
345
+ nested_error = {
346
+ "message": message,
347
+ "type": HTTPStatus.BAD_REQUEST.phrase,
348
+ "param": None,
349
+ "code": HTTPStatus.BAD_REQUEST.value,
350
+ }
351
+ return ORJSONResponse(status_code=400, content={"error": nested_error})
352
+
324
353
  err = ErrorResponse(
325
354
  message=message,
326
355
  type=HTTPStatus.BAD_REQUEST.phrase,
@@ -681,6 +710,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
681
710
  )
682
711
 
683
712
 
713
@app.post("/init_weights_send_group_for_remote_instance")
async def init_weights_send_group_for_remote_instance(
    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
):
    """Delegate to the tokenizer manager method of the same name.

    The response body is ``{"success": ..., "message": ...}`` as returned by
    the tokenizer manager. The HTTP status is 200 when ``success`` is truthy
    and ``HTTPStatus.BAD_REQUEST`` otherwise.
    """
    manager = _global_state.tokenizer_manager
    ok, detail = await manager.init_weights_send_group_for_remote_instance(
        obj, request
    )
    status = 200 if ok else HTTPStatus.BAD_REQUEST
    return ORJSONResponse({"success": ok, "message": detail}, status_code=status)
727
+
728
+
729
@app.post("/send_weights_to_remote_instance")
async def send_weights_to_remote_instance(
    obj: SendWeightsToRemoteInstanceReqInput, request: Request
):
    """Delegate to the tokenizer manager method of the same name.

    The response body is ``{"success": ..., "message": ...}`` as returned by
    the tokenizer manager. The HTTP status is 200 when ``success`` is truthy
    and ``HTTPStatus.BAD_REQUEST`` otherwise.
    """
    manager = _global_state.tokenizer_manager
    ok, detail = await manager.send_weights_to_remote_instance(obj, request)
    status = 200 if ok else HTTPStatus.BAD_REQUEST
    return ORJSONResponse({"success": ok, "message": detail}, status_code=status)
743
+
744
+
684
745
  @app.post("/init_weights_update_group")
685
746
  async def init_weights_update_group(
686
747
  obj: InitWeightsUpdateGroupReqInput, request: Request
@@ -696,6 +757,20 @@ async def init_weights_update_group(
696
757
  return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
697
758
 
698
759
 
760
@app.post("/destroy_weights_update_group")
async def destroy_weights_update_group(
    obj: DestroyWeightsUpdateGroupReqInput, request: Request
):
    """Destroy the parameter update group.

    Delegates to the tokenizer manager and echoes its ``(success, message)``
    result as JSON. The HTTP status is 200 on success and
    ``HTTPStatus.BAD_REQUEST`` otherwise.
    """
    manager = _global_state.tokenizer_manager
    ok, detail = await manager.destroy_weights_update_group(obj, request)
    payload = {"success": ok, "message": detail}
    if ok:
        return ORJSONResponse(payload, status_code=200)
    return ORJSONResponse(payload, status_code=HTTPStatus.BAD_REQUEST)
772
+
773
+
699
774
  @app.post("/update_weights_from_tensor")
700
775
  async def update_weights_from_tensor(
701
776
  obj: UpdateWeightsFromTensorReqInput, request: Request
@@ -1168,7 +1243,6 @@ def launch_server(
1168
1243
  2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
1169
1244
  """
1170
1245
  if server_args.tokenizer_worker_num > 1:
1171
- setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
1172
1246
  port_args = PortArgs.init_new(server_args)
1173
1247
  port_args.tokenizer_worker_ipc_name = (
1174
1248
  f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -1177,11 +1251,16 @@ def launch_server(
1177
1251
  server_args=server_args, port_args=port_args
1178
1252
  )
1179
1253
  else:
1180
- setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
1181
1254
  tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
1182
1255
  server_args=server_args,
1183
1256
  )
1184
1257
 
1258
+ if server_args.enable_trace:
1259
+ process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
1260
+ if server_args.disaggregation_mode == "null":
1261
+ thread_label = "Tokenizer"
1262
+ trace_set_thread_info(thread_label)
1263
+
1185
1264
  set_global_state(
1186
1265
  _GlobalState(
1187
1266
  tokenizer_manager=tokenizer_manager,
@@ -1191,12 +1270,10 @@ def launch_server(
1191
1270
  )
1192
1271
 
1193
1272
  if server_args.tokenizer_worker_num > 1:
1194
- port_args_shm, server_args_shm, scheduler_info_shm = (
1195
- write_data_for_multi_tokenizer(
1196
- port_args,
1197
- server_args,
1198
- scheduler_info,
1199
- )
1273
+ multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
1274
+ port_args,
1275
+ server_args,
1276
+ scheduler_info,
1200
1277
  )
1201
1278
  else:
1202
1279
  # Add api key authorization
@@ -1233,6 +1310,9 @@ def launch_server(
1233
1310
  "level": "INFO",
1234
1311
  "propagate": False,
1235
1312
  }
1313
+
1314
+ monkey_patch_uvicorn_multiprocessing()
1315
+
1236
1316
  uvicorn.run(
1237
1317
  "sglang.srt.entrypoints.http_server:app",
1238
1318
  host=server_args.host,
@@ -1243,6 +1323,7 @@ def launch_server(
1243
1323
  workers=server_args.tokenizer_worker_num,
1244
1324
  )
1245
1325
  else:
1326
+ app.is_single_tokenizer_mode = True
1246
1327
  uvicorn.run(
1247
1328
  app,
1248
1329
  host=server_args.host,
@@ -1253,10 +1334,8 @@ def launch_server(
1253
1334
  )
1254
1335
  finally:
1255
1336
  if server_args.tokenizer_worker_num > 1:
1256
- port_args_shm.unlink()
1257
- server_args_shm.unlink()
1258
- scheduler_info_shm.unlink()
1259
- _global_state.tokenizer_manager.clear_tokenizer_mapping()
1337
+ multi_tokenizer_args_shm.unlink()
1338
+ _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
1260
1339
  else:
1261
1340
  warmup_thread.join()
1262
1341
 
@@ -1405,13 +1484,5 @@ def _wait_and_warmup(
1405
1484
  if server_args.debug_tensor_dump_input_file:
1406
1485
  kill_process_tree(os.getpid())
1407
1486
 
1408
- if server_args.pdlb_url is not None:
1409
- register_disaggregation_server(
1410
- server_args.disaggregation_mode,
1411
- server_args.port,
1412
- server_args.disaggregation_bootstrap_port,
1413
- server_args.pdlb_url,
1414
- )
1415
-
1416
1487
  if launch_callback is not None:
1417
1488
  launch_callback()
@@ -16,12 +16,14 @@
16
16
  import time
17
17
  import uuid
18
18
  from dataclasses import dataclass
19
- from typing import Any, Dict, List, Optional, TypeAlias, Union
19
+ from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union
20
20
 
21
21
  from openai.types.responses import (
22
22
  ResponseFunctionToolCall,
23
23
  ResponseInputItemParam,
24
24
  ResponseOutputItem,
25
+ ResponseOutputMessage,
26
+ ResponseOutputText,
25
27
  ResponseReasoningItem,
26
28
  )
27
29
  from openai.types.responses.response import ToolChoice
@@ -228,6 +230,15 @@ class CompletionRequest(BaseModel):
228
230
 
229
231
  # For request id
230
232
  rid: Optional[Union[List[str], str]] = None
233
+ # Extra key for classifying the request (e.g. cache_salt)
234
+ extra_key: Optional[Union[List[str], str]] = None
235
+ # Cache salt for request caching
236
+ cache_salt: Optional[Union[List[str], str]] = None
237
+ # Priority for the request
238
+ priority: Optional[int] = None
239
+
240
+ # For custom metric labels
241
+ custom_labels: Optional[Dict[str, str]] = None
231
242
 
232
243
  @field_validator("max_tokens")
233
244
  @classmethod
@@ -334,7 +345,7 @@ class FunctionResponse(BaseModel):
334
345
  """Function response."""
335
346
 
336
347
  name: Optional[str] = None
337
- arguments: Optional[str] = None
348
+ arguments: Optional[str | Dict[str, Any]] = None
338
349
 
339
350
 
340
351
  class ToolCall(BaseModel):
@@ -383,7 +394,7 @@ class Function(BaseModel):
383
394
  """Function descriptions."""
384
395
 
385
396
  description: Optional[str] = Field(default=None, examples=[None])
386
- name: Optional[str] = None
397
+ name: str
387
398
  parameters: Optional[object] = None
388
399
  strict: bool = False
389
400
 
@@ -447,7 +458,7 @@ class ChatCompletionRequest(BaseModel):
447
458
  description="Constrains effort on reasoning for reasoning models. "
448
459
  "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
449
460
  "result in faster responses and fewer tokens used on reasoning in a response. "
450
- "Currently only supported for OpenAI models.",
461
+ "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
451
462
  )
452
463
 
453
464
  @model_validator(mode="before")
@@ -540,6 +551,12 @@ class ChatCompletionRequest(BaseModel):
540
551
 
541
552
  # For request id
542
553
  rid: Optional[Union[List[str], str]] = None
554
+ # Extra key for classifying the request (e.g. cache_salt)
555
+ extra_key: Optional[Union[List[str], str]] = None
556
+ # Cache salt for request caching
557
+ cache_salt: Optional[Union[List[str], str]] = None
558
+ # Priority for the request
559
+ priority: Optional[int] = None
543
560
 
544
561
  # For PD disaggregation
545
562
  bootstrap_host: Optional[Union[List[str], str]] = None
@@ -641,6 +658,8 @@ class EmbeddingRequest(BaseModel):
641
658
 
642
659
  # The request id.
643
660
  rid: Optional[Union[List[str], str]] = None
661
+ # Priority for the request
662
+ priority: Optional[int] = None
644
663
 
645
664
 
646
665
  class EmbeddingObject(BaseModel):
@@ -769,6 +788,13 @@ class ResponsesRequest(BaseModel):
769
788
  description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
770
789
  )
771
790
  priority: int = Field(default=0, description="Request priority")
791
+ extra_key: Optional[str] = Field(
792
+ default=None,
793
+ description="Extra key for classifying the request (e.g. cache_salt)",
794
+ )
795
+ cache_salt: Optional[str] = Field(
796
+ default=None, description="Cache salt for request caching"
797
+ )
772
798
 
773
799
  # SGLang-specific sampling parameters
774
800
  frequency_penalty: float = 0.0
@@ -857,6 +883,26 @@ class ResponsesResponse(BaseModel):
857
883
  tool_choice: str = "auto"
858
884
  tools: List[ResponseTool] = Field(default_factory=list)
859
885
 
886
+ # OpenAI compatibility fields. not all are used at the moment.
887
+ # Recommend checking https://platform.openai.com/docs/api-reference/responses
888
+ error: Optional[dict] = None
889
+ incomplete_details: Optional[dict] = None # TODO(v) support this input
890
+ instructions: Optional[str] = None
891
+ max_output_tokens: Optional[int] = None
892
+ previous_response_id: Optional[str] = None
893
+ reasoning: Optional[dict] = (
894
+ # Unused. No model supports this. For GPT-oss, system prompt sets
895
+ # the field, not server args.
896
+ None # {"effort": Optional[str], "summary": Optional[str]}
897
+ )
898
+ store: Optional[bool] = None
899
+ temperature: Optional[float] = None
900
+ text: Optional[dict] = None # e.g. {"format": {"type": "text"}}
901
+ top_p: Optional[float] = None
902
+ truncation: Optional[str] = None
903
+ user: Optional[str] = None
904
+ metadata: Optional[Dict[str, Any]] = None
905
+
860
906
  @classmethod
861
907
  def from_request(
862
908
  cls,
@@ -871,6 +917,41 @@ class ResponsesResponse(BaseModel):
871
917
  usage: Optional[UsageInfo],
872
918
  ) -> "ResponsesResponse":
873
919
  """Create a response from a request."""
920
+
921
+ # Determine if the output is plain text only to set text.format
922
def _is_text_only(
    items: List[
        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
    ]
) -> bool:
    """Return True only when ``items`` is non-empty and purely textual.

    An item counts as text when it is a ``ResponseOutputText``, or a
    ``ResponseOutputMessage`` whose content is empty or made up solely of
    ``ResponseOutputText`` parts. Reasoning items, function tool calls and
    any other type make the whole output non-text.
    """
    if not items:
        return False
    for item in items:
        # Reasoning items and tool calls are never plain text.
        if isinstance(item, (ResponseReasoningItem, ResponseFunctionToolCall)):
            return False
        try:
            if isinstance(item, ResponseOutputText):
                continue
            if not isinstance(item, ResponseOutputMessage):
                # Unknown type, not considered text-only
                return False
            parts = item.content
            if parts and not all(isinstance(p, ResponseOutputText) for p in parts):
                return False
        except AttributeError:
            return False
    return True
952
+
953
+ text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
954
+
874
955
  return cls(
875
956
  id=request.request_id,
876
957
  created_at=created_time,
@@ -881,6 +962,23 @@ class ResponsesResponse(BaseModel):
881
962
  parallel_tool_calls=request.parallel_tool_calls or True,
882
963
  tool_choice=request.tool_choice,
883
964
  tools=request.tools,
965
+ # fields for parity with v1/responses
966
+ error=None,
967
+ incomplete_details=None,
968
+ instructions=request.instructions,
969
+ max_output_tokens=request.max_output_tokens,
970
+ previous_response_id=request.previous_response_id, # TODO(v): ensure this is propagated if retrieved from store
971
+ reasoning={
972
+ "effort": request.reasoning.effort if request.reasoning else None,
973
+ "summary": None, # unused
974
+ },
975
+ store=request.store,
976
+ temperature=request.temperature,
977
+ text=text_format, # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
978
+ top_p=request.top_p,
979
+ truncation=request.truncation,
980
+ user=request.user,
981
+ metadata=request.metadata or {},
884
982
  )
885
983
 
886
984
 
@@ -919,6 +1017,16 @@ class MessageProcessingResult:
919
1017
  tool_call_constraint: Optional[Any] = None
920
1018
 
921
1019
 
1020
class ToolCallProcessingResult(NamedTuple):
    """Result of processing tool calls in a response.

    Attributes:
        tool_calls: List of ToolCall objects, or None if parsing failed.
        remaining_text: Text remaining after parsing tool calls.
        finish_reason: Updated finish reason dictionary.
    """

    tool_calls: Optional[List[Any]]
    remaining_text: str
    finish_reason: Dict[str, Any]
1028
+
1029
+
922
1030
  class ResponseReasoningTextContent(BaseModel):
923
1031
  text: str
924
1032
  type: Literal["reasoning_text"] = "reasoning_text"
@@ -1,15 +1,20 @@
1
+ from __future__ import annotations
2
+
1
3
  import json
2
4
  import logging
3
5
  import uuid
4
6
  from abc import ABC, abstractmethod
5
- from typing import Any, Optional, Union
7
+ from typing import TYPE_CHECKING, Any, Optional, Union
6
8
 
7
9
  from fastapi import HTTPException, Request
8
10
  from fastapi.responses import ORJSONResponse, StreamingResponse
9
11
 
10
12
  from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
11
13
  from sglang.srt.managers.io_struct import GenerateReqInput
12
- from sglang.srt.managers.tokenizer_manager import TokenizerManager
14
+ from sglang.srt.server_args import ServerArgs
15
+
16
+ if TYPE_CHECKING:
17
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
13
18
 
14
19
  logger = logging.getLogger(__name__)
15
20
 
@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):
20
25
 
21
26
  def __init__(self, tokenizer_manager: TokenizerManager):
22
27
  self.tokenizer_manager = tokenizer_manager
28
+ self.allowed_custom_labels = (
29
+ set(
30
+ self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
31
+ )
32
+ if isinstance(self.tokenizer_manager.server_args, ServerArgs)
33
+ and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
34
+ else None
35
+ )
23
36
 
24
37
  async def handle_request(
25
38
  self, request: OpenAIServingRequest, raw_request: Request
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):
33
46
 
34
47
  # Convert to internal format
35
48
  adapted_request, processed_request = self._convert_to_internal_request(
36
- request
49
+ request, raw_request
37
50
  )
38
51
 
39
52
  # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
@@ -49,6 +62,12 @@ class OpenAIServingBase(ABC):
49
62
  return self.create_error_response(
50
63
  message=e.detail, err_type=str(e.status_code), status_code=e.status_code
51
64
  )
65
+ except ValueError as e:
66
+ return self.create_error_response(
67
+ message=str(e),
68
+ err_type="BadRequest",
69
+ status_code=400,
70
+ )
52
71
  except Exception as e:
53
72
  logger.exception(f"Error in request: {e}")
54
73
  return self.create_error_response(
@@ -73,10 +92,24 @@ class OpenAIServingBase(ABC):
73
92
 
74
93
  return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
75
94
 
95
+ def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
96
+ """Compute the final extra_key by concatenating cache_salt and extra_key if both are provided."""
97
+ parts = []
98
+ for key in ["cache_salt", "extra_key"]:
99
+ value = getattr(request, key, None)
100
+ if value:
101
+ if not isinstance(value, str):
102
+ raise TypeError(
103
+ f"Value of {key} must be a string, but got {type(value).__name__}"
104
+ )
105
+ parts.append(value)
106
+ return "".join(parts) if parts else None
107
+
76
108
  @abstractmethod
77
109
  def _convert_to_internal_request(
78
110
  self,
79
111
  request: OpenAIServingRequest,
112
+ raw_request: Request = None,
80
113
  ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
81
114
  """Convert OpenAI request to internal format"""
82
115
  pass
@@ -150,3 +183,32 @@ class OpenAIServingBase(ABC):
150
183
  code=status_code,
151
184
  )
152
185
  return json.dumps({"error": error.model_dump()})
186
+
187
def extract_custom_labels(self, raw_request):
    """Read custom metric labels from the configured request header.

    The header name comes from
    ``server_args.tokenizer_metrics_custom_labels_header``. Its value is
    parsed as JSON, and only keys present in ``self.allowed_custom_labels``
    are kept.

    Returns:
        A dict of the allowed labels (possibly empty), or None when no
        labels are allowed, no header name is configured, the request or
        header is missing or empty, the header is not valid JSON, or the
        decoded JSON is not an object.
    """
    if not self.allowed_custom_labels:
        return None
    header = self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
    if not header:
        return None

    header_value = raw_request.headers.get(header) if raw_request else None
    if not header_value:
        return None

    try:
        parsed = json.loads(header_value)
    except json.JSONDecodeError as e:
        logger.exception(f"Error in request: {e}")
        return None

    if not isinstance(parsed, dict):
        return None
    return {
        label: value
        for label, value in parsed.items()
        if label in self.allowed_custom_labels
    }