sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/cache_controller.py

@@ -14,15 +14,17 @@ limitations under the License.
 """
 
 import logging
-import math
 import threading
 import time
-from queue import Empty, Full, PriorityQueue, Queue
-from typing import TYPE_CHECKING, List, NamedTuple, Optional, Set, Tuple
+from queue import Empty, Full, Queue
+from typing import TYPE_CHECKING, List, NamedTuple, Optional
 
 import torch
 
-from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
 
 if TYPE_CHECKING:
     from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
@@ -38,7 +40,7 @@ from sglang.srt.layers.dp_attention import (
     get_attention_tp_size,
     is_dp_attention_enabled,
 )
-from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPool
+from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
 
 logger = logging.getLogger(__name__)
 
@@ -191,12 +193,14 @@ class StorageOperation:
         token_ids: List[int],
         last_hash: Optional[str] = None,
         hash_value: Optional[List[str]] = None,
+        prefix_keys: Optional[List[str]] = None,
     ):
         self.host_indices = host_indices
         self.token_ids = token_ids
         self.last_hash = last_hash
         self.completed_tokens = 0
         self.hash_value = hash_value if hash_value is not None else []
+        self.prefix_keys = prefix_keys
 
         self.id = StorageOperation.counter
         StorageOperation.counter += 1
@@ -212,6 +216,7 @@ class PrefetchOperation(StorageOperation):
         host_indices: torch.Tensor,
         token_ids: List[int],
         last_hash: Optional[str] = None,
+        prefix_keys: Optional[List[str]] = None,
     ):
         self.request_id = request_id
 
@@ -219,7 +224,7 @@ class PrefetchOperation(StorageOperation):
         self._terminated_flag = False
         self.start_time = time.monotonic()
 
-        super().__init__(host_indices, token_ids, last_hash)
+        super().__init__(host_indices, token_ids, last_hash, prefix_keys=prefix_keys)
 
     def increment(self, num_tokens: int):
         with self._lock:
@@ -550,12 +555,13 @@ class HiCacheController:
         host_indices: torch.Tensor,
         new_input_tokens: List[int],
         last_hash: Optional[str] = None,
+        prefix_keys: Optional[List[str]] = None,
     ) -> PrefetchOperation:
         """
         Prefetch KV caches from storage backend to host memory.
         """
         operation = PrefetchOperation(
-            request_id, host_indices, new_input_tokens, last_hash
+            request_id, host_indices, new_input_tokens, last_hash, prefix_keys
         )
         self.prefetch_queue.put(operation)
         return operation
@@ -571,8 +577,12 @@ class HiCacheController:
         for page in pages:
             self.host_mem_release_queue.put(page)
 
-    def _page_get_zero_copy(self, operation, hash_values, host_indices):
-        results = self.storage_backend.batch_get_v1(hash_values, host_indices)
+    def _page_get_zero_copy(
+        self, operation, hash_values, host_indices, extra_info=None
+    ):
+        results = self.storage_backend.batch_get_v1(
+            hash_values, host_indices, extra_info
+        )
         inc = 0
         for i in range(len(hash_values)):
             if not results[i]:
@@ -584,7 +594,7 @@ class HiCacheController:
         operation.increment(inc)
 
     # todo: deprecate
-    def _generic_page_get(self, operation, hash_values, host_indices):
+    def _generic_page_get(self, operation, hash_values, host_indices, extra_info=None):
         dummy_page_dst = [
             self.mem_pool_host.get_dummy_flat_data_page() for _ in hash_values
         ]
@@ -608,6 +618,7 @@ class HiCacheController:
 
     def _page_transfer(self, operation):
         # Transfer batch by batch
+        prefix_keys = operation.prefix_keys
         for i in range(0, len(operation.hash_value), self.storage_batch_size):
             batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
             batch_host_indices = operation.host_indices[
@@ -615,7 +626,8 @@
             ]
             prev_completed_tokens = operation.completed_tokens
             # Get one batch token, and update the completed_tokens if succeed
-            self.page_get_func(operation, batch_hashes, batch_host_indices)
+            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
+            self.page_get_func(operation, batch_hashes, batch_host_indices, extra_info)
             # Check termination
             if (
                 operation.completed_tokens
@@ -623,6 +635,10 @@
             ):
                 operation.mark_terminate()
                 break  # Some operations fail or operation terminated by controller
+
+            if prefix_keys and len(prefix_keys) > 0:
+                prefix_keys += batch_hashes
+
         # release pre-allocated memory
         self.append_host_mem_release(
             operation.host_indices[operation.completed_tokens :]
@@ -656,6 +672,7 @@
     def _storage_hit_query(self, operation) -> tuple[list[str], int]:
         last_hash = operation.last_hash
         tokens_to_fetch = operation.token_ids
+        prefix_keys = operation.prefix_keys.copy() if operation.prefix_keys else None
 
         storage_query_count = 0
         hash_value = []
@@ -673,11 +690,15 @@
                     batch_tokens[i : i + self.page_size], last_hash
                 )
                 batch_hashes.append(last_hash)
-            hit_page_num = self.storage_backend.batch_exists(batch_hashes)
+            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
+            hit_page_num = self.storage_backend.batch_exists(batch_hashes, extra_info)
             hash_value.extend(batch_hashes[:hit_page_num])
             storage_query_count += hit_page_num * self.page_size
             if hit_page_num < len(batch_hashes):
                 break
+            if prefix_keys and len(prefix_keys) > 0:
+                prefix_keys += batch_hashes
+
         return hash_value, storage_query_count
 
     def prefetch_thread_func(self):
@@ -734,28 +755,34 @@
         host_indices: torch.Tensor,
         token_ids: List[int],
         hash_value: Optional[List[str]] = None,
+        prefix_keys: Optional[List[str]] = None,
     ) -> int:
         """
         Write KV caches from host memory to storage backend.
         """
-        operation = StorageOperation(host_indices, token_ids, hash_value=hash_value)
+        operation = StorageOperation(
+            host_indices, token_ids, hash_value=hash_value, prefix_keys=prefix_keys
+        )
         self.backup_queue.put(operation)
         return operation.id
 
     # todo: deprecate
-    def _generic_page_set(self, hash_values, host_indices) -> bool:
+    def _generic_page_set(self, hash_values, host_indices, extra_info=None) -> bool:
         data = [
             self.mem_pool_host.get_data_page(host_indices[i * self.page_size])
             for i in range(len(hash_values))
         ]
         return self.storage_backend.batch_set(hash_values, data)
 
-    def _page_set_zero_copy(self, hash_values, host_indices) -> bool:
-        return all(self.storage_backend.batch_set_v1(hash_values, host_indices))
+    def _page_set_zero_copy(self, hash_values, host_indices, extra_info=None) -> bool:
+        return all(
+            self.storage_backend.batch_set_v1(hash_values, host_indices, extra_info)
+        )
 
     # Backup batch by batch
     def _page_backup(self, operation):
         # Backup batch by batch
+        prefix_keys = operation.prefix_keys
        for i in range(0, len(operation.hash_value), self.storage_batch_size):
             batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
             batch_host_indices = operation.host_indices[
@@ -763,12 +790,16 @@
             ]
             # Set one batch token, and record if success.
             # todo: allow partial success
-            success = self.page_set_func(batch_hashes, batch_host_indices)
+            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
+            success = self.page_set_func(batch_hashes, batch_host_indices, extra_info)
             if not success:
                 logger.warning(
                     f"Write page to storage: {len(batch_hashes)} pages failed."
                 )
                 break
+
+            if prefix_keys and len(prefix_keys) > 0:
+                prefix_keys += batch_hashes
             operation.completed_tokens += self.page_size * len(batch_hashes)
 
     def backup_thread_func(self):
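
Both `_page_transfer` and `_page_backup` above follow the same new pattern: each storage batch is sent together with the hashes of all pages that precede it in the request, packed into a `HiCacheStorageExtraInfo`, and the prefix list is extended with `batch_hashes` only once the batch has succeeded. A minimal, self-contained sketch of that accumulation (the `FakeBackend` here is a hypothetical stand-in for the real storage backend, not part of sglang):

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class HiCacheStorageExtraInfo:
        # Mirrors the extra-info argument now threaded through the batch calls.
        prefix_keys: Optional[List[str]] = None

    class FakeBackend:
        # Hypothetical stand-in: accepts every page, records the prefixes it saw.
        def __init__(self):
            self.seen_prefixes = []

        def batch_set_v1(self, hashes, indices, extra_info):
            self.seen_prefixes.append(list(extra_info.prefix_keys or []))
            return [True] * len(hashes)

    def page_backup(hash_values, host_indices, batch_size, backend, prefix_keys):
        completed = 0
        for i in range(0, len(hash_values), batch_size):
            batch_hashes = hash_values[i : i + batch_size]
            batch_indices = host_indices[i : i + batch_size]
            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
            if not all(backend.batch_set_v1(batch_hashes, batch_indices, extra_info)):
                break
            # The new hashes join the prefix only after the batch succeeds, so
            # the next batch sees the full chain of pages written before it.
            if prefix_keys:
                prefix_keys += batch_hashes
            completed += len(batch_hashes)
        return completed

    backend = FakeBackend()
    page_backup(["h1", "h2", "h3", "h4"], list(range(4)), 2, backend, ["root"])
    assert backend.seen_prefixes == [["root"], ["root", "h1", "h2"]]

Note that when `prefix_keys` is empty or `None` the list never grows, matching the `if prefix_keys and len(prefix_keys) > 0` guard in the hunks above.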
sglang/srt/managers/data_parallel_controller.py

@@ -21,7 +21,7 @@ import threading
 import time
 from collections import deque
 from enum import Enum, auto
-from typing import List
+from typing import List, Optional
 
 import psutil
 import setproctitle
@@ -36,14 +36,19 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.schedule_batch import Req
 from sglang.srt.managers.scheduler import run_scheduler_process
-from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.server_args import (
+    DP_ATTENTION_HANDSHAKE_PORT_DELTA,
+    PortArgs,
+    ServerArgs,
+)
 from sglang.srt.utils import (
     bind_port,
     configure_logger,
     get_zmq_socket,
     kill_itself_when_parent_died,
+    maybe_reindex_device_id,
 )
+from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.utils import TypeBasedDispatcher, get_exception_traceback
 
 logger = logging.getLogger(__name__)
@@ -135,27 +140,20 @@ class DataParallelController:
         # Load balance budget
         self.dp_budget = DPBudget()
 
+        # To protect changing env vars to set CUDA_VISIBLE_DEVICES.
+        self.env_lock = threading.Lock()
+
         # Launch data parallel workers
         self.scheduler_procs = []
         self.workers: List[zmq.Socket] = [None] * server_args.dp_size
 
         if server_args.enable_dp_attention:
-            dp_port_args = self.launch_dp_attention_schedulers(server_args, port_args)
+            self.launch_dp_attention_schedulers(server_args, port_args)
             self.control_message_step = server_args.tp_size
         else:
-            dp_port_args = self.launch_dp_schedulers(server_args, port_args)
+            self.launch_dp_schedulers(server_args, port_args)
             self.control_message_step = 1
 
-        # Only node rank 0 runs the real data parallel controller that dispatches the requests.
-        if server_args.node_rank == 0:
-            for dp_rank in range(server_args.dp_size):
-                self.workers[dp_rank] = get_zmq_socket(
-                    self.context,
-                    zmq.PUSH,
-                    dp_port_args[dp_rank].scheduler_input_ipc_name,
-                    True,
-                )
-
         self.max_req_input_len = None
 
         self.init_dispatcher()
@@ -188,13 +186,11 @@ class DataParallelController:
 
         threads = []
         sockets = []
-        dp_port_args = []
         ready_events = []
         for dp_rank in range(server_args.dp_size):
             tmp_port_args = PortArgs.init_new(server_args)
             tmp_port_args.tokenizer_ipc_name = port_args.tokenizer_ipc_name
             tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
-            dp_port_args.append(tmp_port_args)
 
             # This port is checked free in PortArgs.init_new.
             # We hold it first so that the next dp worker gets a different port
@@ -213,6 +209,14 @@ class DataParallelController:
                 server_args.tp_size * server_args.pp_size * server_args.gpu_id_step
             )
 
+            if server_args.node_rank == 0:
+                self.workers[dp_rank] = get_zmq_socket(
+                    self.context,
+                    zmq.PUSH,
+                    tmp_port_args.scheduler_input_ipc_name,
+                    True,
+                )
+
         # Free all sockets before starting the threads to launch TP workers
         for sock in sockets:
             sock.close()
@@ -223,8 +227,6 @@ class DataParallelController:
         for event in ready_events:
             event.wait()
 
-        return dp_port_args
-
     def launch_tensor_parallel_group_thread(
         self,
         server_args: ServerArgs,
@@ -241,19 +243,115 @@ class DataParallelController:
         while True:
             time.sleep(30 * 24 * 3600)
 
-    def launch_dp_attention_schedulers(self, server_args, port_args):
-        self.launch_tensor_parallel_group(server_args, port_args, 0, None)
-        dp_port_args = []
-        for dp_rank in range(server_args.dp_size):
-            dp_port_args.append(PortArgs.init_new(server_args, dp_rank))
-        return dp_port_args
+    def _broadcast_worker_ports(
+        self, server_args: ServerArgs, worker_ports: Optional[List[int]] = None
+    ) -> List[int]:
+        """Broadcast worker ports from node 0 to all other nodes.
+
+        Node 0 acts as the server, waiting for all other nodes to connect and
+        sending them the pre-allocated worker ports. Other nodes act as clients,
+        connecting to node 0 to receive their copy of the worker ports.
+
+        Args:
+            server_args: Server arguments containing node configuration.
+            worker_ports: Pre-allocated worker ports to broadcast.
+
+        Returns:
+            List of worker ports (same on all nodes after broadcast).
+        """
+        # Determine the endpoint for inter-node communication
+        if server_args.dist_init_addr is None:
+            endpoint = f"tcp://127.0.0.1:{server_args.port + DP_ATTENTION_HANDSHAKE_PORT_DELTA}"
+        else:
+            endpoint = f"tcp://{server_args.dist_init_addr}"
+
+        if server_args.node_rank == 0:
+            # Node 0: Broadcast worker ports to all other nodes
+            return self._broadcast_ports_as_server(
+                endpoint, server_args.nnodes - 1, worker_ports
+            )
+        else:
+            # Other nodes: Receive worker ports from node 0
+            return self._receive_ports_as_client(endpoint, server_args.node_rank)
+
+    def _broadcast_ports_as_server(
+        self, endpoint: str, expected_clients: int, worker_ports: List[int]
+    ) -> List[int]:
+        """Broadcast worker ports to all client nodes."""
+        logger.debug(f"Broadcasting worker ports to {expected_clients} client nodes")
+        logger.debug(f"Worker ports: {worker_ports}")
+
+        rep_socket = get_zmq_socket(self.context, zmq.REP, endpoint, True)
+
+        try:
+            connected_clients = 0
+            while connected_clients < expected_clients:
+                # Wait for client handshake
+                client_rank = rep_socket.recv().decode()
+                logger.debug(f"Received handshake from node {client_rank}")
+
+                # Send worker ports to client
+                rep_socket.send_pyobj(worker_ports)
+                connected_clients += 1
+                logger.debug(
+                    f"Sent worker ports to {connected_clients}/{expected_clients} nodes"
+                )
+
+            logger.debug("Worker port broadcast completed")
+            return worker_ports
+        finally:
+            rep_socket.close()
+
+    def _receive_ports_as_client(self, endpoint: str, node_rank: int) -> List[int]:
+        """Receive worker ports from the server node."""
+        logger.debug(f"Connecting to node 0 to receive worker ports")
+
+        req_socket = get_zmq_socket(self.context, zmq.REQ, endpoint, False)
+        req_socket.setsockopt(zmq.RCVTIMEO, 60 * 1000)  # 1 minute timeout
+        req_socket.setsockopt(zmq.SNDTIMEO, 60 * 1000)
+
+        try:
+            # Send handshake with our node rank
+            req_socket.send(str(node_rank).encode())
+
+            # Receive worker ports
+            worker_ports = req_socket.recv_pyobj()
+            logger.debug(f"Received {len(worker_ports)} worker ports from node 0")
+            return worker_ports
+        except zmq.Again:
+            logger.error("Timeout waiting for worker ports from node 0")
+            raise RuntimeError(
+                "Failed to receive worker ports from node 0 within timeout"
+            )
+        finally:
+            req_socket.close()
+
+    def launch_dp_attention_schedulers(
+        self, server_args: ServerArgs, port_args: PortArgs
+    ):
+        # Pre-allocate worker ports on node 0 to avoid conflicts
+        worker_ports = []
+        if server_args.node_rank == 0:
+            for dp_rank in range(server_args.dp_size):
+                port_and_socket = get_zmq_socket(self.context, zmq.PUSH)
+                worker_ports.append(port_and_socket[0])
+                self.workers[dp_rank] = port_and_socket[1]
+                logger.debug(f"Assigned port {port_and_socket[0]} to worker {dp_rank}")
+
+        broadcasted_ports = self._broadcast_worker_ports(
+            server_args, worker_ports if worker_ports else None
+        )
+        self.launch_tensor_parallel_group(
+            server_args, port_args, 0, None, broadcasted_ports
+        )
 
     def launch_tensor_parallel_group(
         self,
         server_args: ServerArgs,
         port_args: PortArgs,
         base_gpu_id: int,
-        dp_rank: int,
+        dp_rank: Optional[int],
+        worker_ports: Optional[List[int]] = None,
     ):
         if not server_args.enable_dp_attention:
             logger.info(f"Launch DP{dp_rank} starting at GPU #{base_gpu_id}.")
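
The docstring above describes a plain ZeroMQ REQ/REP handshake: each non-zero node sends its rank, and node 0 replies with the pickled list of pre-allocated worker ports. A self-contained sketch of the same exchange in raw pyzmq (single process, threads standing in for nodes; the endpoint and port numbers are made up for the example):

    import threading
    import zmq

    ENDPOINT = "tcp://127.0.0.1:5757"  # assumed free port, analogous to dist_init_addr
    WORKER_PORTS = [6001, 6002, 6003, 6004]  # illustrative pre-allocated ports
    NUM_CLIENTS = 2  # nnodes - 1

    def node0(ctx):
        # Server side: answer one REQ per client with the full port list.
        sock = ctx.socket(zmq.REP)
        sock.bind(ENDPOINT)
        for _ in range(NUM_CLIENTS):
            rank = sock.recv().decode()  # handshake: the client's node rank
            sock.send_pyobj(WORKER_PORTS)
        sock.close()

    def other_node(ctx, rank):
        # Client side: send our rank, receive the broadcast ports.
        sock = ctx.socket(zmq.REQ)
        sock.setsockopt(zmq.RCVTIMEO, 60 * 1000)  # fail loudly instead of hanging
        sock.connect(ENDPOINT)
        sock.send(str(rank).encode())
        assert sock.recv_pyobj() == WORKER_PORTS
        sock.close()

    ctx = zmq.Context()
    server = threading.Thread(target=node0, args=(ctx,))
    server.start()
    for r in (1, 2):
        other_node(ctx, r)
    server.join()
    ctx.term()

Because a REP socket answers requests strictly in arrival order, node 0 needs no per-client bookkeeping beyond counting handshakes, which is exactly what `_broadcast_ports_as_server` does.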
@@ -290,7 +388,9 @@
                     server_args.dp_size,
                 )
                 # compute zmq ports for this dp rank
-                rank_port_args = PortArgs.init_new(server_args, dp_rank)
+                rank_port_args = PortArgs.init_new(
+                    server_args, dp_rank, worker_ports
+                )
                 # Data parallelism reuses the tensor parallelism group,
                 # so all dp ranks should use the same nccl port.
                 rank_port_args.nccl_port = port_args.nccl_port
@@ -303,21 +403,22 @@
                     + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
                 )
                 moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
-                proc = mp.Process(
-                    target=run_scheduler_process,
-                    args=(
-                        server_args,
-                        rank_port_args,
-                        gpu_id,
-                        tp_rank,
-                        moe_ep_rank,
-                        pp_rank,
-                        dp_rank,
-                        writer,
-                    ),
-                )
-                with memory_saver_adapter.configure_subprocess():
-                    proc.start()
+                with self.env_lock, maybe_reindex_device_id(gpu_id) as gpu_id:
+                    proc = mp.Process(
+                        target=run_scheduler_process,
+                        args=(
+                            server_args,
+                            rank_port_args,
+                            gpu_id,
+                            tp_rank,
+                            moe_ep_rank,
+                            pp_rank,
+                            dp_rank,
+                            writer,
+                        ),
+                    )
+                    with memory_saver_adapter.configure_subprocess():
+                        proc.start()
                 self.scheduler_procs.append(proc)
                 scheduler_pipe_readers.append(reader)
 
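
The new `with self.env_lock, maybe_reindex_device_id(gpu_id) as gpu_id:` wrapper exists because environment variables are process-global: if two launcher threads rewrote `CUDA_VISIBLE_DEVICES` at the same time, a scheduler subprocess could inherit the setting intended for a sibling. A simplified sketch of the idea (this context manager is an illustration of the pattern, not sglang's `maybe_reindex_device_id`):

    import os
    import threading
    from contextlib import contextmanager

    env_lock = threading.Lock()

    @contextmanager
    def reindex_device_id(gpu_id: int):
        # Simplified illustration: expose exactly one physical GPU to the
        # child process, which then sees that device as index 0.
        old = os.environ.get("CUDA_VISIBLE_DEVICES")
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        try:
            yield 0  # the re-indexed device id the child should use
        finally:
            if old is None:
                os.environ.pop("CUDA_VISIBLE_DEVICES", None)
            else:
                os.environ["CUDA_VISIBLE_DEVICES"] = old

    def launch(gpu_id: int, spawn):
        # Env mutation and process start are serialized under one lock, so a
        # concurrent launch cannot inherit this thread's CUDA_VISIBLE_DEVICES.
        with env_lock, reindex_device_id(gpu_id) as local_id:
            spawn(local_id)

    launch(3, lambda local_id: print(local_id, os.environ["CUDA_VISIBLE_DEVICES"]))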
@@ -346,6 +447,9 @@
                 self.workers
             )
         else:
+            assert (
+                req.bootstrap_room is not None
+            ), "req.bootstrap_room should not be None. Do not send requests directly to prefill or decode instances, but send to the router instead."
             self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
 
     def shortest_queue_scheduler(self, req):
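
The assert documents an invariant of the dispatch above: requests routed through the router carry a `bootstrap_room`, and `bootstrap_room % len(self.workers)` is deterministic, so (under the assumed PD-disaggregation pairing) the prefill and decode halves of one request resolve to the same dp rank. A two-line illustration:

    workers = ["dp0", "dp1", "dp2", "dp3"]

    def dispatch(bootstrap_room: int) -> str:
        # Mirrors self.workers[req.bootstrap_room % len(self.workers)].
        assert bootstrap_room is not None, "requests must come through the router"
        return workers[bootstrap_room % len(workers)]

    # Two lookups with the same bootstrap_room always agree on the worker.
    room = 12345
    assert dispatch(room) == dispatch(room) == "dp1"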
sglang/srt/managers/detokenizer_manager.py

@@ -31,7 +31,6 @@ from sglang.srt.managers.io_struct import (
     BatchStrOutput,
     BatchTokenIDOutput,
     FreezeGCReq,
-    MultiTokenizerRegisterReq,
 )
 from sglang.srt.managers.multi_tokenizer_mixin import MultiHttpWorkerDetokenizerMixin
 from sglang.srt.server_args import PortArgs, ServerArgs
@@ -104,12 +103,12 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
                 (BatchEmbeddingOutput, self.handle_batch_embedding_out),
                 (BatchTokenIDOutput, self.handle_batch_token_id_out),
                 (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
-                (MultiTokenizerRegisterReq, lambda x: x),
                 (FreezeGCReq, self.handle_freeze_gc_req),
             ]
         )
 
         self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
+        self.disable_tokenizer_batch_decode = server_args.disable_tokenizer_batch_decode
 
     def event_loop(self):
         """The event loop that handles requests"""
@@ -142,6 +141,7 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
             if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
                 return output
             assert len(output) > 0
+            # NOTE: We can always assume the last token is the matched stop token
             return output[:-1]
         return output
 
@@ -177,17 +177,39 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
             )
             surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
 
-        # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
-        surr_texts = self.tokenizer.batch_decode(
-            surr_ids,
-            skip_special_tokens=recv_obj.skip_special_tokens[0],
-            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
-        )
-        read_texts = self.tokenizer.batch_decode(
-            read_ids,
-            skip_special_tokens=recv_obj.skip_special_tokens[0],
-            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
-        )
+        # TODO(lmzheng): better handle skip_special_tokens/spaces_between_special_tokens per request
+        if self.disable_tokenizer_batch_decode:
+            surr_texts = [
+                self.tokenizer.decode(
+                    surr, skip_special_tokens=skip, spaces_between_special_tokens=space
+                )
+                for surr, skip, space in zip(
+                    surr_ids,
+                    recv_obj.skip_special_tokens,
+                    recv_obj.spaces_between_special_tokens,
+                )
+            ]
+            read_texts = [
+                self.tokenizer.decode(
+                    read, skip_special_tokens=skip, spaces_between_special_tokens=space
+                )
+                for read, skip, space in zip(
+                    read_ids,
+                    recv_obj.skip_special_tokens,
+                    recv_obj.spaces_between_special_tokens,
+                )
+            ]
+        else:
+            surr_texts = self.tokenizer.batch_decode(
+                surr_ids,
+                skip_special_tokens=recv_obj.skip_special_tokens[0],
+                spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
+            )
+            read_texts = self.tokenizer.batch_decode(
+                read_ids,
+                skip_special_tokens=recv_obj.skip_special_tokens[0],
+                spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
+            )
 
         # Incremental decoding
         output_strs = []
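
With the new `disable_tokenizer_batch_decode` server arg, each request is decoded with its own `skip_special_tokens` / `spaces_between_special_tokens` flags; the default batched path still applies request 0's flags to the whole batch. A toy demonstration of the difference (the `MockTokenizer` is a made-up stand-in for a Hugging Face tokenizer):

    class MockTokenizer:
        # Hypothetical tokenizer where token id 0 is a special token "<s>".
        def decode(self, ids, skip_special_tokens=False, **kwargs):
            toks = ["<s>" if i == 0 else f"t{i}" for i in ids]
            if skip_special_tokens:
                toks = [t for t in toks if t != "<s>"]
            return " ".join(toks)

        def batch_decode(self, batch, **kwargs):
            return [self.decode(ids, **kwargs) for ids in batch]

    tok = MockTokenizer()
    batch = [[0, 1, 2], [0, 3, 4]]
    skips = [True, False]  # request 1 asked to keep its special tokens

    # Default batched path: one fused call, request 1 silently gets request 0's flag.
    print(tok.batch_decode(batch, skip_special_tokens=skips[0]))
    # -> ['t1 t2', 't3 t4']

    # Per-request path (disable_tokenizer_batch_decode): each request keeps its flag.
    print([tok.decode(ids, skip_special_tokens=s) for ids, s in zip(batch, skips)])
    # -> ['t1 t2', '<s> t3 t4']

The batched call stays the default because it is one tokenizer invocation per batch instead of one per request; the per-request path trades that speed for correct per-request flags.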
@@ -226,6 +248,7 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
 
         return BatchStrOutput(
             rids=recv_obj.rids,
+            http_worker_ipcs=recv_obj.http_worker_ipcs,
             finished_reasons=recv_obj.finished_reasons,
             output_strs=output_strs,
             output_ids=recv_obj.decode_ids,
@@ -233,6 +256,7 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
             completion_tokens=recv_obj.completion_tokens,
             cached_tokens=recv_obj.cached_tokens,
             spec_verify_ct=recv_obj.spec_verify_ct,
+            spec_accepted_tokens=recv_obj.spec_accepted_tokens,
             input_token_logprobs_val=recv_obj.input_token_logprobs_val,
             input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
             output_token_logprobs_val=recv_obj.output_token_logprobs_val,
@@ -245,15 +269,18 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
             input_token_ids_logprobs_idx=recv_obj.input_token_ids_logprobs_idx,
             output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
             output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
+            output_token_entropy_val=recv_obj.output_token_entropy_val,
             output_hidden_states=recv_obj.output_hidden_states,
             placeholder_tokens_idx=None,
             placeholder_tokens_val=None,
+            token_steps=recv_obj.token_steps,
         )
 
     def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
         outputs = self.tokenizer.detokenize(recv_obj)
         return BatchMultimodalOutput(
             rids=recv_obj.rids,
+            http_worker_ipcs=recv_obj.http_worker_ipcs,
             finished_reasons=recv_obj.finished_reasons,
             outputs=outputs,
             prompt_tokens=recv_obj.prompt_tokens,