sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -17,14 +17,11 @@ import faulthandler
17
17
  import logging
18
18
  import multiprocessing as mp
19
19
  import signal
20
- import struct
21
- import sys
22
20
  import threading
23
21
  import time
24
22
  from collections import deque
25
23
  from enum import Enum, auto
26
- from multiprocessing import shared_memory
27
- from typing import Dict, List
24
+ from typing import List, Optional
28
25
 
29
26
  import psutil
30
27
  import setproctitle
@@ -39,15 +36,19 @@ from sglang.srt.managers.io_struct import (
39
36
  )
40
37
  from sglang.srt.managers.schedule_batch import Req
41
38
  from sglang.srt.managers.scheduler import run_scheduler_process
42
- from sglang.srt.managers.utils import DPBalanceMeta
43
- from sglang.srt.server_args import PortArgs, ServerArgs
44
- from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
39
+ from sglang.srt.server_args import (
40
+ DP_ATTENTION_HANDSHAKE_PORT_DELTA,
41
+ PortArgs,
42
+ ServerArgs,
43
+ )
45
44
  from sglang.srt.utils import (
46
45
  bind_port,
47
46
  configure_logger,
48
47
  get_zmq_socket,
49
48
  kill_itself_when_parent_died,
49
+ maybe_reindex_device_id,
50
50
  )
51
+ from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
51
52
  from sglang.utils import TypeBasedDispatcher, get_exception_traceback
52
53
 
53
54
  logger = logging.getLogger(__name__)
@@ -108,15 +109,9 @@ class DPBudget:
108
109
  class DataParallelController:
109
110
  """A controller that dispatches requests to multiple data parallel workers."""
110
111
 
111
- def __init__(
112
- self,
113
- server_args: ServerArgs,
114
- port_args: PortArgs,
115
- dp_balance_meta: DPBalanceMeta,
116
- ) -> None:
112
+ def __init__(self, server_args: ServerArgs, port_args: PortArgs) -> None:
117
113
  # for dp balance
118
114
  self.global_balance_id = 0
119
- self.balance_meta = dp_balance_meta
120
115
 
121
116
  # Parse args
122
117
  self.max_total_num_tokens = None
@@ -145,27 +140,20 @@ class DataParallelController:
145
140
  # Load balance budget
146
141
  self.dp_budget = DPBudget()
147
142
 
143
+ # To protect changing env vars to set CUDA_VISIBLE_DEVICES.
144
+ self.env_lock = threading.Lock()
145
+
148
146
  # Launch data parallel workers
149
147
  self.scheduler_procs = []
150
148
  self.workers: List[zmq.Socket] = [None] * server_args.dp_size
151
149
 
152
150
  if server_args.enable_dp_attention:
153
- dp_port_args = self.launch_dp_attention_schedulers(server_args, port_args)
151
+ self.launch_dp_attention_schedulers(server_args, port_args)
154
152
  self.control_message_step = server_args.tp_size
155
153
  else:
156
- dp_port_args = self.launch_dp_schedulers(server_args, port_args)
154
+ self.launch_dp_schedulers(server_args, port_args)
157
155
  self.control_message_step = 1
158
156
 
159
- # Only node rank 0 runs the real data parallel controller that dispatches the requests.
160
- if server_args.node_rank == 0:
161
- for dp_rank in range(server_args.dp_size):
162
- self.workers[dp_rank] = get_zmq_socket(
163
- self.context,
164
- zmq.PUSH,
165
- dp_port_args[dp_rank].scheduler_input_ipc_name,
166
- True,
167
- )
168
-
169
157
  self.max_req_input_len = None
170
158
 
171
159
  self.init_dispatcher()
@@ -198,13 +186,11 @@ class DataParallelController:
198
186
 
199
187
  threads = []
200
188
  sockets = []
201
- dp_port_args = []
202
189
  ready_events = []
203
190
  for dp_rank in range(server_args.dp_size):
204
191
  tmp_port_args = PortArgs.init_new(server_args)
205
192
  tmp_port_args.tokenizer_ipc_name = port_args.tokenizer_ipc_name
206
193
  tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
207
- dp_port_args.append(tmp_port_args)
208
194
 
209
195
  # This port is checked free in PortArgs.init_new.
210
196
  # We hold it first so that the next dp worker gets a different port
@@ -219,7 +205,17 @@ class DataParallelController:
219
205
  args=(server_args, tmp_port_args, base_gpu_id, dp_rank, ready_event),
220
206
  )
221
207
  threads.append(thread)
222
- base_gpu_id += server_args.tp_size * server_args.gpu_id_step
208
+ base_gpu_id += (
209
+ server_args.tp_size * server_args.pp_size * server_args.gpu_id_step
210
+ )
211
+
212
+ if server_args.node_rank == 0:
213
+ self.workers[dp_rank] = get_zmq_socket(
214
+ self.context,
215
+ zmq.PUSH,
216
+ tmp_port_args.scheduler_input_ipc_name,
217
+ True,
218
+ )
223
219
 
224
220
  # Free all sockets before starting the threads to launch TP workers
225
221
  for sock in sockets:
@@ -231,8 +227,6 @@ class DataParallelController:
231
227
  for event in ready_events:
232
228
  event.wait()
233
229
 
234
- return dp_port_args
235
-
236
230
  def launch_tensor_parallel_group_thread(
237
231
  self,
238
232
  server_args: ServerArgs,
@@ -249,19 +243,115 @@ class DataParallelController:
249
243
  while True:
250
244
  time.sleep(30 * 24 * 3600)
251
245
 
252
- def launch_dp_attention_schedulers(self, server_args, port_args):
253
- self.launch_tensor_parallel_group(server_args, port_args, 0, None)
254
- dp_port_args = []
255
- for dp_rank in range(server_args.dp_size):
256
- dp_port_args.append(PortArgs.init_new(server_args, dp_rank))
257
- return dp_port_args
246
+ def _broadcast_worker_ports(
247
+ self, server_args: ServerArgs, worker_ports: Optional[List[int]] = None
248
+ ) -> List[int]:
249
+ """Broadcast worker ports from node 0 to all other nodes.
250
+
251
+ Node 0 acts as the server, waiting for all other nodes to connect and
252
+ sending them the pre-allocated worker ports. Other nodes act as clients,
253
+ connecting to node 0 to receive their copy of the worker ports.
254
+
255
+ Args:
256
+ server_args: Server arguments containing node configuration.
257
+ worker_ports: Pre-allocated worker ports to broadcast.
258
+
259
+ Returns:
260
+ List of worker ports (same on all nodes after broadcast).
261
+ """
262
+ # Determine the endpoint for inter-node communication
263
+ if server_args.dist_init_addr is None:
264
+ endpoint = f"tcp://127.0.0.1:{server_args.port + DP_ATTENTION_HANDSHAKE_PORT_DELTA}"
265
+ else:
266
+ endpoint = f"tcp://{server_args.dist_init_addr}"
267
+
268
+ if server_args.node_rank == 0:
269
+ # Node 0: Broadcast worker ports to all other nodes
270
+ return self._broadcast_ports_as_server(
271
+ endpoint, server_args.nnodes - 1, worker_ports
272
+ )
273
+ else:
274
+ # Other nodes: Receive worker ports from node 0
275
+ return self._receive_ports_as_client(endpoint, server_args.node_rank)
276
+
277
+ def _broadcast_ports_as_server(
278
+ self, endpoint: str, expected_clients: int, worker_ports: List[int]
279
+ ) -> List[int]:
280
+ """Broadcast worker ports to all client nodes."""
281
+ logger.debug(f"Broadcasting worker ports to {expected_clients} client nodes")
282
+ logger.debug(f"Worker ports: {worker_ports}")
283
+
284
+ rep_socket = get_zmq_socket(self.context, zmq.REP, endpoint, True)
285
+
286
+ try:
287
+ connected_clients = 0
288
+ while connected_clients < expected_clients:
289
+ # Wait for client handshake
290
+ client_rank = rep_socket.recv().decode()
291
+ logger.debug(f"Received handshake from node {client_rank}")
292
+
293
+ # Send worker ports to client
294
+ rep_socket.send_pyobj(worker_ports)
295
+ connected_clients += 1
296
+ logger.debug(
297
+ f"Sent worker ports to {connected_clients}/{expected_clients} nodes"
298
+ )
299
+
300
+ logger.debug("Worker port broadcast completed")
301
+ return worker_ports
302
+ finally:
303
+ rep_socket.close()
304
+
305
+ def _receive_ports_as_client(self, endpoint: str, node_rank: int) -> List[int]:
306
+ """Receive worker ports from the server node."""
307
+ logger.debug(f"Connecting to node 0 to receive worker ports")
308
+
309
+ req_socket = get_zmq_socket(self.context, zmq.REQ, endpoint, False)
310
+ req_socket.setsockopt(zmq.RCVTIMEO, 60 * 1000) # 1 minute timeout
311
+ req_socket.setsockopt(zmq.SNDTIMEO, 60 * 1000)
312
+
313
+ try:
314
+ # Send handshake with our node rank
315
+ req_socket.send(str(node_rank).encode())
316
+
317
+ # Receive worker ports
318
+ worker_ports = req_socket.recv_pyobj()
319
+ logger.debug(f"Received {len(worker_ports)} worker ports from node 0")
320
+ return worker_ports
321
+ except zmq.Again:
322
+ logger.error("Timeout waiting for worker ports from node 0")
323
+ raise RuntimeError(
324
+ "Failed to receive worker ports from node 0 within timeout"
325
+ )
326
+ finally:
327
+ req_socket.close()
328
+
329
+ def launch_dp_attention_schedulers(
330
+ self, server_args: ServerArgs, port_args: PortArgs
331
+ ):
332
+ # Pre-allocate worker ports on node 0 to avoid conflicts
333
+ worker_ports = []
334
+ if server_args.node_rank == 0:
335
+ for dp_rank in range(server_args.dp_size):
336
+ port_and_socket = get_zmq_socket(self.context, zmq.PUSH)
337
+ worker_ports.append(port_and_socket[0])
338
+ self.workers[dp_rank] = port_and_socket[1]
339
+ logger.debug(f"Assigned port {port_and_socket[0]} to worker {dp_rank}")
340
+
341
+ broadcasted_ports = self._broadcast_worker_ports(
342
+ server_args, worker_ports if worker_ports else None
343
+ )
344
+ self.launch_tensor_parallel_group(
345
+ server_args, port_args, 0, None, broadcasted_ports
346
+ )
258
347
 
259
348
  def launch_tensor_parallel_group(
260
349
  self,
261
350
  server_args: ServerArgs,
262
351
  port_args: PortArgs,
263
352
  base_gpu_id: int,
264
- dp_rank: int,
353
+ dp_rank: Optional[int],
354
+ worker_ports: Optional[List[int]] = None,
265
355
  ):
266
356
  if not server_args.enable_dp_attention:
267
357
  logger.info(f"Launch DP{dp_rank} starting at GPU #{base_gpu_id}.")
@@ -298,7 +388,9 @@ class DataParallelController:
298
388
  server_args.dp_size,
299
389
  )
300
390
  # compute zmq ports for this dp rank
301
- rank_port_args = PortArgs.init_new(server_args, dp_rank)
391
+ rank_port_args = PortArgs.init_new(
392
+ server_args, dp_rank, worker_ports
393
+ )
302
394
  # Data parallelism reuses the tensor parallelism group,
303
395
  # so all dp ranks should use the same nccl port.
304
396
  rank_port_args.nccl_port = port_args.nccl_port
@@ -311,22 +403,22 @@ class DataParallelController:
311
403
  + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
312
404
  )
313
405
  moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
314
- proc = mp.Process(
315
- target=run_scheduler_process,
316
- args=(
317
- server_args,
318
- rank_port_args,
319
- gpu_id,
320
- tp_rank,
321
- moe_ep_rank,
322
- pp_rank,
323
- dp_rank,
324
- writer,
325
- self.balance_meta,
326
- ),
327
- )
328
- with memory_saver_adapter.configure_subprocess():
329
- proc.start()
406
+ with self.env_lock, maybe_reindex_device_id(gpu_id) as gpu_id:
407
+ proc = mp.Process(
408
+ target=run_scheduler_process,
409
+ args=(
410
+ server_args,
411
+ rank_port_args,
412
+ gpu_id,
413
+ tp_rank,
414
+ moe_ep_rank,
415
+ pp_rank,
416
+ dp_rank,
417
+ writer,
418
+ ),
419
+ )
420
+ with memory_saver_adapter.configure_subprocess():
421
+ proc.start()
330
422
  self.scheduler_procs.append(proc)
331
423
  scheduler_pipe_readers.append(reader)
332
424
 
@@ -355,6 +447,9 @@ class DataParallelController:
355
447
  self.workers
356
448
  )
357
449
  else:
450
+ assert (
451
+ req.bootstrap_room is not None
452
+ ), "req.bootstrap_room should not be None. Do not send requests directly to prefill or decode instances, but send to the router instead."
358
453
  self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
359
454
 
360
455
  def shortest_queue_scheduler(self, req):
@@ -370,31 +465,11 @@ class DataParallelController:
370
465
  if self.maybe_external_dp_rank_routing(req):
371
466
  return
372
467
 
373
- # This variable corresponds to the balance_id in TokenizedGenerateReqInput.
374
- # We use it to to control the number of onfly tokens (requests dispatched to workers but not yet received).
375
- def get_next_global_balance_id() -> int:
376
- INT32_MAX = 2147483647
377
- current_id = self.global_balance_id
378
- self.global_balance_id = (self.global_balance_id + 1) % INT32_MAX
379
- return current_id
380
-
381
- req.dp_balance_id = get_next_global_balance_id()
382
- with self.balance_meta.mutex:
383
- # 1. local_tokens represents the tokens currently inferring on the worker,
384
- # while onfly refers to the requests dispatched by the dispatcher but not yet received by the scheduler.
385
- onfly_info = self.balance_meta.get_shared_onfly()
386
- local_tokens = self.balance_meta.get_shared_local_tokens()
387
- total_tokens = [
388
- local_token + sum(onfly_dict.values())
389
- for local_token, onfly_dict in zip(local_tokens, onfly_info)
390
- ]
391
- target_worker = total_tokens.index(min(total_tokens))
392
- onfly_info[target_worker][req.dp_balance_id] = len(req.input_ids)
393
- # 2. write the new onfly info to the shm
394
- self.balance_meta.set_shared_onfly_info(onfly_info)
395
-
396
- # logger.info(f"dp workers {local_tokens=}, {onfly_info=}, {target_worker=}")
397
- self.workers[target_worker].send_pyobj(req)
468
+ logger.warning(
469
+ "The 'minimum_tokens' load balancing method is deprecated for now and will introduced later."
470
+ "Fall back to 'round_robin_scheduler'"
471
+ )
472
+ self.round_robin_scheduler(req)
398
473
 
399
474
  def event_loop(self):
400
475
  while True:
@@ -416,12 +491,9 @@ def run_data_parallel_controller_process(
416
491
  faulthandler.enable()
417
492
  configure_logger(server_args)
418
493
  parent_process = psutil.Process().parent()
419
- balance_meta = DPBalanceMeta(server_args.dp_size)
420
494
 
421
495
  try:
422
- controller = DataParallelController(
423
- server_args, port_args, dp_balance_meta=balance_meta
424
- )
496
+ controller = DataParallelController(server_args, port_args)
425
497
  pipe_writer.send(
426
498
  {
427
499
  "status": "ready",
@@ -440,6 +512,3 @@ def run_data_parallel_controller_process(
440
512
  traceback = get_exception_traceback()
441
513
  logger.error(f"DataParallelController hit an exception: {traceback}")
442
514
  parent_process.send_signal(signal.SIGQUIT)
443
- finally:
444
- # we need to destruct mp.Manager() in balance_meta
445
- balance_meta.destructor()
@@ -24,15 +24,13 @@ import psutil
24
24
  import setproctitle
25
25
  import zmq
26
26
 
27
- from sglang.srt.hf_transformers_utils import get_tokenizer
28
27
  from sglang.srt.managers.io_struct import (
29
- BatchEmbeddingOut,
28
+ BatchEmbeddingOutput,
30
29
  BatchMultimodalDecodeReq,
31
- BatchMultimodalOut,
32
- BatchStrOut,
33
- BatchTokenIDOut,
30
+ BatchMultimodalOutput,
31
+ BatchStrOutput,
32
+ BatchTokenIDOutput,
34
33
  FreezeGCReq,
35
- MultiTokenizerRegisterReq,
36
34
  )
37
35
  from sglang.srt.managers.multi_tokenizer_mixin import MultiHttpWorkerDetokenizerMixin
38
36
  from sglang.srt.server_args import PortArgs, ServerArgs
@@ -42,6 +40,7 @@ from sglang.srt.utils import (
42
40
  get_zmq_socket,
43
41
  kill_itself_when_parent_died,
44
42
  )
43
+ from sglang.srt.utils.hf_transformers_utils import get_tokenizer
45
44
  from sglang.utils import (
46
45
  TypeBasedDispatcher,
47
46
  find_printable_text,
@@ -101,15 +100,15 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
101
100
 
102
101
  self._request_dispatcher = TypeBasedDispatcher(
103
102
  [
104
- (BatchEmbeddingOut, self.handle_batch_embedding_out),
105
- (BatchTokenIDOut, self.handle_batch_token_id_out),
103
+ (BatchEmbeddingOutput, self.handle_batch_embedding_out),
104
+ (BatchTokenIDOutput, self.handle_batch_token_id_out),
106
105
  (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
107
- (MultiTokenizerRegisterReq, lambda x: x),
108
106
  (FreezeGCReq, self.handle_freeze_gc_req),
109
107
  ]
110
108
  )
111
109
 
112
110
  self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
111
+ self.disable_tokenizer_batch_decode = server_args.disable_tokenizer_batch_decode
113
112
 
114
113
  def event_loop(self):
115
114
  """The event loop that handles requests"""
@@ -142,14 +141,15 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
142
141
  if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
143
142
  return output
144
143
  assert len(output) > 0
144
+ # NOTE: We can always assume the last token is the matched stop token
145
145
  return output[:-1]
146
146
  return output
147
147
 
148
- def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOut):
148
+ def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOutput):
149
149
  # If it is embedding model, no detokenization is needed.
150
150
  return recv_obj
151
151
 
152
- def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOut):
152
+ def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOutput):
153
153
  bs = len(recv_obj.rids)
154
154
 
155
155
  # Initialize decode status
@@ -177,17 +177,39 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
177
177
  )
178
178
  surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
179
179
 
180
- # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
181
- surr_texts = self.tokenizer.batch_decode(
182
- surr_ids,
183
- skip_special_tokens=recv_obj.skip_special_tokens[0],
184
- spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
185
- )
186
- read_texts = self.tokenizer.batch_decode(
187
- read_ids,
188
- skip_special_tokens=recv_obj.skip_special_tokens[0],
189
- spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
190
- )
180
+ # TODO(lmzheng): better handle skip_special_tokens/spaces_between_special_tokens per request
181
+ if self.disable_tokenizer_batch_decode:
182
+ surr_texts = [
183
+ self.tokenizer.decode(
184
+ surr, skip_special_tokens=skip, spaces_between_special_tokens=space
185
+ )
186
+ for surr, skip, space in zip(
187
+ surr_ids,
188
+ recv_obj.skip_special_tokens,
189
+ recv_obj.spaces_between_special_tokens,
190
+ )
191
+ ]
192
+ read_texts = [
193
+ self.tokenizer.decode(
194
+ read, skip_special_tokens=skip, spaces_between_special_tokens=space
195
+ )
196
+ for read, skip, space in zip(
197
+ read_ids,
198
+ recv_obj.skip_special_tokens,
199
+ recv_obj.spaces_between_special_tokens,
200
+ )
201
+ ]
202
+ else:
203
+ surr_texts = self.tokenizer.batch_decode(
204
+ surr_ids,
205
+ skip_special_tokens=recv_obj.skip_special_tokens[0],
206
+ spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
207
+ )
208
+ read_texts = self.tokenizer.batch_decode(
209
+ read_ids,
210
+ skip_special_tokens=recv_obj.skip_special_tokens[0],
211
+ spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
212
+ )
191
213
 
192
214
  # Incremental decoding
193
215
  output_strs = []
@@ -224,8 +246,9 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
224
246
  s.sent_offset = len(output_str)
225
247
  output_strs.append(incremental_output)
226
248
 
227
- return BatchStrOut(
249
+ return BatchStrOutput(
228
250
  rids=recv_obj.rids,
251
+ http_worker_ipcs=recv_obj.http_worker_ipcs,
229
252
  finished_reasons=recv_obj.finished_reasons,
230
253
  output_strs=output_strs,
231
254
  output_ids=recv_obj.decode_ids,
@@ -233,6 +256,7 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
233
256
  completion_tokens=recv_obj.completion_tokens,
234
257
  cached_tokens=recv_obj.cached_tokens,
235
258
  spec_verify_ct=recv_obj.spec_verify_ct,
259
+ spec_accepted_tokens=recv_obj.spec_accepted_tokens,
236
260
  input_token_logprobs_val=recv_obj.input_token_logprobs_val,
237
261
  input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
238
262
  output_token_logprobs_val=recv_obj.output_token_logprobs_val,
@@ -245,15 +269,18 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
245
269
  input_token_ids_logprobs_idx=recv_obj.input_token_ids_logprobs_idx,
246
270
  output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
247
271
  output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
272
+ output_token_entropy_val=recv_obj.output_token_entropy_val,
248
273
  output_hidden_states=recv_obj.output_hidden_states,
249
274
  placeholder_tokens_idx=None,
250
275
  placeholder_tokens_val=None,
276
+ token_steps=recv_obj.token_steps,
251
277
  )
252
278
 
253
279
  def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
254
280
  outputs = self.tokenizer.detokenize(recv_obj)
255
- return BatchMultimodalOut(
281
+ return BatchMultimodalOutput(
256
282
  rids=recv_obj.rids,
283
+ http_worker_ipcs=recv_obj.http_worker_ipcs,
257
284
  finished_reasons=recv_obj.finished_reasons,
258
285
  outputs=outputs,
259
286
  prompt_tokens=recv_obj.prompt_tokens,