sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -11,32 +11,34 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
- """MultiTokenizerMixin is a class that provides nesscary methods for MultiTokenizerManager and DetokenizerManager."""
14
+ """Mixin class and utils for multi-http-worker mode"""
15
15
  import asyncio
16
- import dataclasses
17
- import json
18
16
  import logging
19
17
  import multiprocessing as multiprocessing
20
18
  import os
19
+ import pickle
21
20
  import sys
22
21
  import threading
22
+ from functools import partialmethod
23
23
  from multiprocessing import shared_memory
24
- from typing import Dict
24
+ from typing import Any, Dict
25
25
 
26
26
  import setproctitle
27
27
  import zmq
28
28
  import zmq.asyncio
29
29
 
30
30
  from sglang.srt.disaggregation.utils import DisaggregationMode, TransferBackend
31
+ from sglang.srt.managers.disagg_service import start_disagg_service
31
32
  from sglang.srt.managers.io_struct import (
32
- BatchEmbeddingOut,
33
- BatchMultimodalOut,
34
- BatchStrOut,
35
- BatchTokenIDOut,
33
+ BatchEmbeddingOutput,
34
+ BatchMultimodalOutput,
35
+ BatchStrOutput,
36
+ BatchTokenIDOutput,
36
37
  MultiTokenizerRegisterReq,
37
- MultiTokenizerWarpper,
38
+ MultiTokenizerWrapper,
38
39
  )
39
- from sglang.srt.managers.tokenizer_manager import TokenizerManager, _Communicator
40
+ from sglang.srt.managers.tokenizer_communicator_mixin import _Communicator
41
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
40
42
  from sglang.srt.server_args import PortArgs, ServerArgs
41
43
  from sglang.srt.utils import get_zmq_socket, kill_process_tree
42
44
  from sglang.utils import get_exception_traceback
@@ -44,302 +46,304 @@ from sglang.utils import get_exception_traceback
44
46
  logger = logging.getLogger(__name__)
45
47
 
46
48
 
47
- class MultiTokenizerMixin:
48
- """Mixin class for MultiTokenizerManager and DetokenizerManager"""
49
+ class SocketMapping:
50
+ def __init__(self):
51
+ self._zmq_context = zmq.Context()
52
+ self._mapping: Dict[str, zmq.Socket] = {}
49
53
 
50
- def create_sockets_mapping(self):
51
- if not hasattr(self, "tokenizer_mapping"):
52
- self.tokenizer_mapping = {}
53
- # Create ZMQ context if needed
54
- if not hasattr(self, "_zmq_context"):
55
- self._zmq_context = zmq.Context()
54
+ def clear_all_sockets(self):
55
+ for socket in self._mapping.values():
56
+ socket.close()
57
+ self._mapping.clear()
56
58
 
57
- def init_tokenizer_mapping(
58
- self, recv_obj: MultiTokenizerRegisterReq, worker_id: str
59
+ def register_ipc_mapping(
60
+ self, recv_obj: MultiTokenizerRegisterReq, worker_id: str, is_tokenizer: bool
59
61
  ):
60
- """init tokenizer mapping from register request"""
61
- ipc_name = recv_obj.ipc_name
62
- worker_id_int = int(worker_id)
63
-
64
- if worker_id_int not in self.tokenizer_mapping:
65
- socket = get_zmq_socket(self._zmq_context, zmq.PUSH, ipc_name, False)
66
- self.tokenizer_mapping[worker_id_int] = socket
67
- self.tokenizer_mapping[worker_id_int].send_pyobj(recv_obj)
68
- return True
69
- else:
70
- return False
71
-
72
- def register_tokenizer_ipc(self, recv_obj, worker_id):
73
- if worker_id not in self.tokenizer_mapping:
74
- # register the worker if not already done
75
- if isinstance(recv_obj, MultiTokenizerRegisterReq):
76
- return self.init_tokenizer_mapping(recv_obj, worker_id)
77
- else:
78
- logger.error(
79
- f"Worker {worker_id} not registered and not found in tokenizer mapping . "
80
- "Please ensure the worker is registered correctly."
81
- )
82
- return False
83
-
84
- def _handle_output_by_index(self, output, i):
85
- """NOTE: A maintainable method is better here."""
86
- if isinstance(output, BatchTokenIDOut):
87
- new_output = BatchTokenIDOut(
88
- rids=[output.rids[i]],
89
- finished_reasons=(
90
- [output.finished_reasons[i]]
91
- if len(output.finished_reasons) > i
92
- else None
93
- ),
94
- decoded_texts=(
95
- [output.decoded_texts[i]] if len(output.decoded_texts) > i else None
96
- ),
97
- decode_ids=(
98
- [output.decode_ids[i]] if len(output.decode_ids) > i else None
99
- ),
100
- read_offsets=(
101
- [output.read_offsets[i]] if len(output.read_offsets) > i else None
102
- ),
103
- output_ids=(
104
- [output.output_ids[i]]
105
- if output.output_ids and len(output.output_ids) > i
106
- else None
107
- ),
108
- skip_special_tokens=(
109
- [output.skip_special_tokens[i]]
110
- if len(output.skip_special_tokens) > i
111
- else None
112
- ),
113
- spaces_between_special_tokens=(
114
- [output.spaces_between_special_tokens[i]]
115
- if len(output.spaces_between_special_tokens) > i
116
- else None
117
- ),
118
- no_stop_trim=(
119
- [output.no_stop_trim[i]] if len(output.no_stop_trim) > i else None
120
- ),
121
- prompt_tokens=(
122
- [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
123
- ),
124
- completion_tokens=(
125
- [output.completion_tokens[i]]
126
- if len(output.completion_tokens) > i
127
- else None
128
- ),
129
- cached_tokens=(
130
- [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
131
- ),
132
- spec_verify_ct=(
133
- [output.spec_verify_ct[i]]
134
- if len(output.spec_verify_ct) > i
135
- else None
136
- ),
137
- input_token_logprobs_val=(
138
- [output.input_token_logprobs_val[i]]
139
- if output.input_token_logprobs_val
140
- else None
141
- ),
142
- input_token_logprobs_idx=(
143
- [output.input_token_logprobs_idx[i]]
144
- if output.input_token_logprobs_idx
145
- else None
146
- ),
147
- output_token_logprobs_val=(
148
- [output.output_token_logprobs_val[i]]
149
- if output.output_token_logprobs_val
150
- else None
151
- ),
152
- output_token_logprobs_idx=(
153
- [output.output_token_logprobs_idx[i]]
154
- if output.output_token_logprobs_idx
155
- else None
156
- ),
157
- input_top_logprobs_val=(
158
- [output.input_top_logprobs_val[i]]
159
- if output.input_top_logprobs_val
160
- else None
161
- ),
162
- input_top_logprobs_idx=(
163
- [output.input_top_logprobs_idx[i]]
164
- if output.input_top_logprobs_idx
165
- else None
166
- ),
167
- output_top_logprobs_val=(
168
- [output.output_top_logprobs_val[i]]
169
- if output.output_top_logprobs_val
170
- else None
171
- ),
172
- output_top_logprobs_idx=(
173
- [output.output_top_logprobs_idx[i]]
174
- if output.output_top_logprobs_idx
175
- else None
176
- ),
177
- input_token_ids_logprobs_val=(
178
- [output.input_token_ids_logprobs_val[i]]
179
- if output.input_token_ids_logprobs_val
180
- else None
181
- ),
182
- input_token_ids_logprobs_idx=(
183
- [output.input_token_ids_logprobs_idx[i]]
184
- if output.input_token_ids_logprobs_idx
185
- else None
186
- ),
187
- output_token_ids_logprobs_val=(
188
- [output.output_token_ids_logprobs_val[i]]
189
- if output.output_token_ids_logprobs_val
190
- else None
191
- ),
192
- output_token_ids_logprobs_idx=(
193
- [output.output_token_ids_logprobs_idx[i]]
194
- if output.output_token_ids_logprobs_idx
195
- else None
196
- ),
197
- output_hidden_states=(
198
- [output.output_hidden_states[i]]
199
- if output.output_hidden_states
200
- else None
201
- ),
202
- )
203
- elif isinstance(output, BatchEmbeddingOut):
204
- new_output = BatchEmbeddingOut(
205
- rids=[output.rids[i]],
206
- finished_reasons=(
207
- [output.finished_reasons[i]]
208
- if len(output.finished_reasons) > i
209
- else None
210
- ),
211
- embeddings=(
212
- [output.embeddings[i]] if len(output.embeddings) > i else None
213
- ),
214
- prompt_tokens=(
215
- [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
216
- ),
217
- cached_tokens=(
218
- [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
219
- ),
62
+ type_str = "tokenizer" if is_tokenizer else "detokenizer"
63
+ if worker_id in self._mapping:
64
+ logger.warning(
65
+ f"{type_str} already registered with worker {worker_id}, skipping..."
220
66
  )
221
- elif isinstance(output, BatchStrOut):
222
- new_output = BatchStrOut(
223
- rids=[output.rids[i]],
224
- finished_reasons=(
225
- [output.finished_reasons[i]]
226
- if len(output.finished_reasons) > i
227
- else None
228
- ),
229
- output_strs=(
230
- [output.output_strs[i]] if len(output.output_strs) > i else None
231
- ),
232
- output_ids=(
233
- [output.output_ids[i]]
234
- if output.output_ids and len(output.output_ids) > i
235
- else None
236
- ),
237
- prompt_tokens=(
238
- [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
239
- ),
240
- completion_tokens=(
241
- [output.completion_tokens[i]]
242
- if len(output.completion_tokens) > i
243
- else None
244
- ),
245
- cached_tokens=(
246
- [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
247
- ),
248
- spec_verify_ct=(
249
- [output.spec_verify_ct[i]]
250
- if len(output.spec_verify_ct) > i
251
- else None
252
- ),
253
- input_token_logprobs_val=(
254
- [output.input_token_logprobs_val[i]]
255
- if output.input_token_logprobs_val
256
- else None
257
- ),
258
- input_token_logprobs_idx=(
259
- [output.input_token_logprobs_idx[i]]
260
- if output.input_token_logprobs_idx
261
- else None
262
- ),
263
- output_token_logprobs_val=(
264
- [output.output_token_logprobs_val[i]]
265
- if output.output_token_logprobs_val
266
- else None
267
- ),
268
- output_token_logprobs_idx=(
269
- [output.output_token_logprobs_idx[i]]
270
- if output.output_token_logprobs_idx
271
- else None
272
- ),
273
- input_top_logprobs_val=(
274
- [output.input_top_logprobs_val[i]]
275
- if output.input_top_logprobs_val
276
- else None
277
- ),
278
- input_top_logprobs_idx=(
279
- [output.input_top_logprobs_idx[i]]
280
- if output.input_top_logprobs_idx
281
- else None
282
- ),
283
- output_top_logprobs_val=(
284
- [output.output_top_logprobs_val[i]]
285
- if output.output_top_logprobs_val
286
- else None
287
- ),
288
- output_top_logprobs_idx=(
289
- [output.output_top_logprobs_idx[i]]
290
- if output.output_top_logprobs_idx
291
- else None
292
- ),
293
- input_token_ids_logprobs_val=(
294
- [output.input_token_ids_logprobs_val[i]]
295
- if output.input_token_ids_logprobs_val
296
- else None
297
- ),
298
- input_token_ids_logprobs_idx=(
299
- [output.input_token_ids_logprobs_idx[i]]
300
- if output.input_token_ids_logprobs_idx
301
- else None
302
- ),
303
- output_token_ids_logprobs_val=(
304
- [output.output_token_ids_logprobs_val[i]]
305
- if output.output_token_ids_logprobs_val
306
- else None
307
- ),
308
- output_token_ids_logprobs_idx=(
309
- [output.output_token_ids_logprobs_idx[i]]
310
- if output.output_token_ids_logprobs_idx
311
- else None
312
- ),
313
- output_hidden_states=(
314
- [output.output_hidden_states[i]]
315
- if output.output_hidden_states
316
- else None
317
- ),
318
- )
319
- elif isinstance(output, BatchMultimodalOut):
320
- new_output = BatchMultimodalOut(
321
- rids=[output.rids[i]],
322
- finished_reasons=(
323
- [output.finished_reasons[i]]
324
- if len(output.finished_reasons) > i
325
- else None
326
- ),
327
- outputs=([output.outputs[i]] if len(output.outputs) > i else None),
328
- prompt_tokens=(
329
- [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
330
- ),
331
- completion_tokens=(
332
- [output.completion_tokens[i]]
333
- if len(output.completion_tokens) > i
334
- else None
335
- ),
336
- cached_tokens=(
337
- [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
338
- ),
67
+ return
68
+ logger.info(
69
+ f"{type_str} not registered with worker {worker_id}, registering..."
70
+ )
71
+ socket = get_zmq_socket(self._zmq_context, zmq.PUSH, recv_obj.ipc_name, False)
72
+ self._mapping[worker_id] = socket
73
+ self._mapping[worker_id].send_pyobj(recv_obj)
74
+
75
+ def send_output(self, worker_id: str, output: Any):
76
+ if worker_id not in self._mapping:
77
+ logger.error(
78
+ f"worker ID {worker_id} not registered. Check if the server Process is alive"
339
79
  )
340
- else:
341
- new_output = output
342
- return new_output
80
+ return
81
+ self._mapping[worker_id].send_pyobj(output)
82
+
83
+
84
+ def _handle_output_by_index(output, i):
85
+ """NOTE: A maintainable method is better here."""
86
+ if isinstance(output, BatchTokenIDOutput):
87
+ new_output = BatchTokenIDOutput(
88
+ rids=[output.rids[i]],
89
+ finished_reasons=(
90
+ [output.finished_reasons[i]]
91
+ if len(output.finished_reasons) > i
92
+ else None
93
+ ),
94
+ decoded_texts=(
95
+ [output.decoded_texts[i]] if len(output.decoded_texts) > i else None
96
+ ),
97
+ decode_ids=([output.decode_ids[i]] if len(output.decode_ids) > i else None),
98
+ read_offsets=(
99
+ [output.read_offsets[i]] if len(output.read_offsets) > i else None
100
+ ),
101
+ output_ids=(
102
+ [output.output_ids[i]]
103
+ if output.output_ids and len(output.output_ids) > i
104
+ else None
105
+ ),
106
+ skip_special_tokens=(
107
+ [output.skip_special_tokens[i]]
108
+ if len(output.skip_special_tokens) > i
109
+ else None
110
+ ),
111
+ spaces_between_special_tokens=(
112
+ [output.spaces_between_special_tokens[i]]
113
+ if len(output.spaces_between_special_tokens) > i
114
+ else None
115
+ ),
116
+ no_stop_trim=(
117
+ [output.no_stop_trim[i]] if len(output.no_stop_trim) > i else None
118
+ ),
119
+ prompt_tokens=(
120
+ [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
121
+ ),
122
+ completion_tokens=(
123
+ [output.completion_tokens[i]]
124
+ if len(output.completion_tokens) > i
125
+ else None
126
+ ),
127
+ cached_tokens=(
128
+ [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
129
+ ),
130
+ spec_verify_ct=(
131
+ [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
132
+ ),
133
+ input_token_logprobs_val=(
134
+ [output.input_token_logprobs_val[i]]
135
+ if output.input_token_logprobs_val
136
+ else None
137
+ ),
138
+ input_token_logprobs_idx=(
139
+ [output.input_token_logprobs_idx[i]]
140
+ if output.input_token_logprobs_idx
141
+ else None
142
+ ),
143
+ output_token_logprobs_val=(
144
+ [output.output_token_logprobs_val[i]]
145
+ if output.output_token_logprobs_val
146
+ else None
147
+ ),
148
+ output_token_logprobs_idx=(
149
+ [output.output_token_logprobs_idx[i]]
150
+ if output.output_token_logprobs_idx
151
+ else None
152
+ ),
153
+ input_top_logprobs_val=(
154
+ [output.input_top_logprobs_val[i]]
155
+ if output.input_top_logprobs_val
156
+ else None
157
+ ),
158
+ input_top_logprobs_idx=(
159
+ [output.input_top_logprobs_idx[i]]
160
+ if output.input_top_logprobs_idx
161
+ else None
162
+ ),
163
+ output_top_logprobs_val=(
164
+ [output.output_top_logprobs_val[i]]
165
+ if output.output_top_logprobs_val
166
+ else None
167
+ ),
168
+ output_top_logprobs_idx=(
169
+ [output.output_top_logprobs_idx[i]]
170
+ if output.output_top_logprobs_idx
171
+ else None
172
+ ),
173
+ input_token_ids_logprobs_val=(
174
+ [output.input_token_ids_logprobs_val[i]]
175
+ if output.input_token_ids_logprobs_val
176
+ else None
177
+ ),
178
+ input_token_ids_logprobs_idx=(
179
+ [output.input_token_ids_logprobs_idx[i]]
180
+ if output.input_token_ids_logprobs_idx
181
+ else None
182
+ ),
183
+ output_token_ids_logprobs_val=(
184
+ [output.output_token_ids_logprobs_val[i]]
185
+ if output.output_token_ids_logprobs_val
186
+ else None
187
+ ),
188
+ output_token_ids_logprobs_idx=(
189
+ [output.output_token_ids_logprobs_idx[i]]
190
+ if output.output_token_ids_logprobs_idx
191
+ else None
192
+ ),
193
+ output_hidden_states=(
194
+ [output.output_hidden_states[i]]
195
+ if output.output_hidden_states
196
+ else None
197
+ ),
198
+ placeholder_tokens_idx=None,
199
+ placeholder_tokens_val=None,
200
+ )
201
+ elif isinstance(output, BatchEmbeddingOutput):
202
+ new_output = BatchEmbeddingOutput(
203
+ rids=[output.rids[i]],
204
+ finished_reasons=(
205
+ [output.finished_reasons[i]]
206
+ if len(output.finished_reasons) > i
207
+ else None
208
+ ),
209
+ embeddings=([output.embeddings[i]] if len(output.embeddings) > i else None),
210
+ prompt_tokens=(
211
+ [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
212
+ ),
213
+ cached_tokens=(
214
+ [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
215
+ ),
216
+ placeholder_tokens_idx=None,
217
+ placeholder_tokens_val=None,
218
+ )
219
+ elif isinstance(output, BatchStrOutput):
220
+ new_output = BatchStrOutput(
221
+ rids=[output.rids[i]],
222
+ finished_reasons=(
223
+ [output.finished_reasons[i]]
224
+ if len(output.finished_reasons) > i
225
+ else None
226
+ ),
227
+ output_strs=(
228
+ [output.output_strs[i]] if len(output.output_strs) > i else None
229
+ ),
230
+ output_ids=(
231
+ [output.output_ids[i]]
232
+ if output.output_ids and len(output.output_ids) > i
233
+ else None
234
+ ),
235
+ prompt_tokens=(
236
+ [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
237
+ ),
238
+ completion_tokens=(
239
+ [output.completion_tokens[i]]
240
+ if len(output.completion_tokens) > i
241
+ else None
242
+ ),
243
+ cached_tokens=(
244
+ [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
245
+ ),
246
+ spec_verify_ct=(
247
+ [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
248
+ ),
249
+ input_token_logprobs_val=(
250
+ [output.input_token_logprobs_val[i]]
251
+ if output.input_token_logprobs_val
252
+ else None
253
+ ),
254
+ input_token_logprobs_idx=(
255
+ [output.input_token_logprobs_idx[i]]
256
+ if output.input_token_logprobs_idx
257
+ else None
258
+ ),
259
+ output_token_logprobs_val=(
260
+ [output.output_token_logprobs_val[i]]
261
+ if output.output_token_logprobs_val
262
+ else None
263
+ ),
264
+ output_token_logprobs_idx=(
265
+ [output.output_token_logprobs_idx[i]]
266
+ if output.output_token_logprobs_idx
267
+ else None
268
+ ),
269
+ input_top_logprobs_val=(
270
+ [output.input_top_logprobs_val[i]]
271
+ if output.input_top_logprobs_val
272
+ else None
273
+ ),
274
+ input_top_logprobs_idx=(
275
+ [output.input_top_logprobs_idx[i]]
276
+ if output.input_top_logprobs_idx
277
+ else None
278
+ ),
279
+ output_top_logprobs_val=(
280
+ [output.output_top_logprobs_val[i]]
281
+ if output.output_top_logprobs_val
282
+ else None
283
+ ),
284
+ output_top_logprobs_idx=(
285
+ [output.output_top_logprobs_idx[i]]
286
+ if output.output_top_logprobs_idx
287
+ else None
288
+ ),
289
+ input_token_ids_logprobs_val=(
290
+ [output.input_token_ids_logprobs_val[i]]
291
+ if output.input_token_ids_logprobs_val
292
+ else None
293
+ ),
294
+ input_token_ids_logprobs_idx=(
295
+ [output.input_token_ids_logprobs_idx[i]]
296
+ if output.input_token_ids_logprobs_idx
297
+ else None
298
+ ),
299
+ output_token_ids_logprobs_val=(
300
+ [output.output_token_ids_logprobs_val[i]]
301
+ if output.output_token_ids_logprobs_val
302
+ else None
303
+ ),
304
+ output_token_ids_logprobs_idx=(
305
+ [output.output_token_ids_logprobs_idx[i]]
306
+ if output.output_token_ids_logprobs_idx
307
+ else None
308
+ ),
309
+ output_hidden_states=(
310
+ [output.output_hidden_states[i]]
311
+ if output.output_hidden_states
312
+ else None
313
+ ),
314
+ placeholder_tokens_idx=None,
315
+ placeholder_tokens_val=None,
316
+ )
317
+ elif isinstance(output, BatchMultimodalOutput):
318
+ new_output = BatchMultimodalOutput(
319
+ rids=[output.rids[i]],
320
+ finished_reasons=(
321
+ [output.finished_reasons[i]]
322
+ if len(output.finished_reasons) > i
323
+ else None
324
+ ),
325
+ outputs=([output.outputs[i]] if len(output.outputs) > i else None),
326
+ prompt_tokens=(
327
+ [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
328
+ ),
329
+ completion_tokens=(
330
+ [output.completion_tokens[i]]
331
+ if len(output.completion_tokens) > i
332
+ else None
333
+ ),
334
+ cached_tokens=(
335
+ [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
336
+ ),
337
+ placeholder_tokens_idx=None,
338
+ placeholder_tokens_val=None,
339
+ )
340
+ else:
341
+ new_output = output
342
+ return new_output
343
+
344
+
345
+ class MultiHttpWorkerDetokenizerMixin:
346
+ """Mixin class for DetokenizerManager"""
343
347
 
344
348
  def get_worker_ids_from_req_rids(self, rids):
345
349
  if isinstance(rids, list):
@@ -350,9 +354,13 @@ class MultiTokenizerMixin:
350
354
  worker_ids = []
351
355
  return worker_ids
352
356
 
353
- def multi_tokenizer_manager_event_loop(self):
354
- """The event loop that handles requests, for multi tokenizer manager mode only"""
355
- self.create_sockets_mapping()
357
+ def maybe_clear_socket_mapping(self):
358
+ if hasattr(self, "socket_mapping"):
359
+ self.socket_mapping.clear_all_sockets()
360
+
361
+ def multi_http_worker_event_loop(self):
362
+ """The event loop that handles requests, for multi multi-http-worker mode"""
363
+ self.socket_mapping = SocketMapping()
356
364
  while True:
357
365
  recv_obj = self.recv_from_scheduler.recv_pyobj()
358
366
  output = self._request_dispatcher(recv_obj)
@@ -369,32 +377,16 @@ class MultiTokenizerMixin:
369
377
  # Send data using the corresponding socket
370
378
  for i, worker_id in enumerate(worker_ids):
371
379
  if isinstance(recv_obj, MultiTokenizerRegisterReq):
372
- if self.register_tokenizer_ipc(recv_obj, worker_id):
373
- logger.info(
374
- f"DetokenizerManager Created ZMQ socket for worker {worker_id}"
375
- )
376
- continue
380
+ self.socket_mapping.register_ipc_mapping(
381
+ recv_obj, worker_id, is_tokenizer=False
382
+ )
377
383
  else:
378
- if worker_id not in self.tokenizer_mapping:
379
- logger.error(
380
- f"Tokenizer Worker ID {worker_id} not registered. Check if the server Process {worker_id} is alive"
381
- )
382
- continue
383
- new_output = self._handle_output_by_index(output, i)
384
- self.tokenizer_mapping[worker_id].send_pyobj(new_output)
385
-
386
- def clear_tokenizer_mapping(self):
387
- if hasattr(self, "tokenizer_mapping"):
388
- for socket in self.tokenizer_mapping.values():
389
- try:
390
- socket.close()
391
- except Exception as e:
392
- logger.warning(f"Failed to close socket: {e}")
393
- self.tokenizer_mapping.clear()
394
-
395
-
396
- class MultiTokenizerRouter(TokenizerManager, MultiTokenizerMixin):
397
- """A router to receive requests from MultiTokenizerManager"""
384
+ new_output = _handle_output_by_index(output, i)
385
+ self.socket_mapping.send_output(worker_id, new_output)
386
+
387
+
388
+ class MultiTokenizerRouter:
389
+ """A router to receive requests from TokenizerWorker"""
398
390
 
399
391
  def __init__(
400
392
  self,
@@ -422,7 +414,7 @@ class MultiTokenizerRouter(TokenizerManager, MultiTokenizerMixin):
422
414
  self._handle_task = asyncio.run_coroutine_threadsafe(
423
415
  print_exception_wrapper(self.handle_loop), self._loop
424
416
  )
425
- self.init_disaggregation()
417
+ self.disaggregation_bootstrap_server = start_disagg_service(self.server_args)
426
418
 
427
419
  def _run_loop(self):
428
420
  self._loop.run_forever()
@@ -434,14 +426,14 @@ class MultiTokenizerRouter(TokenizerManager, MultiTokenizerMixin):
434
426
 
435
427
  async def handle_loop(self):
436
428
  # special reqs will recv from scheduler, need to route to right worker
437
- self.create_sockets_mapping()
429
+ self.socket_mapping = SocketMapping()
438
430
  while True:
439
431
  recv_obj = await self.recv_from_detokenizer.recv_pyobj()
440
432
  await self._distribute_result_to_workers(recv_obj)
441
433
 
442
434
  async def _distribute_result_to_workers(self, recv_obj):
443
435
  """Distribute result to corresponding workers based on rid"""
444
- if isinstance(recv_obj, MultiTokenizerWarpper):
436
+ if isinstance(recv_obj, MultiTokenizerWrapper):
445
437
  worker_ids = [recv_obj.worker_id]
446
438
  recv_obj = recv_obj.obj
447
439
  else:
@@ -454,32 +446,23 @@ class MultiTokenizerRouter(TokenizerManager, MultiTokenizerMixin):
454
446
  # Distribute result to each worker
455
447
  for i, worker_id in enumerate(worker_ids):
456
448
  if isinstance(recv_obj, MultiTokenizerRegisterReq):
457
- if self.register_tokenizer_ipc(recv_obj, worker_id):
458
- logger.info(
459
- f"MultiTokenizerRouter Created ZMQ socket for worker {worker_id}"
460
- )
461
- continue
449
+ self.socket_mapping.register_ipc_mapping(
450
+ recv_obj, worker_id, is_tokenizer=True
451
+ )
462
452
  else:
463
- if worker_id not in self.tokenizer_mapping:
464
- logger.error(
465
- f"Tokenizer Worker ID {worker_id} not registered. Check if the server Process {worker_id} is alive"
466
- )
467
- continue
468
- new_recv_obj = self._handle_output_by_index(recv_obj, i)
469
- self.tokenizer_mapping[worker_id].send_pyobj(new_recv_obj)
453
+ new_recv_obj = _handle_output_by_index(recv_obj, i)
454
+ self.socket_mapping.send_output(worker_id, new_recv_obj)
470
455
 
471
456
 
472
- class MultiTokenizerManager(TokenizerManager, MultiTokenizerMixin):
473
- """Multi Process Tokenizer Manager that tokenizes the text."""
457
+ class TokenizerWorker(TokenizerManager):
458
+ """Tokenizer Worker in multi-http-worker mode"""
474
459
 
475
460
  def __init__(
476
461
  self,
477
462
  server_args: ServerArgs,
478
463
  port_args: PortArgs,
479
464
  ):
480
- setproctitle.setproctitle(
481
- f"sglang::http_server/multi_tokenizer_manager:{os.getpid()}"
482
- )
465
+ setproctitle.setproctitle(f"sglang::tokenizer_worker:{os.getpid()}")
483
466
  # prevent init prefill bootstrapserver again
484
467
  disaggregation_mode = server_args.disaggregation_mode
485
468
  server_args.disaggregation_mode = "null"
@@ -535,42 +518,14 @@ async def print_exception_wrapper(func):
535
518
  sys.exit(1)
536
519
 
537
520
 
538
- def serialize_port_args(port_args: PortArgs) -> dict:
539
- """Serialize PortArgs into a shareable dictionary"""
540
- return {
541
- "tokenizer_ipc_name": port_args.tokenizer_ipc_name,
542
- "scheduler_input_ipc_name": port_args.scheduler_input_ipc_name,
543
- "detokenizer_ipc_name": port_args.detokenizer_ipc_name,
544
- "nccl_port": port_args.nccl_port,
545
- "rpc_ipc_name": port_args.rpc_ipc_name,
546
- "metrics_ipc_name": port_args.metrics_ipc_name,
547
- "tokenizer_worker_ipc_name": port_args.tokenizer_worker_ipc_name,
548
- }
549
-
550
-
551
- def deserialize_data(port_args: dict, server_args: dict):
552
- """Deserialize data from shared dictionaries"""
553
- return PortArgs(**port_args), ServerArgs(**server_args)
554
-
555
-
556
- def serialize_server_args(server_args: ServerArgs) -> dict:
557
- """Serialize ServerArgs into a shareable dictionary"""
558
- return dataclasses.asdict(server_args)
559
-
560
-
561
- def serialize_scheduler_info(scheduler_info: Dict) -> dict:
562
- """Serialize scheduler_info into a shareable dictionary"""
563
- return scheduler_info
564
-
565
-
566
- def deserialize_scheduler_info(data: dict) -> Dict:
567
- """Deserialize scheduler_info from a shared dictionary"""
568
- return data
521
+ def get_main_process_id() -> int:
522
+ """Get the main process ID"""
523
+ return multiprocessing.current_process()._parent_pid
569
524
 
570
525
 
571
- def write_to_shared_memory(data: dict, name: str) -> shared_memory.SharedMemory:
526
+ def write_to_shared_memory(obj, name: str) -> shared_memory.SharedMemory:
572
527
  """Write data to shared memory"""
573
- serialized = json.dumps(data).encode("utf-8")
528
+ serialized = pickle.dumps(obj)
574
529
  size = len(serialized)
575
530
  try:
576
531
  # Try to open existing shared memory
@@ -588,22 +543,17 @@ def write_to_shared_memory(data: dict, name: str) -> shared_memory.SharedMemory:
588
543
  return shm
589
544
 
590
545
 
591
- def read_from_shared_memory(name: str) -> dict:
546
+ def read_from_shared_memory(name: str) -> Any:
592
547
  """Read data from shared memory"""
593
548
  try:
594
549
  shm = shared_memory.SharedMemory(name=name)
595
- data = json.loads(bytes(shm.buf).decode("utf-8"))
550
+ data = pickle.loads(bytes(shm.buf))
596
551
  shm.close()
597
552
  return data
598
553
  except FileNotFoundError:
599
554
  raise FileNotFoundError(f"Shared memory {name} not found")
600
555
 
601
556
 
602
- def get_main_process_id() -> int:
603
- """Get the main process ID"""
604
- return multiprocessing.current_process()._parent_pid
605
-
606
-
607
557
  def write_data_for_multi_tokenizer(
608
558
  port_args: PortArgs, server_args: ServerArgs, scheduler_info: Dict
609
559
  ):
@@ -612,22 +562,22 @@ def write_data_for_multi_tokenizer(
612
562
  main_pid = get_main_process_id()
613
563
  current_pid = os.getpid()
614
564
  logger.info(f"main process ID: {main_pid}, current process ID: {current_pid}")
565
+ args = (port_args, server_args, scheduler_info)
566
+ args_shm = write_to_shared_memory(args, f"multi_tokenizer_args_{current_pid}")
567
+ args_shm.close()
615
568
 
616
- # Write port_args to shared memory
617
- port_args_shm = write_to_shared_memory(
618
- serialize_port_args(port_args), f"port_args_{current_pid}"
619
- )
620
- # Write server_args to shared memory
621
- server_args_shm = write_to_shared_memory(
622
- serialize_server_args(server_args), f"server_args_{current_pid}"
623
- )
624
- # Write scheduler_info to shared memory
625
- scheduler_info_shm = write_to_shared_memory(
626
- serialize_scheduler_info(scheduler_info), f"scheduler_info_{current_pid}"
627
- )
628
-
629
- port_args_shm.close()
630
- server_args_shm.close()
631
- scheduler_info_shm.close()
632
-
633
- return port_args_shm, server_args_shm, scheduler_info_shm
569
+ return args_shm
570
+
571
+
572
+ def monkey_patch_uvicorn_multiprocessing(timeout: float = 10):
573
+ """Monkey patch uvicorn multiprocessing is_alive timeout"""
574
+ # from default 5s -> 10s
575
+ try:
576
+ from uvicorn.supervisors.multiprocess import Process
577
+
578
+ Process.is_alive = partialmethod(Process.is_alive, timeout=timeout)
579
+
580
+ except ImportError:
581
+ logger.warning(
582
+ "uvicorn.supervisors.multiprocess not found, skipping monkey patch"
583
+ )