sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/lang/interpreter.py +1 -1
  7. sglang/launch_server.py +14 -0
  8. sglang/profiler.py +2 -2
  9. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  10. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  11. sglang/srt/configs/__init__.py +8 -0
  12. sglang/srt/configs/device_config.py +3 -1
  13. sglang/srt/configs/dots_ocr.py +64 -0
  14. sglang/srt/configs/dots_vlm.py +139 -0
  15. sglang/srt/configs/falcon_h1.py +360 -0
  16. sglang/srt/configs/internvl.py +6 -0
  17. sglang/srt/configs/load_config.py +9 -0
  18. sglang/srt/configs/model_config.py +181 -82
  19. sglang/srt/configs/qwen3_next.py +326 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +71 -19
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +326 -53
  44. sglang/srt/disaggregation/prefill.py +36 -17
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +192 -113
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  52. sglang/srt/entrypoints/grpc_server.py +810 -0
  53. sglang/srt/entrypoints/http_server.py +132 -57
  54. sglang/srt/entrypoints/openai/protocol.py +115 -7
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +207 -58
  57. sglang/srt/entrypoints/openai/serving_completions.py +17 -4
  58. sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +49 -4
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/environ.py +285 -0
  63. sglang/srt/eplb/eplb_manager.py +2 -2
  64. sglang/srt/eplb/expert_distribution.py +26 -13
  65. sglang/srt/eplb/expert_location.py +38 -8
  66. sglang/srt/eplb/expert_location_updater.py +1 -1
  67. sglang/srt/function_call/base_format_detector.py +3 -6
  68. sglang/srt/function_call/ebnf_composer.py +11 -9
  69. sglang/srt/function_call/function_call_parser.py +9 -2
  70. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  71. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  72. sglang/srt/function_call/json_array_parser.py +63 -0
  73. sglang/srt/function_call/kimik2_detector.py +17 -4
  74. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  75. sglang/srt/function_call/utils.py +96 -5
  76. sglang/srt/grpc/__init__.py +1 -0
  77. sglang/srt/grpc/compile_proto.py +245 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  81. sglang/srt/layers/activation.py +143 -9
  82. sglang/srt/layers/attention/aiter_backend.py +106 -82
  83. sglang/srt/layers/attention/ascend_backend.py +115 -9
  84. sglang/srt/layers/attention/attention_registry.py +206 -0
  85. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  86. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  87. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  88. sglang/srt/layers/attention/fla/chunk.py +242 -0
  89. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  90. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  91. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  92. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  93. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  94. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  95. sglang/srt/layers/attention/fla/index.py +37 -0
  96. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  97. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  98. sglang/srt/layers/attention/fla/op.py +66 -0
  99. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  100. sglang/srt/layers/attention/fla/utils.py +331 -0
  101. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  102. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  103. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  104. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  105. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  106. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  107. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  108. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  109. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  111. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  112. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  113. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  114. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  115. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  121. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  122. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  123. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  124. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  125. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  126. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  127. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  128. sglang/srt/layers/attention/nsa/utils.py +24 -0
  129. sglang/srt/layers/attention/nsa_backend.py +887 -0
  130. sglang/srt/layers/attention/tbo_backend.py +6 -6
  131. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  132. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  133. sglang/srt/layers/attention/triton_backend.py +57 -7
  134. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  135. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  136. sglang/srt/layers/attention/vision.py +58 -0
  137. sglang/srt/layers/attention/wave_backend.py +4 -4
  138. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  139. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  140. sglang/srt/layers/communicator.py +53 -7
  141. sglang/srt/layers/dp_attention.py +41 -2
  142. sglang/srt/layers/elementwise.py +3 -1
  143. sglang/srt/layers/layernorm.py +34 -15
  144. sglang/srt/layers/linear.py +55 -7
  145. sglang/srt/layers/logits_processor.py +44 -12
  146. sglang/srt/layers/moe/__init__.py +2 -1
  147. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  148. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  149. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  150. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  151. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  167. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  169. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  170. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  171. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  172. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  173. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  174. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  175. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  176. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  177. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  178. sglang/srt/layers/moe/topk.py +30 -9
  179. sglang/srt/layers/moe/utils.py +22 -7
  180. sglang/srt/layers/parameter.py +23 -6
  181. sglang/srt/layers/quantization/awq.py +19 -7
  182. sglang/srt/layers/quantization/base_config.py +11 -6
  183. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  184. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  185. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  186. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  187. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  188. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  189. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  190. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  191. sglang/srt/layers/quantization/fp8.py +78 -49
  192. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  193. sglang/srt/layers/quantization/gptq.py +25 -17
  194. sglang/srt/layers/quantization/modelopt_quant.py +225 -57
  195. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  196. sglang/srt/layers/quantization/mxfp4.py +77 -42
  197. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  199. sglang/srt/layers/quantization/quark/utils.py +97 -0
  200. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  201. sglang/srt/layers/quantization/unquant.py +135 -47
  202. sglang/srt/layers/quantization/w4afp8.py +26 -17
  203. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  204. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  205. sglang/srt/layers/rocm_linear_utils.py +44 -0
  206. sglang/srt/layers/rotary_embedding.py +78 -49
  207. sglang/srt/layers/sampler.py +213 -21
  208. sglang/srt/layers/utils.py +23 -0
  209. sglang/srt/lora/backend/base_backend.py +50 -8
  210. sglang/srt/lora/backend/chunked_backend.py +348 -0
  211. sglang/srt/lora/backend/triton_backend.py +99 -5
  212. sglang/srt/lora/layers.py +32 -0
  213. sglang/srt/lora/lora.py +8 -3
  214. sglang/srt/lora/lora_manager.py +52 -118
  215. sglang/srt/lora/mem_pool.py +25 -11
  216. sglang/srt/lora/triton_ops/__init__.py +4 -0
  217. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  219. sglang/srt/lora/utils.py +22 -11
  220. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  221. sglang/srt/managers/cache_controller.py +215 -314
  222. sglang/srt/managers/data_parallel_controller.py +115 -80
  223. sglang/srt/managers/detokenizer_manager.py +19 -15
  224. sglang/srt/managers/disagg_service.py +46 -0
  225. sglang/srt/managers/io_struct.py +340 -109
  226. sglang/srt/managers/mm_utils.py +44 -6
  227. sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
  228. sglang/srt/managers/multimodal_processor.py +1 -2
  229. sglang/srt/managers/overlap_utils.py +53 -0
  230. sglang/srt/managers/schedule_batch.py +240 -138
  231. sglang/srt/managers/schedule_policy.py +147 -19
  232. sglang/srt/managers/scheduler.py +501 -304
  233. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  234. sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
  235. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  236. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  237. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  238. sglang/srt/managers/template_manager.py +3 -3
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +321 -632
  241. sglang/srt/managers/tp_worker.py +81 -22
  242. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  243. sglang/srt/managers/utils.py +1 -45
  244. sglang/srt/mem_cache/allocator.py +15 -21
  245. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  246. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  247. sglang/srt/mem_cache/chunk_cache.py +8 -1
  248. sglang/srt/mem_cache/evict_policy.py +23 -0
  249. sglang/srt/mem_cache/hicache_storage.py +58 -34
  250. sglang/srt/mem_cache/hiradix_cache.py +227 -80
  251. sglang/srt/mem_cache/memory_pool.py +535 -58
  252. sglang/srt/mem_cache/memory_pool_host.py +239 -223
  253. sglang/srt/mem_cache/radix_cache.py +222 -73
  254. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  255. sglang/srt/mem_cache/storage/__init__.py +10 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  257. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  258. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  259. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  260. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  261. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  262. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  263. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
  264. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  265. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  266. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
  267. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  268. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  269. sglang/srt/metrics/collector.py +519 -132
  270. sglang/srt/metrics/func_timer.py +2 -7
  271. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  272. sglang/srt/metrics/utils.py +55 -0
  273. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  274. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  275. sglang/srt/model_executor/forward_batch_info.py +98 -57
  276. sglang/srt/model_executor/model_runner.py +433 -158
  277. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  278. sglang/srt/model_loader/__init__.py +9 -3
  279. sglang/srt/model_loader/loader.py +133 -5
  280. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  281. sglang/srt/model_loader/weight_utils.py +158 -3
  282. sglang/srt/models/apertus.py +686 -0
  283. sglang/srt/models/bailing_moe.py +820 -217
  284. sglang/srt/models/bailing_moe_nextn.py +168 -0
  285. sglang/srt/models/deepseek_nextn.py +6 -1
  286. sglang/srt/models/deepseek_v2.py +833 -152
  287. sglang/srt/models/dots_ocr.py +173 -0
  288. sglang/srt/models/dots_vlm.py +174 -0
  289. sglang/srt/models/dots_vlm_vit.py +337 -0
  290. sglang/srt/models/ernie4.py +1 -1
  291. sglang/srt/models/falcon_h1.py +576 -0
  292. sglang/srt/models/gemma3_causal.py +0 -2
  293. sglang/srt/models/gemma3_mm.py +1 -1
  294. sglang/srt/models/gemma3n_mm.py +2 -2
  295. sglang/srt/models/glm4_moe.py +14 -5
  296. sglang/srt/models/glm4_moe_nextn.py +2 -2
  297. sglang/srt/models/glm4v.py +5 -3
  298. sglang/srt/models/glm4v_moe.py +4 -1
  299. sglang/srt/models/gpt_oss.py +8 -31
  300. sglang/srt/models/internvl.py +28 -0
  301. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  302. sglang/srt/models/llama.py +4 -0
  303. sglang/srt/models/llama4.py +9 -0
  304. sglang/srt/models/llama_eagle3.py +13 -0
  305. sglang/srt/models/longcat_flash.py +3 -3
  306. sglang/srt/models/longcat_flash_nextn.py +1 -1
  307. sglang/srt/models/minicpmv.py +165 -3
  308. sglang/srt/models/mllama4.py +40 -4
  309. sglang/srt/models/opt.py +637 -0
  310. sglang/srt/models/qwen2_5_vl.py +29 -5
  311. sglang/srt/models/qwen2_audio.py +1 -1
  312. sglang/srt/models/qwen2_moe.py +124 -14
  313. sglang/srt/models/qwen2_vl.py +1 -1
  314. sglang/srt/models/qwen3.py +26 -5
  315. sglang/srt/models/qwen3_moe.py +71 -12
  316. sglang/srt/models/qwen3_next.py +1069 -0
  317. sglang/srt/models/qwen3_next_mtp.py +112 -0
  318. sglang/srt/models/qwen3_vl.py +787 -0
  319. sglang/srt/models/qwen3_vl_moe.py +471 -0
  320. sglang/srt/models/registry.py +15 -3
  321. sglang/srt/models/sarashina2_vision.py +269 -0
  322. sglang/srt/models/solar.py +505 -0
  323. sglang/srt/models/starcoder2.py +357 -0
  324. sglang/srt/models/step3_vl.py +1 -1
  325. sglang/srt/models/torch_native_llama.py +10 -3
  326. sglang/srt/models/utils.py +51 -0
  327. sglang/srt/multimodal/processors/base_processor.py +15 -7
  328. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  329. sglang/srt/multimodal/processors/glm4v.py +9 -9
  330. sglang/srt/multimodal/processors/internvl.py +153 -129
  331. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  332. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  333. sglang/srt/offloader.py +27 -3
  334. sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
  335. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  336. sglang/srt/sampling/sampling_batch_info.py +38 -17
  337. sglang/srt/sampling/sampling_params.py +7 -0
  338. sglang/srt/server_args.py +1030 -254
  339. sglang/srt/server_args_config_parser.py +146 -0
  340. sglang/srt/single_batch_overlap.py +151 -0
  341. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  342. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  343. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  344. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  345. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  346. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  347. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  348. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  349. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  350. sglang/srt/speculative/eagle_worker.py +253 -136
  351. sglang/srt/speculative/ngram_utils.py +428 -0
  352. sglang/srt/speculative/ngram_worker.py +245 -0
  353. sglang/srt/speculative/spec_info.py +52 -0
  354. sglang/srt/speculative/spec_utils.py +606 -0
  355. sglang/srt/speculative/standalone_worker.py +109 -0
  356. sglang/srt/torch_memory_saver_adapter.py +5 -7
  357. sglang/srt/tracing/trace.py +578 -0
  358. sglang/srt/two_batch_overlap.py +8 -5
  359. sglang/srt/utils/__init__.py +2 -0
  360. sglang/srt/{utils.py → utils/common.py} +445 -77
  361. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  362. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  363. sglang/srt/utils/rpd_utils.py +452 -0
  364. sglang/srt/utils/slow_rank_detector.py +71 -0
  365. sglang/srt/warmup.py +8 -4
  366. sglang/srt/weight_sync/utils.py +2 -2
  367. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  368. sglang/test/few_shot_gsm8k.py +1 -0
  369. sglang/test/get_logits_ut.py +57 -0
  370. sglang/test/run_eval.py +79 -11
  371. sglang/test/runners.py +5 -1
  372. sglang/test/simple_eval_common.py +5 -2
  373. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  374. sglang/test/test_block_fp8.py +2 -2
  375. sglang/test/test_cutlass_moe.py +24 -6
  376. sglang/test/test_deterministic.py +297 -0
  377. sglang/test/test_disaggregation_utils.py +77 -0
  378. sglang/test/test_fp4_moe.py +370 -1
  379. sglang/test/test_programs.py +1 -1
  380. sglang/test/test_utils.py +383 -5
  381. sglang/utils.py +22 -1
  382. sglang/version.py +1 -1
  383. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  384. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
  385. sglang/srt/disaggregation/launch_lb.py +0 -118
  386. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  387. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  388. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  389. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  390. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  391. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  392. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  393. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  394. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  395. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  # Adapted from vLLM's OpenAIServingResponses
3
3
  """Handler for /v1/responses requests"""
4
+ from __future__ import annotations
4
5
 
5
6
  import asyncio
6
7
  import copy
@@ -9,7 +10,7 @@ import logging
9
10
  import time
10
11
  from contextlib import AsyncExitStack
11
12
  from http import HTTPStatus
12
- from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
13
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union
13
14
 
14
15
  import jinja2
15
16
  import openai.types.responses as openai_responses_types
@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
54
55
  from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
55
56
  from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
56
57
  from sglang.srt.managers.io_struct import GenerateReqInput
57
- from sglang.srt.managers.template_manager import TemplateManager
58
- from sglang.srt.managers.tokenizer_manager import TokenizerManager
59
- from sglang.srt.reasoning_parser import ReasoningParser
58
+ from sglang.srt.parser.reasoning_parser import ReasoningParser
60
59
  from sglang.srt.utils import random_uuid
61
60
 
61
+ if TYPE_CHECKING:
62
+ from sglang.srt.managers.template_manager import TemplateManager
63
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
64
+
62
65
  logger = logging.getLogger(__name__)
63
66
 
64
67
 
@@ -120,6 +123,39 @@ class OpenAIServingResponses(OpenAIServingChat):
120
123
 
121
124
  self.background_tasks: dict[str, asyncio.Task] = {}
122
125
 
126
+ # error helpers dedicated for v1/responses
127
def create_error_response(
    self,
    message: str,
    err_type: str = "invalid_request_error",
    status_code: int = 400,
    param: Optional[str] = None,
) -> ORJSONResponse:
    """Build a JSON error response with the error nested under ``"error"``.

    Args:
        message: Human-readable error description.
        err_type: Value placed in the error's ``"type"`` field.
        status_code: Used both as the HTTP status of the response and as
            the ``"code"`` field of the error object.
        param: Value placed in the error's ``"param"`` field (may be None).

    Returns:
        An ``ORJSONResponse`` whose content is ``{"error": {...}}``.
    """
    payload = {
        "error": {
            "message": message,
            "type": err_type,
            "param": param,
            "code": status_code,
        }
    }
    return ORJSONResponse(content=payload, status_code=status_code)
141
+
142
def create_streaming_error_response(
    self,
    message: str,
    err_type: str = "BadRequestError",
    status_code: int = 400,
) -> str:
    """Serialize an error as a JSON string with the error nested under ``"error"``.

    Args:
        message: Human-readable error description.
        err_type: Value placed in the error's ``"type"`` field.
        status_code: Value placed in the error's ``"code"`` field.

    Returns:
        The JSON text of ``{"error": {...}}``; ``"param"`` is always null.
    """
    error_body = {
        "message": message,
        "type": err_type,
        "param": None,
        "code": status_code,
    }
    return json.dumps({"error": error_body})
158
+
123
159
  def _request_id_prefix(self) -> str:
124
160
  return "resp_"
125
161
 
@@ -242,6 +278,7 @@ class OpenAIServingResponses(OpenAIServingChat):
242
278
  sampling_params=sampling_params,
243
279
  stream=request.stream,
244
280
  rid=request.request_id,
281
+ extra_key=self._compute_extra_key(request),
245
282
  background=request.background,
246
283
  )
247
284
 
@@ -830,6 +867,13 @@ class OpenAIServingResponses(OpenAIServingChat):
830
867
 
831
868
  async for ctx in result_generator:
832
869
 
870
+ # Only process context objects that implement the `is_expecting_start()` method,
871
+ # which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
872
+ # Contexts without this method are skipped, as they do not represent a new turn
873
+ # or are not compatible with per-turn handling in the /v1/responses endpoint.
874
+ if not hasattr(ctx, "is_expecting_start"):
875
+ continue
876
+
833
877
  if ctx.is_expecting_start():
834
878
  current_output_index += 1
835
879
  sent_output_item_added = False
@@ -1247,6 +1291,7 @@ class OpenAIServingResponses(OpenAIServingChat):
1247
1291
  sampling_params=sampling_params,
1248
1292
  stream=adapted_request.stream,
1249
1293
  rid=request_id,
1294
+ extra_key=adapted_request.extra_key,
1250
1295
  return_logprob=adapted_request.return_logprob,
1251
1296
  logprob_start_len=adapted_request.logprob_start_len,
1252
1297
  top_logprobs_num=adapted_request.top_logprobs_num,
@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
25
25
  def _convert_to_internal_request(
26
26
  self,
27
27
  request: ScoringRequest,
28
+ raw_request: Request = None,
28
29
  ) -> tuple[ScoringRequest, ScoringRequest]:
29
30
  """Convert OpenAI scoring request to internal format"""
30
31
  # For scoring, we pass the request directly as the tokenizer_manager
sglang/srt/environ.py ADDED
@@ -0,0 +1,285 @@
1
+ import os
2
+ import subprocess
3
+ import warnings
4
+ from contextlib import ExitStack, contextmanager
5
+ from typing import Any
6
+
7
+
8
class EnvField:
    """Typed accessor for a single environment variable.

    The variable name is captured by ``__set_name__`` when an instance is
    assigned as a class attribute. Subclasses implement ``parse`` to convert
    the raw string read from ``os.environ`` into the field's value.
    """

    def __init__(self, default: Any):
        """Store the fallback returned by ``get`` when the variable is unset."""
        self.default = default
        # NOTE: ``None`` marks "not set" in the environment, so a value that
        # was explicitly set to None is tracked with this flag instead.
        # Always use clear() to reset the field back to its default fallback.
        self._set_to_none = False

    def __set_name__(self, owner, name):
        # The class-attribute name doubles as the environment variable name.
        self.name = name

    def parse(self, value: str) -> Any:
        """Convert the raw environment string; subclasses must override.

        Implementations signal an invalid value by raising ``ValueError``,
        which ``get`` turns into a warning plus the default.
        """
        raise NotImplementedError()

    def get(self) -> Any:
        """Return the current value of the field.

        Returns None if the field was explicitly set to None, the default if
        the variable is absent, and otherwise the parsed environment string.
        A ``ValueError`` from ``parse`` emits a warning and yields the default.
        """
        value = os.getenv(self.name)
        if self._set_to_none:
            # set(None) removes the variable, so it must not be present here.
            assert value is None
            return None

        if value is None:
            return self.default

        try:
            return self.parse(value)
        except ValueError as e:
            warnings.warn(
                f'Invalid value for {self.name}: {e}, using default "{self.default}"'
            )
            return self.default

    def is_set(self):
        """Return True if the variable exists or was explicitly set to None."""
        # NOTE: If None is manually set, it is considered as set.
        return self.name in os.environ or self._set_to_none

    def get_set_value_or(self, or_value: Any):
        """Return ``get()`` when the field is set, otherwise ``or_value``."""
        # NOTE: Ugly usage, but only way to get custom default value.
        return self.get() if self.is_set() else or_value

    def set(self, value: Any):
        """Set the field; None removes the variable and records the None flag."""
        if value is None:
            self._set_to_none = True
            os.environ.pop(self.name, None)
        else:
            self._set_to_none = False
            os.environ[self.name] = str(value)

    @contextmanager
    def override(self, value: Any):
        """Temporarily set the field to ``value`` for the ``with`` block.

        The previous environment entry and None flag are restored on exit,
        including when the body of the ``with`` block raises.
        """
        backup_present = self.name in os.environ
        backup_value = os.environ.get(self.name)
        backup_set_to_none = self._set_to_none
        self.set(value)
        try:
            yield
        finally:
            # Restore in ``finally`` so an exception inside the ``with`` body
            # cannot leak the overridden value into the process environment.
            if backup_present:
                os.environ[self.name] = backup_value
            else:
                os.environ.pop(self.name, None)
            self._set_to_none = backup_set_to_none

    def clear(self):
        """Remove the variable and the None flag so ``get`` returns the default."""
        os.environ.pop(self.name, None)
        self._set_to_none = False

    @property
    def value(self):
        """Property shorthand for ``get()``."""
        return self.get()
75
+
76
+
77
class EnvStr(EnvField):
    """Environment field whose value is the raw string, unchanged."""

    def parse(self, value: str) -> str:
        """Return ``value`` as-is; no conversion is applied."""
        raw = value
        return raw
80
+
81
+
82
class EnvBool(EnvField):
    """Environment field parsed as a boolean flag."""

    def parse(self, value: str) -> bool:
        """Map a case-insensitive true/false spelling to a bool.

        Raises:
            ValueError: If the lowercased text is not one of the accepted
                spellings (true/1/yes/y or false/0/no/n).
        """
        value = value.lower()
        spellings = {
            "true": True,
            "1": True,
            "yes": True,
            "y": True,
            "false": False,
            "0": False,
            "no": False,
            "n": False,
        }
        parsed = spellings.get(value)
        if parsed is None:
            raise ValueError(f'"{value}" is not a valid boolean value')
        return parsed
90
+
91
+
92
class EnvInt(EnvField):
    """Environment field parsed as an integer."""

    def parse(self, value: str) -> int:
        """Convert ``value`` with ``int()``.

        Raises:
            ValueError: If ``value`` is not a valid integer literal. The
                original conversion error is attached as the cause.
        """
        try:
            return int(value)
        except ValueError as err:
            # Chain explicitly so the traceback shows the conversion failure
            # as the cause rather than as an unrelated secondary error.
            raise ValueError(f'"{value}" is not a valid integer value') from err
98
+
99
+
100
class EnvFloat(EnvField):
    """Environment field parsed as a float."""

    def parse(self, value: str) -> float:
        """Convert ``value`` with ``float()``.

        Raises:
            ValueError: If ``value`` is not a valid float literal. The
                original conversion error is attached as the cause.
        """
        try:
            return float(value)
        except ValueError as err:
            # Chain explicitly so the traceback shows the conversion failure
            # as the cause rather than as an unrelated secondary error.
            raise ValueError(f'"{value}" is not a valid float value') from err
106
+
107
+
108
class Envs:
    """Registry of the environment variables declared by this module.

    Each class attribute is an ``EnvBool`` / ``EnvInt`` / ``EnvFloat`` /
    ``EnvStr`` field. The constructor argument is presumably the default
    used when the variable is unset, and the attribute name is presumably
    the environment variable name -- confirm against ``EnvField``, whose
    definition is not part of this block.
    """

    # fmt: off

    # Model & File Download
    SGLANG_USE_MODELSCOPE = EnvBool(False)

    # Test & Debug
    SGLANG_IS_IN_CI = EnvBool(False)
    SGLANG_AMD_CI = EnvBool(False)
    SGLANG_TEST_RETRACT = EnvBool(False)
    SGLANG_SET_CPU_AFFINITY = EnvBool(False)
    SGLANG_PROFILE_WITH_STACK = EnvBool(True)
    SGLANG_RECORD_STEP_TIME = EnvBool(False)
    SGLANG_GC_LOG = EnvBool(False)
    SGLANG_FORCE_SHUTDOWN = EnvBool(False)
    SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
    SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
    SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
    SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
    SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")

    # Model Parallel
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)

    # Constrained Decoding
    SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
    SGLANG_GRAMMAR_TIMEOUT = EnvFloat(300)

    # Hi-Cache
    SGLANG_HICACHE_HF3FS_CONFIG_PATH = EnvStr(None)

    # Mooncake KV Transfer
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvBool(False)
    ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False)

    # AMD & ROCm
    SGLANG_USE_AITER = EnvBool(False)
    SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)

    # Quantization
    SGLANG_INT4_WEIGHT = EnvBool(False)
    SGLANG_CPU_QUANTIZATION = EnvBool(False)
    SGLANG_USE_DYNAMIC_MXFP4_LINEAR = EnvBool(False)
    SGLANG_FORCE_FP8_MARLIN = EnvBool(False)

    # Flashinfer
    SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
    SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)

    # Triton
    SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)

    # Torch Compile
    SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)

    # EPLB
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
    SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)

    # TBO
    SGLANG_TBO_DEBUG = EnvBool(False)

    # DeepGemm
    SGLANG_ENABLE_JIT_DEEPGEMM = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_PRECOMPILE = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS = EnvInt(4)
    SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE = EnvBool(False)
    SGLANG_DG_CACHE_DIR = EnvStr(os.path.expanduser("~/.cache/deep_gemm"))
    SGLANG_DG_USE_NVRTC = EnvBool(False)
    SGLANG_USE_DEEPGEMM_BMM = EnvBool(False)

    # sgl-kernel
    SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)

    # vLLM dependencies
    USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
    USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)

    # The entries below carry no section heading of their own in the source.
    USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
    RETURN_ORIGINAL_LOGPROB = EnvBool(False)
    SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
    SGLANG_MOE_PADDING = EnvBool(False)
    SGLANG_CUTLASS_MOE = EnvBool(False)
    HF_HUB_DISABLE_XET = EnvBool(False)
    DISABLE_OPENAPI_DOC = EnvBool(False)
    SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
    SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
    SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
    SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
    SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)

    # Deterministic inference
    SGLANG_ENABLE_DETERMINISTIC_INFERENCE = EnvBool(False)
    SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE = EnvInt(4096)
    SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE = EnvInt(2048)
    SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)

    # fmt: on
211
+
212
+
213
# Module-level instance of the registry; the example functions in this module
# read and modify fields through it (e.g. ``envs.SGLANG_TEST_RETRACT``).
envs = Envs()
214
+
215
+
216
def _convert_SGL_to_SGLANG():
    """Mirror every deprecated ``SGL_*`` variable under its ``SGLANG_*`` name.

    For each environment variable whose name starts with ``SGL_``, emit a
    deprecation warning and copy its value to the same name with the prefix
    replaced by ``SGLANG_``. The old variable is left in place, and an
    existing ``SGLANG_*`` value of the same name is overwritten.
    """
    old_prefix = "SGL_"
    new_prefix = "SGLANG_"
    # Iterate over a snapshot: the loop body writes to os.environ, and a
    # mapping should not be modified while its live view is being iterated.
    for key, value in list(os.environ.items()):
        if not key.startswith(old_prefix):
            continue
        new_key = new_prefix + key[len(old_prefix):]
        warnings.warn(
            f"Environment variable {key} is deprecated, please use {new_key}"
        )
        os.environ[new_key] = value
224
+
225
+
226
+ _convert_SGL_to_SGLANG()
227
+
228
+
229
def example_with_exit_stack():
    """Show an override driven by an explicit ``ExitStack``.

    The final assertion expects the field to read ``None`` after the override
    is undone, so the caller must have put it in that state beforehand
    (``examples()`` calls ``set(None)`` first).
    """
    # Use this style of context manager in unit test
    exit_stack = ExitStack()
    exit_stack.enter_context(envs.SGLANG_TEST_RETRACT.override(False))
    try:
        assert envs.SGLANG_TEST_RETRACT.value is False
    finally:
        # Always undo the override, even if the check above fails, so the
        # process environment is not left modified.
        exit_stack.close()
    assert envs.SGLANG_TEST_RETRACT.value is None
236
+
237
+
238
def example_with_subprocess():
    """Show that an override is visible to child processes and is undone after.

    Spawns a child interpreter that prints ``os.getenv('SGLANG_TEST_RETRACT')``,
    once inside ``override(True)`` (expects "True") and once after the
    override has exited (expects "None").
    """
    import sys

    # Use the running interpreter; a bare "python" may be absent from PATH or
    # may resolve to a different installation.
    command = [
        sys.executable,
        "-c",
        "import os; print(os.getenv('SGLANG_TEST_RETRACT'))",
    ]

    def _child_stdout() -> str:
        # subprocess.run drains both pipes, waits for the child and closes the
        # pipes. That avoids the wait()-before-read deadlock pattern and
        # leaves no unreaped child process behind.
        completed = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        return completed.stdout.decode("utf-8").strip()

    with envs.SGLANG_TEST_RETRACT.override(True):
        assert _child_stdout() == "True"

    assert _child_stdout() == "None"
251
+
252
+
253
def examples():
    """Walk through the field API using ``envs.SGLANG_TEST_RETRACT``.

    Exercises clear / set / is_set / override in sequence, then runs the
    ExitStack and subprocess examples. Each step depends on the state left by
    the previous one, so the order matters.
    """
    # Example usage for envs
    retract = envs.SGLANG_TEST_RETRACT

    # After clear() the field reads False.
    retract.clear()
    assert retract.value is False

    # Explicitly setting None counts as "set" and reads back as None.
    retract.set(None)
    assert retract.is_set()
    assert retract.value is None

    retract.clear()
    assert not retract.is_set()

    retract.set(True)
    assert retract.value is True

    # An override to None is visible inside the block and undone afterwards.
    with retract.override(None):
        assert retract.is_set()
        assert retract.value is None
    assert retract.value is True

    # Same in the other direction: start from None, override with True.
    retract.set(None)
    with retract.override(True):
        assert retract.value is True
    assert retract.is_set()
    assert retract.value is None

    example_with_exit_stack()
    example_with_subprocess()
282
+
283
+
284
# Running this module as a script executes the usage examples above.
if __name__ == "__main__":
    examples()
@@ -55,7 +55,7 @@ class EPLBManager:
55
55
  enable_timing = self._rebalance_layers_per_chunk is None
56
56
 
57
57
  if enable_timing:
58
- torch.cuda.synchronize()
58
+ torch.get_device_module().synchronize()
59
59
  time_start = time.time()
60
60
 
61
61
  dump_record_output = get_global_expert_distribution_recorder().dump_record(
@@ -85,7 +85,7 @@ class EPLBManager:
85
85
 
86
86
  msg = f"[EPLBManager] rebalance end"
87
87
  if enable_timing:
88
- torch.cuda.synchronize()
88
+ torch.get_device_module().synchronize()
89
89
  time_end = time.time()
90
90
  msg += f" time={time_end - time_start:.3f}s"
91
91
  logger.info(msg)
@@ -11,6 +11,9 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
+
15
+ from __future__ import annotations
16
+
14
17
  import logging
15
18
  import math
16
19
  import os
@@ -19,16 +22,20 @@ from abc import ABC
19
22
  from collections import deque
20
23
  from contextlib import contextmanager
21
24
  from pathlib import Path
22
- from typing import Any, Dict, List, Literal, Optional, Tuple, Type
25
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
23
26
 
24
27
  import einops
25
28
  import torch
26
29
  import torch.distributed
27
30
 
28
- from sglang.srt.eplb.expert_location import ExpertLocationMetadata
29
31
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
30
32
  from sglang.srt.server_args import ServerArgs
31
- from sglang.srt.utils import Withable, get_bool_env_var
33
+ from sglang.srt.utils import Withable, get_bool_env_var, is_npu
34
+
35
+ _is_npu = is_npu()
36
+
37
+ if TYPE_CHECKING:
38
+ from sglang.srt.eplb.expert_location import ExpertLocationMetadata
32
39
 
33
40
  logger = logging.getLogger(__name__)
34
41
 
@@ -43,7 +50,7 @@ class ExpertDistributionRecorder(ABC):
43
50
  @staticmethod
44
51
  def init_new(
45
52
  server_args: ServerArgs,
46
- expert_location_metadata: "ExpertLocationMetadata",
53
+ expert_location_metadata: ExpertLocationMetadata,
47
54
  rank: int,
48
55
  ):
49
56
  if server_args.expert_distribution_recorder_mode is not None:
@@ -118,7 +125,7 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
118
125
  def __init__(
119
126
  self,
120
127
  server_args: ServerArgs,
121
- expert_location_metadata: "ExpertLocationMetadata",
128
+ expert_location_metadata: ExpertLocationMetadata,
122
129
  rank: int,
123
130
  ):
124
131
  self._server_args = server_args
@@ -211,7 +218,9 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
211
218
  def _on_hook(self, hook_name: str, **kwargs):
212
219
  if self._disable_all:
213
220
  return
214
- if not (self._recording or torch.cuda.is_current_stream_capturing()):
221
+ if not (
222
+ self._recording or torch.get_device_module().is_current_stream_capturing()
223
+ ):
215
224
  return
216
225
  gatherer = self._single_pass_gatherers[
217
226
  self._accumulator.get_single_pass_gatherer_key(
@@ -279,7 +288,7 @@ class _SinglePassGatherer(ABC):
279
288
  @staticmethod
280
289
  def init_new(
281
290
  server_args: ServerArgs,
282
- expert_location_metadata: "ExpertLocationMetadata",
291
+ expert_location_metadata: ExpertLocationMetadata,
283
292
  rank: int,
284
293
  ) -> "_SinglePassGatherer":
285
294
  if server_args.expert_distribution_recorder_mode == "per_token":
@@ -307,7 +316,7 @@ class _SinglePassGatherer(ABC):
307
316
 
308
317
  return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
309
318
 
310
- def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
319
+ def __init__(self, expert_location_metadata: ExpertLocationMetadata, rank: int):
311
320
  self._expert_location_metadata = expert_location_metadata
312
321
  self._rank = rank
313
322
 
@@ -346,7 +355,7 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
346
355
  def __init__(
347
356
  self,
348
357
  server_args: ServerArgs,
349
- expert_location_metadata: "ExpertLocationMetadata",
358
+ expert_location_metadata: ExpertLocationMetadata,
350
359
  rank: int,
351
360
  ):
352
361
  super().__init__(expert_location_metadata, rank)
@@ -446,6 +455,10 @@ def _list_sum(a: List, b: List) -> List:
446
455
  class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
447
456
  def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
448
457
  super().__init__(*args, **kwargs)
458
+ if not _is_npu:
459
+ device = "cuda"
460
+ else:
461
+ device = "npu"
449
462
  self._enable_global_physical_experts = enable_global_physical_experts
450
463
  self._data = torch.zeros(
451
464
  (
@@ -457,7 +470,7 @@ class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
457
470
  ),
458
471
  ),
459
472
  dtype=torch.int,
460
- device="cuda",
473
+ device=device,
461
474
  )
462
475
 
463
476
  def reset(self):
@@ -561,7 +574,7 @@ class _Accumulator(ABC):
561
574
  @staticmethod
562
575
  def init_new(
563
576
  server_args: ServerArgs,
564
- expert_location_metadata: "ExpertLocationMetadata",
577
+ expert_location_metadata: ExpertLocationMetadata,
565
578
  rank: int,
566
579
  ) -> "_Accumulator":
567
580
  return _Accumulator.get_class(server_args)(
@@ -580,7 +593,7 @@ class _Accumulator(ABC):
580
593
  def __init__(
581
594
  self,
582
595
  server_args: ServerArgs,
583
- expert_location_metadata: "ExpertLocationMetadata",
596
+ expert_location_metadata: ExpertLocationMetadata,
584
597
  rank: int,
585
598
  ):
586
599
  self._server_args = server_args
@@ -779,7 +792,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
779
792
 
780
793
  if self._first_dump:
781
794
  self._first_dump = False
782
- torch.cuda.empty_cache()
795
+ torch.get_device_module().empty_cache()
783
796
 
784
797
  torch.distributed.all_reduce(
785
798
  logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
@@ -11,21 +11,26 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
+
15
+ from __future__ import annotations
16
+
14
17
  import json
15
18
  import logging
16
19
  import random
17
20
  from dataclasses import dataclass
18
21
  from pathlib import Path
19
- from typing import List, Optional
22
+ from typing import TYPE_CHECKING, List, Optional
20
23
 
21
24
  import torch
22
25
  import torch.distributed
23
26
  import torch.nn.functional as F
24
27
 
25
- from sglang.srt.configs.model_config import ModelConfig
26
28
  from sglang.srt.eplb import eplb_algorithms
27
29
  from sglang.srt.model_loader import get_model_architecture
28
- from sglang.srt.server_args import ServerArgs
30
+
31
+ if TYPE_CHECKING:
32
+ from sglang.srt.configs.model_config import ModelConfig
33
+ from sglang.srt.server_args import ServerArgs
29
34
 
30
35
  logger = logging.getLogger(__name__)
31
36
 
@@ -226,6 +231,7 @@ class ExpertLocationMetadata:
226
231
  logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
227
232
  logical_to_rank_dispatch_physical_map=(
228
233
  compute_logical_to_rank_dispatch_physical_map(
234
+ server_args=server_args,
229
235
  logical_to_all_physical_map=logical_to_all_physical_map,
230
236
  num_gpus=ep_size,
231
237
  num_physical_experts=num_physical_experts,
@@ -335,6 +341,7 @@ def _pad_nested_array(arr, pad_value):
335
341
 
336
342
  # TODO optimize performance (rewrite and/or run in separate process with overlap)
337
343
  def compute_logical_to_rank_dispatch_physical_map(
344
+ server_args: ServerArgs,
338
345
  logical_to_all_physical_map: torch.Tensor,
339
346
  num_gpus: int,
340
347
  num_physical_experts: int,
@@ -343,7 +350,9 @@ def compute_logical_to_rank_dispatch_physical_map(
343
350
  ):
344
351
  r = random.Random(seed)
345
352
 
346
- num_local_physical_experts = num_physical_experts // num_gpus
353
+ num_local_gpu_physical_experts = num_physical_experts // num_gpus
354
+ num_gpus_per_node = server_args.ep_size // server_args.nnodes
355
+ num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
347
356
  num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
348
357
  dtype = logical_to_all_physical_map.dtype
349
358
 
@@ -367,13 +376,28 @@ def compute_logical_to_rank_dispatch_physical_map(
367
376
  physical_expert_id
368
377
  for physical_expert_id in candidate_physical_expert_ids
369
378
  if _compute_gpu_id_of_physical_expert(
370
- physical_expert_id, num_local_physical_experts
379
+ physical_expert_id, num_local_gpu_physical_experts
371
380
  )
372
381
  == gpu_id
373
382
  ]
374
383
  if len(same_gpu_physical_expert_ids) > 0:
384
+ # 1. Prefer same-GPU experts
375
385
  output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
376
-
386
+ else:
387
+ # 2. Otherwise, prefer same-node experts
388
+ node_id = gpu_id // num_gpus_per_node
389
+ same_node_physical_expert_ids = [
390
+ physical_expert_id
391
+ for physical_expert_id in candidate_physical_expert_ids
392
+ if _compute_node_id_of_physical_expert(
393
+ physical_expert_id, num_local_node_physical_experts
394
+ )
395
+ == node_id
396
+ ]
397
+ if len(same_node_physical_expert_ids) > 0:
398
+ output_partial[gpu_id] = same_node_physical_expert_ids[0]
399
+
400
+ # 3. Fill remaining slots with fair random choices
377
401
  num_remain = torch.sum(output_partial == -1).item()
378
402
  output_partial[output_partial == -1] = torch.tensor(
379
403
  _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
@@ -399,9 +423,15 @@ def _logical_to_all_physical_raw(
399
423
 
400
424
 
401
425
  def _compute_gpu_id_of_physical_expert(
402
- physical_expert_id: int, num_local_physical_experts: int
426
+ physical_expert_id: int, num_local_gpu_physical_experts: int
427
+ ) -> int:
428
+ return physical_expert_id // num_local_gpu_physical_experts
429
+
430
+
431
+ def _compute_node_id_of_physical_expert(
432
+ physical_expert_id: int, num_local_host_physical_experts: int
403
433
  ) -> int:
404
- return physical_expert_id // num_local_physical_experts
434
+ return physical_expert_id // num_local_host_physical_experts
405
435
 
406
436
 
407
437
  def _fair_choices(arr: List, k: int, r: random.Random) -> List:
@@ -47,7 +47,7 @@ class ExpertLocationUpdater:
47
47
  ):
48
48
  if self._first_execution:
49
49
  self._first_execution = False
50
- torch.cuda.empty_cache()
50
+ torch.get_device_module().empty_cache()
51
51
 
52
52
  old_expert_location_metadata = get_global_expert_location_metadata()
53
53
  assert old_expert_location_metadata is not None
@@ -162,12 +162,9 @@ class BaseFormatDetector(ABC):
162
162
 
163
163
  try:
164
164
  try:
165
- if current_text.startswith(self.bot_token):
166
- start_idx = len(self.bot_token)
167
- elif self.current_tool_id > 0 and current_text.startswith(
168
- self.tool_call_separator + self.bot_token
169
- ):
170
- start_idx = len(self.tool_call_separator + self.bot_token)
165
+ tool_call_pos = current_text.find(self.bot_token)
166
+ if tool_call_pos != -1:
167
+ start_idx = tool_call_pos + len(self.bot_token)
171
168
  elif self.current_tool_id > 0 and current_text.startswith(
172
169
  self.tool_call_separator
173
170
  ):