sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/srt/environ.py ADDED
@@ -0,0 +1,285 @@
1
+ import os
2
+ import subprocess
3
+ import warnings
4
+ from contextlib import ExitStack, contextmanager
5
+ from typing import Any
6
+
7
+
8
class EnvField:
    """Typed accessor for a single environment variable.

    Instances are meant to be declared as class attributes; ``__set_name__``
    records the attribute name, which is also used as the environment
    variable name. Subclasses implement :meth:`parse` to convert the raw
    string from ``os.environ`` into the field's type.
    """

    def __init__(self, default: Any):
        self.default = default
        # NOTE: a missing environment variable means "not set", so an explicit
        # ``set(None)`` cannot be represented in os.environ. It is tracked with
        # this flag instead. Always use clear() to reset the value, which
        # leads to the default fallback.
        self._set_to_none = False

    def __set_name__(self, owner, name):
        # The attribute name doubles as the environment variable name.
        self.name = name

    def parse(self, value: str) -> Any:
        """Convert the raw string ``value``; raise ``ValueError`` if invalid."""
        raise NotImplementedError()

    def get(self) -> Any:
        """Return the current value of the field.

        Returns ``None`` if the field was explicitly set to ``None``, the
        default if the variable is absent, and the parsed value otherwise.
        If parsing raises ``ValueError`` a warning is emitted and the default
        is returned.
        """
        value = os.getenv(self.name)
        if self._set_to_none:
            # set(None) removes the variable, so it must not be present here.
            assert value is None
            return None

        if value is None:
            return self.default

        try:
            return self.parse(value)
        except ValueError as e:
            warnings.warn(
                f'Invalid value for {self.name}: {e}, using default "{self.default}"'
            )
            return self.default

    def is_set(self):
        """Return True if the variable is present or was set to ``None``."""
        # NOTE: If None is manually set, it is considered as set.
        return self.name in os.environ or self._set_to_none

    def get_set_value_or(self, or_value: Any):
        """Return ``get()`` if the field is set, else ``or_value``."""
        # NOTE: Ugly usage, but only way to get custom default value.
        return self.get() if self.is_set() else or_value

    def set(self, value: Any):
        """Set the field; ``None`` removes the variable and marks it as None."""
        if value is None:
            self._set_to_none = True
            os.environ.pop(self.name, None)
        else:
            self._set_to_none = False
            os.environ[self.name] = str(value)

    @contextmanager
    def override(self, value: Any):
        """Temporarily set the field to ``value`` for the ``with`` body.

        The previous state (presence in ``os.environ``, raw value and the
        set-to-None flag) is restored on exit, including when the body raises.
        """
        backup_present = self.name in os.environ
        backup_value = os.environ.get(self.name)
        backup_set_to_none = self._set_to_none
        self.set(value)
        try:
            yield
        finally:
            # Restore in ``finally`` so an exception in the body cannot leak
            # the overridden value into the rest of the process.
            if backup_present:
                os.environ[self.name] = backup_value
            else:
                os.environ.pop(self.name, None)
            self._set_to_none = backup_set_to_none

    def clear(self):
        """Remove the variable and the set-to-None flag (back to default)."""
        os.environ.pop(self.name, None)
        self._set_to_none = False

    @property
    def value(self):
        """Shorthand for :meth:`get`."""
        return self.get()
75
+
76
+
77
class EnvStr(EnvField):
    """Environment field whose raw string is used as the value unchanged."""

    def parse(self, value: str) -> str:
        # No conversion is needed for strings.
        return value
80
+
81
+
82
class EnvBool(EnvField):
    """Environment field parsed as a boolean.

    Matching is case-insensitive. "true", "1", "yes" and "y" map to True;
    "false", "0", "no" and "n" map to False; anything else raises
    ``ValueError``.
    """

    def parse(self, value: str) -> bool:
        lowered = value.lower()
        if lowered in ("true", "1", "yes", "y"):
            return True
        if lowered in ("false", "0", "no", "n"):
            return False
        # The message reports the lower-cased text, not the raw input.
        raise ValueError(f'"{lowered}" is not a valid boolean value')
90
+
91
+
92
class EnvInt(EnvField):
    """Environment field parsed with ``int()``."""

    def parse(self, value: str) -> int:
        """Return ``int(value)``; raise ``ValueError`` naming the bad text."""
        try:
            return int(value)
        except ValueError:
            # The replacement message already names the offending text, so the
            # original exception is suppressed explicitly rather than chained
            # implicitly.
            raise ValueError(f'"{value}" is not a valid integer value') from None
98
+
99
+
100
class EnvFloat(EnvField):
    """Environment field parsed with ``float()``."""

    def parse(self, value: str) -> float:
        """Return ``float(value)``; raise ``ValueError`` naming the bad text."""
        try:
            return float(value)
        except ValueError:
            # The replacement message already names the offending text, so the
            # original exception is suppressed explicitly rather than chained
            # implicitly.
            raise ValueError(f'"{value}" is not a valid float value') from None
106
+
107
+
108
class Envs:
    """Declarations of the environment variables read through this module.

    Each attribute is an ``EnvField`` subclass instance constructed with its
    default value; the attribute name is the environment variable name.
    Names are part of the external interface and are kept verbatim
    (including the ``INBALANCE`` spelling).
    """

    # fmt: off

    # Model & File Download
    SGLANG_USE_MODELSCOPE = EnvBool(False)

    # Test & Debug
    SGLANG_IS_IN_CI = EnvBool(False)
    SGLANG_AMD_CI = EnvBool(False)
    SGLANG_TEST_RETRACT = EnvBool(False)
    SGLANG_SET_CPU_AFFINITY = EnvBool(False)
    SGLANG_PROFILE_WITH_STACK = EnvBool(True)
    SGLANG_RECORD_STEP_TIME = EnvBool(False)
    SGLANG_GC_LOG = EnvBool(False)
    SGLANG_FORCE_SHUTDOWN = EnvBool(False)
    SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
    SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
    SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
    SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
    SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")

    # Model Parallel
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)

    # Constrained Decoding
    SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
    SGLANG_GRAMMAR_TIMEOUT = EnvFloat(300)

    # Hi-Cache
    SGLANG_HICACHE_HF3FS_CONFIG_PATH = EnvStr(None)

    # Mooncake KV Transfer
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvBool(False)
    ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False)

    # AMD & ROCm
    SGLANG_USE_AITER = EnvBool(False)
    SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)

    # Quantization
    SGLANG_INT4_WEIGHT = EnvBool(False)
    SGLANG_CPU_QUANTIZATION = EnvBool(False)
    SGLANG_USE_DYNAMIC_MXFP4_LINEAR = EnvBool(False)
    SGLANG_FORCE_FP8_MARLIN = EnvBool(False)

    # Flashinfer
    SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
    SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)

    # Triton
    SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)

    # Torch Compile
    SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)

    # EPLB
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
    SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)

    # TBO
    SGLANG_TBO_DEBUG = EnvBool(False)

    # DeepGemm
    SGLANG_ENABLE_JIT_DEEPGEMM = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_PRECOMPILE = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS = EnvInt(4)
    SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE = EnvBool(False)
    SGLANG_DG_CACHE_DIR = EnvStr(os.path.expanduser("~/.cache/deep_gemm"))
    SGLANG_DG_USE_NVRTC = EnvBool(False)
    SGLANG_USE_DEEPGEMM_BMM = EnvBool(False)

    # sgl-kernel
    SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)

    # vLLM dependencies
    USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
    USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)

    # Ungrouped flags (no section heading in the original declaration list)
    USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
    RETURN_ORIGINAL_LOGPROB = EnvBool(False)
    SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
    SGLANG_MOE_PADDING = EnvBool(False)
    SGLANG_CUTLASS_MOE = EnvBool(False)
    HF_HUB_DISABLE_XET = EnvBool(False)
    DISABLE_OPENAPI_DOC = EnvBool(False)
    SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
    SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
    SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
    SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
    SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)

    # Deterministic inference
    SGLANG_ENABLE_DETERMINISTIC_INFERENCE = EnvBool(False)
    SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE = EnvInt(4096)
    SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE = EnvInt(2048)
    SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)

    # fmt: on


# Module-level instance through which the fields are accessed.
envs = Envs()
214
+
215
+
216
def _convert_SGL_to_SGLANG():
    """Copy deprecated ``SGL_*`` environment variables to ``SGLANG_*``.

    For every variable whose name starts with ``SGL_``, a deprecation warning
    is emitted and the value is written under the same name with the prefix
    replaced by ``SGLANG_``. The old variable is left in place, and an
    existing ``SGLANG_*`` value is overwritten.
    """
    # Iterate over a snapshot: the loop body inserts keys into os.environ, and
    # a mapping should not be mutated while its live view is being iterated.
    for key, value in list(os.environ.items()):
        if key.startswith("SGL_"):
            new_key = key.replace("SGL_", "SGLANG_", 1)
            warnings.warn(
                f"Environment variable {key} is deprecated, please use {new_key}"
            )
            os.environ[new_key] = value


# Runs once at import time so later reads see the SGLANG_* names.
_convert_SGL_to_SGLANG()
227
+
228
+
229
def example_with_exit_stack():
    """Show how to apply an override through an explicit ``ExitStack``.

    The override is entered on the stack, checked, and undone by closing the
    stack; afterwards the field must report whatever it reported before.
    """
    # Use this style of context manager in unit test
    field = envs.SGLANG_TEST_RETRACT
    # Capture the prior value rather than assuming the caller left it at None,
    # so the example does not depend on what ran before it.
    previous = field.value
    exit_stack = ExitStack()
    try:
        exit_stack.enter_context(field.override(False))
        assert field.value is False
    finally:
        # Always undo the override, even if the assertion above fails.
        exit_stack.close()
    assert field.value is previous
236
+
237
+
238
def example_with_subprocess():
    """Show that an override is visible to child processes and then undone.

    A child interpreter prints ``SGLANG_TEST_RETRACT`` from its environment:
    "True" while the override is active, "None" once it has been removed.
    """
    import sys

    # Run the child with the current interpreter; a bare "python" may be
    # missing from PATH or resolve to a different installation.
    command = [
        sys.executable or "python",
        "-c",
        "import os; print(os.getenv('SGLANG_TEST_RETRACT'))",
    ]
    with envs.SGLANG_TEST_RETRACT.override(True):
        # subprocess.run waits for the child and closes its pipes.
        completed = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        output = completed.stdout.decode("utf-8").strip()
        assert output == "True"

    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output = completed.stdout.decode("utf-8").strip()
    assert output == "None"
251
+
252
+
253
def examples():
    """Walk through clear/set/override on ``SGLANG_TEST_RETRACT``.

    Each step is followed by an assertion on the resulting state, then the
    ExitStack and subprocess examples are run.
    """
    # Example usage for envs
    retract = envs.SGLANG_TEST_RETRACT

    # Cleared field falls back to its default.
    retract.clear()
    assert retract.value is False

    # Explicit None counts as "set".
    retract.set(None)
    assert retract.is_set() and retract.value is None

    retract.clear()
    assert not retract.is_set()

    retract.set(True)
    assert retract.value is True

    # Overriding with None, then returning to the previous True.
    with retract.override(None):
        assert (
            retract.is_set() and retract.value is None
        )

    assert retract.value is True

    # Overriding a None-valued field, then returning to None.
    retract.set(None)
    with retract.override(True):
        assert retract.value is True

    assert retract.is_set() and retract.value is None

    example_with_exit_stack()
    example_with_subprocess()


if __name__ == "__main__":
    examples()
@@ -55,7 +55,7 @@ class EPLBManager:
55
55
  enable_timing = self._rebalance_layers_per_chunk is None
56
56
 
57
57
  if enable_timing:
58
- torch.cuda.synchronize()
58
+ torch.get_device_module().synchronize()
59
59
  time_start = time.time()
60
60
 
61
61
  dump_record_output = get_global_expert_distribution_recorder().dump_record(
@@ -85,7 +85,7 @@ class EPLBManager:
85
85
 
86
86
  msg = f"[EPLBManager] rebalance end"
87
87
  if enable_timing:
88
- torch.cuda.synchronize()
88
+ torch.get_device_module().synchronize()
89
89
  time_end = time.time()
90
90
  msg += f" time={time_end - time_start:.3f}s"
91
91
  logger.info(msg)
@@ -11,6 +11,9 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
+
15
+ from __future__ import annotations
16
+
14
17
  import logging
15
18
  import math
16
19
  import os
@@ -19,16 +22,20 @@ from abc import ABC
19
22
  from collections import deque
20
23
  from contextlib import contextmanager
21
24
  from pathlib import Path
22
- from typing import Any, Dict, List, Literal, Optional, Tuple, Type
25
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
23
26
 
24
27
  import einops
25
28
  import torch
26
29
  import torch.distributed
27
30
 
28
- from sglang.srt.eplb.expert_location import ExpertLocationMetadata
29
31
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
30
32
  from sglang.srt.server_args import ServerArgs
31
- from sglang.srt.utils import Withable, get_bool_env_var
33
+ from sglang.srt.utils import Withable, get_bool_env_var, is_npu
34
+
35
+ _is_npu = is_npu()
36
+
37
+ if TYPE_CHECKING:
38
+ from sglang.srt.eplb.expert_location import ExpertLocationMetadata
32
39
 
33
40
  logger = logging.getLogger(__name__)
34
41
 
@@ -43,7 +50,7 @@ class ExpertDistributionRecorder(ABC):
43
50
  @staticmethod
44
51
  def init_new(
45
52
  server_args: ServerArgs,
46
- expert_location_metadata: "ExpertLocationMetadata",
53
+ expert_location_metadata: ExpertLocationMetadata,
47
54
  rank: int,
48
55
  ):
49
56
  if server_args.expert_distribution_recorder_mode is not None:
@@ -118,7 +125,7 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
118
125
  def __init__(
119
126
  self,
120
127
  server_args: ServerArgs,
121
- expert_location_metadata: "ExpertLocationMetadata",
128
+ expert_location_metadata: ExpertLocationMetadata,
122
129
  rank: int,
123
130
  ):
124
131
  self._server_args = server_args
@@ -211,7 +218,9 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
211
218
  def _on_hook(self, hook_name: str, **kwargs):
212
219
  if self._disable_all:
213
220
  return
214
- if not (self._recording or torch.cuda.is_current_stream_capturing()):
221
+ if not (
222
+ self._recording or torch.get_device_module().is_current_stream_capturing()
223
+ ):
215
224
  return
216
225
  gatherer = self._single_pass_gatherers[
217
226
  self._accumulator.get_single_pass_gatherer_key(
@@ -279,7 +288,7 @@ class _SinglePassGatherer(ABC):
279
288
  @staticmethod
280
289
  def init_new(
281
290
  server_args: ServerArgs,
282
- expert_location_metadata: "ExpertLocationMetadata",
291
+ expert_location_metadata: ExpertLocationMetadata,
283
292
  rank: int,
284
293
  ) -> "_SinglePassGatherer":
285
294
  if server_args.expert_distribution_recorder_mode == "per_token":
@@ -307,7 +316,7 @@ class _SinglePassGatherer(ABC):
307
316
 
308
317
  return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
309
318
 
310
- def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
319
+ def __init__(self, expert_location_metadata: ExpertLocationMetadata, rank: int):
311
320
  self._expert_location_metadata = expert_location_metadata
312
321
  self._rank = rank
313
322
 
@@ -346,7 +355,7 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
346
355
  def __init__(
347
356
  self,
348
357
  server_args: ServerArgs,
349
- expert_location_metadata: "ExpertLocationMetadata",
358
+ expert_location_metadata: ExpertLocationMetadata,
350
359
  rank: int,
351
360
  ):
352
361
  super().__init__(expert_location_metadata, rank)
@@ -446,6 +455,10 @@ def _list_sum(a: List, b: List) -> List:
446
455
  class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
447
456
  def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
448
457
  super().__init__(*args, **kwargs)
458
+ if not _is_npu:
459
+ device = "cuda"
460
+ else:
461
+ device = "npu"
449
462
  self._enable_global_physical_experts = enable_global_physical_experts
450
463
  self._data = torch.zeros(
451
464
  (
@@ -457,7 +470,7 @@ class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
457
470
  ),
458
471
  ),
459
472
  dtype=torch.int,
460
- device="cuda",
473
+ device=device,
461
474
  )
462
475
 
463
476
  def reset(self):
@@ -561,7 +574,7 @@ class _Accumulator(ABC):
561
574
  @staticmethod
562
575
  def init_new(
563
576
  server_args: ServerArgs,
564
- expert_location_metadata: "ExpertLocationMetadata",
577
+ expert_location_metadata: ExpertLocationMetadata,
565
578
  rank: int,
566
579
  ) -> "_Accumulator":
567
580
  return _Accumulator.get_class(server_args)(
@@ -580,7 +593,7 @@ class _Accumulator(ABC):
580
593
  def __init__(
581
594
  self,
582
595
  server_args: ServerArgs,
583
- expert_location_metadata: "ExpertLocationMetadata",
596
+ expert_location_metadata: ExpertLocationMetadata,
584
597
  rank: int,
585
598
  ):
586
599
  self._server_args = server_args
@@ -779,7 +792,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
779
792
 
780
793
  if self._first_dump:
781
794
  self._first_dump = False
782
- torch.cuda.empty_cache()
795
+ torch.get_device_module().empty_cache()
783
796
 
784
797
  torch.distributed.all_reduce(
785
798
  logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
@@ -11,21 +11,26 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
+
15
+ from __future__ import annotations
16
+
14
17
  import json
15
18
  import logging
16
19
  import random
17
20
  from dataclasses import dataclass
18
21
  from pathlib import Path
19
- from typing import List, Optional
22
+ from typing import TYPE_CHECKING, List, Optional
20
23
 
21
24
  import torch
22
25
  import torch.distributed
23
26
  import torch.nn.functional as F
24
27
 
25
- from sglang.srt.configs.model_config import ModelConfig
26
28
  from sglang.srt.eplb import eplb_algorithms
27
29
  from sglang.srt.model_loader import get_model_architecture
28
- from sglang.srt.server_args import ServerArgs
30
+
31
+ if TYPE_CHECKING:
32
+ from sglang.srt.configs.model_config import ModelConfig
33
+ from sglang.srt.server_args import ServerArgs
29
34
 
30
35
  logger = logging.getLogger(__name__)
31
36
 
@@ -226,6 +231,7 @@ class ExpertLocationMetadata:
226
231
  logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
227
232
  logical_to_rank_dispatch_physical_map=(
228
233
  compute_logical_to_rank_dispatch_physical_map(
234
+ server_args=server_args,
229
235
  logical_to_all_physical_map=logical_to_all_physical_map,
230
236
  num_gpus=ep_size,
231
237
  num_physical_experts=num_physical_experts,
@@ -335,6 +341,7 @@ def _pad_nested_array(arr, pad_value):
335
341
 
336
342
  # TODO optimize performance (rewrite and/or run in separate process with overlap)
337
343
  def compute_logical_to_rank_dispatch_physical_map(
344
+ server_args: ServerArgs,
338
345
  logical_to_all_physical_map: torch.Tensor,
339
346
  num_gpus: int,
340
347
  num_physical_experts: int,
@@ -343,7 +350,9 @@ def compute_logical_to_rank_dispatch_physical_map(
343
350
  ):
344
351
  r = random.Random(seed)
345
352
 
346
- num_local_physical_experts = num_physical_experts // num_gpus
353
+ num_local_gpu_physical_experts = num_physical_experts // num_gpus
354
+ num_gpus_per_node = server_args.ep_size // server_args.nnodes
355
+ num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
347
356
  num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
348
357
  dtype = logical_to_all_physical_map.dtype
349
358
 
@@ -367,13 +376,28 @@ def compute_logical_to_rank_dispatch_physical_map(
367
376
  physical_expert_id
368
377
  for physical_expert_id in candidate_physical_expert_ids
369
378
  if _compute_gpu_id_of_physical_expert(
370
- physical_expert_id, num_local_physical_experts
379
+ physical_expert_id, num_local_gpu_physical_experts
371
380
  )
372
381
  == gpu_id
373
382
  ]
374
383
  if len(same_gpu_physical_expert_ids) > 0:
384
+ # 1. Prefer same-GPU experts
375
385
  output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
376
-
386
+ else:
387
+ # 2. Otherwise, prefer same-node experts
388
+ node_id = gpu_id // num_gpus_per_node
389
+ same_node_physical_expert_ids = [
390
+ physical_expert_id
391
+ for physical_expert_id in candidate_physical_expert_ids
392
+ if _compute_node_id_of_physical_expert(
393
+ physical_expert_id, num_local_node_physical_experts
394
+ )
395
+ == node_id
396
+ ]
397
+ if len(same_node_physical_expert_ids) > 0:
398
+ output_partial[gpu_id] = same_node_physical_expert_ids[0]
399
+
400
+ # 3. Fill remaining slots with fair random choices
377
401
  num_remain = torch.sum(output_partial == -1).item()
378
402
  output_partial[output_partial == -1] = torch.tensor(
379
403
  _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
@@ -399,9 +423,15 @@ def _logical_to_all_physical_raw(
399
423
 
400
424
 
401
425
def _compute_gpu_id_of_physical_expert(
    physical_expert_id: int, num_local_gpu_physical_experts: int
) -> int:
    """Map a physical expert id to a GPU id by floor division.

    Args:
        physical_expert_id: Id of the physical expert.
        num_local_gpu_physical_experts: Number of physical experts per GPU.

    Returns:
        ``physical_expert_id // num_local_gpu_physical_experts``.
    """
    gpu_id = physical_expert_id // num_local_gpu_physical_experts
    return gpu_id
429
+
430
+
431
def _compute_node_id_of_physical_expert(
    physical_expert_id: int, num_local_host_physical_experts: int
) -> int:
    """Map a physical expert id to a node id by floor division.

    Args:
        physical_expert_id: Id of the physical expert.
        num_local_host_physical_experts: Number of physical experts per node
            ("host" here means the same thing callers refer to as "node").

    Returns:
        ``physical_expert_id // num_local_host_physical_experts``.
    """
    node_id = physical_expert_id // num_local_host_physical_experts
    return node_id
405
435
 
406
436
 
407
437
  def _fair_choices(arr: List, k: int, r: random.Random) -> List:
@@ -47,7 +47,7 @@ class ExpertLocationUpdater:
47
47
  ):
48
48
  if self._first_execution:
49
49
  self._first_execution = False
50
- torch.cuda.empty_cache()
50
+ torch.get_device_module().empty_cache()
51
51
 
52
52
  old_expert_location_metadata = get_global_expert_location_metadata()
53
53
  assert old_expert_location_metadata is not None
@@ -162,12 +162,9 @@ class BaseFormatDetector(ABC):
162
162
 
163
163
  try:
164
164
  try:
165
- if current_text.startswith(self.bot_token):
166
- start_idx = len(self.bot_token)
167
- elif self.current_tool_id > 0 and current_text.startswith(
168
- self.tool_call_separator + self.bot_token
169
- ):
170
- start_idx = len(self.tool_call_separator + self.bot_token)
165
+ tool_call_pos = current_text.find(self.bot_token)
166
+ if tool_call_pos != -1:
167
+ start_idx = tool_call_pos + len(self.bot_token)
171
168
  elif self.current_tool_id > 0 and current_text.startswith(
172
169
  self.tool_call_separator
173
170
  ):
@@ -50,19 +50,19 @@ class EBNFComposer:
50
50
 
51
51
  CALL_RULE_MAP = {
52
52
  "pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
53
- "json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ", " "\\"arguments\\"" ":" {arguments_rule} "}}"',
53
+ "json": 'call_{name} ::= "{{" ws "\\"name\\"" ws ":" ws "\\"{name}\\"" ws "," ws "\\"arguments\\"" ws ":" ws {arguments_rule} ws "}}"',
54
54
  "xml": 'call_{name} ::= "<function={name}>\\n" {arguments_rule} "\\n</function>"',
55
55
  }
56
56
 
57
57
  ARGUMENTS_RULE_MAP = {
58
58
  "pythonic": "{arg_rules}",
59
- "json": '"{{" {arg_rules} "}}"',
59
+ "json": '"{{" ws {arg_rules} ws "}}"',
60
60
  "xml": "{arg_rules}",
61
61
  }
62
62
 
63
63
  KEY_VALUE_RULE_MAP = {
64
64
  "pythonic": '"{key}" "=" {valrule}',
65
- "json": '"\\"{key}\\"" ":" {valrule}',
65
+ "json": '"\\"{key}\\"" ws ":" ws {valrule}',
66
66
  "xml": '"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
67
67
  }
68
68
 
@@ -165,7 +165,7 @@ class EBNFComposer:
165
165
  tool_call_separator: Optional[str] = None,
166
166
  call_rule_fmt: Optional[str] = None,
167
167
  key_value_rule_fmt: Optional[str] = None,
168
- key_value_separator: str = ",",
168
+ key_value_separator: str = 'ws "," ws',
169
169
  ):
170
170
  """
171
171
  Generalized EBNF builder for all detectors.
@@ -183,6 +183,10 @@ class EBNFComposer:
183
183
  key_value_rule_fmt: Optional custom format string for key-value pairs. It should define how each parameter is formatted,
184
184
  with placeholders {key} for the parameter name and {valrule} for the value rule. If None, a default format
185
185
  based on function_format will be used.
186
+ key_value_separator: Raw EBNF fragment inserted between key-value pairs.
187
+ This string is used verbatim (not auto-quoted). Pass:
188
+ - Quoted terminals when you need a literal token (e.g. '","' or '"\\n"').
189
+ - Raw/non-terminals when you need grammar tokens (e.g. 'ws "," ws').
186
190
  """
187
191
  # =================================================================
188
192
  # Step 1: Determine the root tool calls rule
@@ -281,9 +285,7 @@ class EBNFComposer:
281
285
  # Add required properties joined by commas
282
286
  if required:
283
287
  rule_parts.append(
284
- f' "{key_value_separator}" '.join(
285
- prop_kv_pairs[k] for k in required
286
- )
288
+ f" {key_value_separator} ".join(prop_kv_pairs[k] for k in required)
287
289
  )
288
290
 
289
291
  # Add optional properties with flexible ordering
@@ -298,14 +300,14 @@ class EBNFComposer:
298
300
  opt_parts.append(prop_kv_pairs[optional[j]])
299
301
  else:
300
302
  opt_parts.append(
301
- f' ( "{key_value_separator}" {prop_kv_pairs[optional[j]]} )?'
303
+ f" ( {key_value_separator} {prop_kv_pairs[optional[j]]} )?"
302
304
  )
303
305
  opt_alternatives.append("".join(opt_parts))
304
306
 
305
307
  # Wrap with appropriate comma handling based on whether we have required properties
306
308
  if required:
307
309
  # Required properties exist, so optional group needs outer comma
308
- rule_parts.append(f' ( "{key_value_separator}" ( ')
310
+ rule_parts.append(f" ( {key_value_separator} ( ")
309
311
  rule_parts.append(" | ".join(opt_alternatives))
310
312
  rule_parts.append(" ) )?")
311
313
  else:
@@ -20,6 +20,7 @@ from sglang.srt.function_call.pythonic_detector import PythonicDetector
20
20
  from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
21
21
  from sglang.srt.function_call.qwen25_detector import Qwen25Detector
22
22
  from sglang.srt.function_call.step3_detector import Step3Detector
23
+ from sglang.srt.function_call.utils import get_json_schema_constraint
23
24
 
24
25
  logger = logging.getLogger(__name__)
25
26
 
@@ -69,6 +70,8 @@ class FunctionCallParser:
69
70
  Returns:
70
71
  True if the text contains a tool call, False otherwise
71
72
  """
73
+ if not self.tools:
74
+ return False
72
75
  return self.detector.has_tool_call(text)
73
76
 
74
77
  def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
@@ -83,6 +86,8 @@ class FunctionCallParser:
83
86
  - The remaining text after parsing that was not consumed by the detector (can be treated as normal text)
84
87
  - A list of tool calls parsed from the text
85
88
  """
89
+ if not self.tools:
90
+ return full_text, []
86
91
  parsed_result = self.detector.detect_and_parse(full_text, self.tools)
87
92
  tool_call_list = parsed_result.calls
88
93
  if tool_call_list:
@@ -102,6 +107,8 @@ class FunctionCallParser:
102
107
  - The normal text that should be displayed to the user
103
108
  - A list of tool calls parsed from the chunk
104
109
  """
110
+ if not self.tools:
111
+ return chunk_text, []
105
112
  final_normal_text = ""
106
113
  final_calls = []
107
114
 
@@ -172,8 +179,8 @@ class FunctionCallParser:
172
179
  strict_tag = self.get_structure_tag()
173
180
  return ("structural_tag", strict_tag)
174
181
  elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
175
- ebnf = self.get_ebnf(tool_choice)
176
- return ("ebnf", ebnf) if ebnf is not None else None
182
+ json_schema = get_json_schema_constraint(self.tools, tool_choice)
183
+ return ("json_schema", json_schema)
177
184
 
178
185
  def get_ebnf(
179
186
  self, tool_choice: Union[ToolChoice, Literal["required"]]