sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/environ.py ADDED
@@ -0,0 +1,323 @@
+ import os
+ import subprocess
+ import warnings
+ from contextlib import ExitStack, contextmanager
+ from typing import Any
+
+
+ class EnvField:
+     def __init__(self, default: Any):
+         self.default = default
+         # NOTE: we use None to indicate whether the value is set or not
+         # If the value is manually set to None, we need mark it as _set_to_none.
+         # Always use clear() to reset the value, which leads to the default fallback.
+         self._set_to_none = False
+
+     def __set_name__(self, owner, name):
+         self.name = name
+
+     def parse(self, value: str) -> Any:
+         raise NotImplementedError()
+
+     def get(self) -> Any:
+         value = os.getenv(self.name)
+         if self._set_to_none:
+             assert value is None
+             return None
+
+         if value is None:
+             return self.default
+
+         try:
+             return self.parse(value)
+         except ValueError as e:
+             warnings.warn(
+                 f'Invalid value for {self.name}: {e}, using default "{self.default}"'
+             )
+             return self.default
+
+     def is_set(self):
+         # NOTE: If None is manually set, it is considered as set.
+         return self.name in os.environ or self._set_to_none
+
+     def get_set_value_or(self, or_value: Any):
+         # NOTE: Ugly usage, but only way to get custom default value.
+         return self.get() if self.is_set() else or_value
+
+     def set(self, value: Any):
+         if value is None:
+             self._set_to_none = True
+             os.environ.pop(self.name, None)
+         else:
+             self._set_to_none = False
+             os.environ[self.name] = str(value)
+
+     @contextmanager
+     def override(self, value: Any):
+         backup_present = self.name in os.environ
+         backup_value = os.environ.get(self.name)
+         backup_set_to_none = self._set_to_none
+         self.set(value)
+         yield
+         if backup_present:
+             os.environ[self.name] = backup_value
+         else:
+             os.environ.pop(self.name, None)
+         self._set_to_none = backup_set_to_none
+
+     def clear(self):
+         os.environ.pop(self.name, None)
+         self._set_to_none = False
+
+     @property
+     def value(self):
+         return self.get()
+
+
+ class EnvStr(EnvField):
+     def parse(self, value: str) -> str:
+         return value
+
+
+ class EnvBool(EnvField):
+     def parse(self, value: str) -> bool:
+         value = value.lower()
+         if value in ["true", "1", "yes", "y"]:
+             return True
+         if value in ["false", "0", "no", "n"]:
+             return False
+         raise ValueError(f'"{value}" is not a valid boolean value')
+
+
+ class EnvInt(EnvField):
+     def parse(self, value: str) -> int:
+         try:
+             return int(value)
+         except ValueError:
+             raise ValueError(f'"{value}" is not a valid integer value')
+
+
+ class EnvFloat(EnvField):
+     def parse(self, value: str) -> float:
+         try:
+             return float(value)
+         except ValueError:
+             raise ValueError(f'"{value}" is not a valid float value')
+
+
+ class Envs:
+     # fmt: off
+
+     # Model & File Download
+     SGLANG_USE_MODELSCOPE = EnvBool(False)
+
+     # Test & Debug
+     SGLANG_IS_IN_CI = EnvBool(False)
+     SGLANG_IS_IN_CI_AMD = EnvBool(False)
+     SGLANG_SET_CPU_AFFINITY = EnvBool(False)
+     SGLANG_PROFILE_WITH_STACK = EnvBool(True)
+     SGLANG_RECORD_STEP_TIME = EnvBool(False)
+     SGLANG_GC_LOG = EnvBool(False)
+     SGLANG_FORCE_SHUTDOWN = EnvBool(False)
+     SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
+     SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
+     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
+     SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
+     SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
+     SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
+     SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
+
+     # Scheduler: memory leak test
+     SGLANG_TEST_RETRACT = EnvBool(False)
+     SGLANG_TEST_RETRACT_INTERVAL = EnvInt(3)
+     SGLANG_ENABLE_RUNTIME_MEM_LEAK_CHECK = EnvBool(False)
+
+     # Scheduler: new token ratio hyperparameters
+     SGLANG_INIT_NEW_TOKEN_RATIO = EnvFloat(0.7)
+     SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR = EnvFloat(0.14)
+     SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS = EnvInt(600)
+     SGLANG_RETRACT_DECODE_STEPS = EnvInt(20)
+
+     # Scheduler: others:
+     SGLANG_EMPTY_CACHE_INTERVAL = EnvFloat(-1)  # in seconds. Set if you observe high memory accumulation over a long serving period.
+     # Test: pd-disaggregation
+     SGLANG_TEST_PD_DISAGG_BACKEND = EnvStr("mooncake")
+     SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None)
+
+     # Model Parallel
+     SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)
+     SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS = EnvBool(False)
+
+     # Constrained Decoding
+     SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
+     SGLANG_GRAMMAR_TIMEOUT = EnvFloat(300)
+
+     # Hi-Cache
+     SGLANG_HICACHE_HF3FS_CONFIG_PATH = EnvStr(None)
+
+     # Mooncake KV Transfer
+     SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvBool(False)
+     ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False)
+
+     # AMD & ROCm
+     SGLANG_USE_AITER = EnvBool(False)
+     SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)
+     SGLANG_ROCM_DISABLE_LINEARQUANT = EnvBool(False)
+
+     # Quantization
+     SGLANG_INT4_WEIGHT = EnvBool(False)
+     SGLANG_CPU_QUANTIZATION = EnvBool(False)
+     SGLANG_USE_DYNAMIC_MXFP4_LINEAR = EnvBool(False)
+     SGLANG_FORCE_FP8_MARLIN = EnvBool(False)
+
+     # Flashinfer
+     SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
+     SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)
+     SGLANG_FLASHINFER_WORKSPACE_SIZE = EnvInt(384 * 1024 * 1024)
+
+     # Triton
+     SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
+
+     # Torch Compile
+     SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)
+
+     # EPLB
+     SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT = EnvBool(False)
+     SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
+     SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
+     SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)
+     SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR = EnvStr("/tmp")
+
+     # TBO
+     SGLANG_TBO_DEBUG = EnvBool(False)
+
+     # DeepGemm
+     SGLANG_ENABLE_JIT_DEEPGEMM = EnvBool(True)
+     SGLANG_JIT_DEEPGEMM_PRECOMPILE = EnvBool(True)
+     SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS = EnvInt(4)
+     SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE = EnvBool(False)
+     SGLANG_DG_CACHE_DIR = EnvStr(os.path.expanduser("~/.cache/deep_gemm"))
+     SGLANG_DG_USE_NVRTC = EnvBool(False)
+     SGLANG_USE_DEEPGEMM_BMM = EnvBool(False)
+
+     # sgl-kernel
+     SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)
+
+     # vLLM dependencies (TODO: they have been deprecated, we can remove them safely)
+     USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
+     USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)
+
+     USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
+     SGLANG_RETURN_ORIGINAL_LOGPROB = EnvBool(False)
+     SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
+     SGLANG_MOE_PADDING = EnvBool(False)
+     SGLANG_CUTLASS_MOE = EnvBool(False)
+     HF_HUB_DISABLE_XET = EnvBool(False)
+     DISABLE_OPENAPI_DOC = EnvBool(False)
+     SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
+     SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
+     SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
+     SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
+     SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)
+
+     # Deterministic inference
+     SGLANG_ENABLE_DETERMINISTIC_INFERENCE = EnvBool(False)
+     SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE = EnvInt(4096)
+     SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE = EnvInt(2048)
+     SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
+     SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
+
+     # Overlap Spec V2
+     SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False)
+
+     # VLM
+     SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
+     SGLANG_RESIZE_RESAMPLE = EnvStr("")
+
+     # Ktransformers
+     SGLANG_KT_MOE_NUM_GPU_EXPERTS = EnvInt(None)
+     SGLANG_KT_MOE_CPUINFER = EnvInt(None)
+     SGLANG_KT_THREADPOOL_COUNT = EnvInt(None)
+     SGLANG_KT_MOE_AMX_WEIGHT_PATH = EnvStr(None)
+     SGLANG_KT_AMX_METHOD = EnvStr(None)
+     SGLANG_KT_MOE_CHUNKED_PREFILL_SIZE = EnvInt(None)
+
+     # Sparse Embeddings
+     SGLANG_EMBEDDINGS_SPARSE_HEAD = EnvStr(None)
+
+     # fmt: on
+
+
+ envs = Envs()
+
+
+ def _convert_SGL_to_SGLANG():
+     for key, value in os.environ.items():
+         if key.startswith("SGL_"):
+             new_key = key.replace("SGL_", "SGLANG_", 1)
+             warnings.warn(
+                 f"Environment variable {key} is deprecated, please use {new_key}"
+             )
+             os.environ[new_key] = value
+
+
+ _convert_SGL_to_SGLANG()
+
+
+ def example_with_exit_stack():
+     # Use this style of context manager in unit test
+     exit_stack = ExitStack()
+     exit_stack.enter_context(envs.SGLANG_TEST_RETRACT.override(False))
+     assert envs.SGLANG_TEST_RETRACT.value is False
+     exit_stack.close()
+     assert envs.SGLANG_TEST_RETRACT.value is None
+
+
+ def example_with_subprocess():
+     command = ["python", "-c", "import os; print(os.getenv('SGLANG_TEST_RETRACT'))"]
+     with envs.SGLANG_TEST_RETRACT.override(True):
+         process = subprocess.Popen(
+             command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+         )
+         process.wait()
+         output = process.stdout.read().decode("utf-8").strip()
+         assert output == "True"
+
+     process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     output = process.stdout.read().decode("utf-8").strip()
+     assert output == "None"
+
+
+ def examples():
+     # Example usage for envs
+     envs.SGLANG_TEST_RETRACT.clear()
+     assert envs.SGLANG_TEST_RETRACT.value is False
+
+     envs.SGLANG_TEST_RETRACT.set(None)
+     assert envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
+
+     envs.SGLANG_TEST_RETRACT.clear()
+     assert not envs.SGLANG_TEST_RETRACT.is_set()
+
+     envs.SGLANG_TEST_RETRACT.set(True)
+     assert envs.SGLANG_TEST_RETRACT.value is True
+
+     with envs.SGLANG_TEST_RETRACT.override(None):
+         assert (
+             envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
+         )
+
+     assert envs.SGLANG_TEST_RETRACT.value is True
+
+     envs.SGLANG_TEST_RETRACT.set(None)
+     with envs.SGLANG_TEST_RETRACT.override(True):
+         assert envs.SGLANG_TEST_RETRACT.value is True
+
+     assert envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
+
+     example_with_exit_stack()
+     example_with_subprocess()
+
+
+ if __name__ == "__main__":
+     examples()
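The `envs` registry added above gives every SGLANG_* variable a typed field with a declared default, replacing scattered `os.environ` lookups (several of which are removed later in this diff). A minimal usage sketch based only on the API shown above:

    from sglang.srt.environ import envs

    # Typed read; falls back to the declared default when the variable is unset.
    workspace_bytes = envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.value

    # Temporary override (e.g. in a unit test); the previous os.environ state
    # is restored when the context manager exits.
    with envs.SGLANG_TEST_RETRACT.override(True):
        assert envs.SGLANG_TEST_RETRACT.value is True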
sglang/srt/eplb/eplb_algorithms/__init__.py CHANGED
@@ -3,7 +3,8 @@ from typing import Optional
  
  import torch
  
- from sglang.srt.eplb.eplb_algorithms import deepseek, deepseek_vec
+ from sglang.srt.elastic_ep.elastic_ep import ElasticEPStateManager
+ from sglang.srt.eplb.eplb_algorithms import deepseek, deepseek_vec, elasticity_aware
  
  
  class EplbAlgorithm(Enum):
@@ -11,6 +12,7 @@ class EplbAlgorithm(Enum):
      deepseek_hierarchical = auto()
      deepseek_vec = auto()
      deepseek_vec_hierarchical = auto()
+     elasticity_aware = auto()
      # TODO may have more algorithm later
  
  
@@ -45,6 +47,21 @@ def rebalance_experts(
              enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
          )
  
+     if algorithm == EplbAlgorithm.elasticity_aware:
+         return elasticity_aware.rebalance_experts(
+             weight=tokens_per_expert.sum(dim=0),
+             num_replicas=num_physical_experts,
+             num_groups=num_groups,
+             num_nodes=num_nodes,
+             num_gpus=num_physical_experts // num_local_physical_experts,
+             enable_hierarchical=True,
+             active_ranks=(
+                 ElasticEPStateManager.instance().active_ranks
+                 if ElasticEPStateManager.instance() is not None
+                 else ElasticEPStateManager.healthy_rank_state()
+             ),
+         )
+
      raise NotImplementedError
  
  
sglang/srt/eplb/eplb_algorithms/deepseek.py CHANGED
@@ -3,8 +3,6 @@ from typing import Tuple
  
  import torch
  
- from sglang.srt.utils import get_bool_env_var
-
  
  def balanced_packing(
      weight: torch.Tensor, num_packs: int
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py ADDED
@@ -0,0 +1,87 @@
+ from typing import Tuple
+
+ import torch
+
+ from sglang.srt.eplb.eplb_algorithms.deepseek import rebalance_experts_hierarchical
+
+
+ def rebalance_experts(
+     weight: torch.Tensor,
+     num_replicas: int,
+     num_groups: int,
+     num_nodes: int,
+     num_gpus: int,
+     enable_hierarchical: bool,
+     active_ranks: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Entry point for expert-parallelism load balancer.
+
+     Parameters:
+         weight: [layers, num_logical_experts], the load statistics for all logical experts
+         num_replicas: number of physical experts, must be a multiple of `num_gpus`
+         num_groups: number of expert groups
+         num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
+         num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+     Returns:
+         physical_to_logical_map: [layers, num_replicas], the expert index of each replica
+         logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
+         expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
+     """
+
+     num_layers, num_logical_experts = weight.shape
+     weight = weight.float().cpu()
+     num_active_ranks = active_ranks.sum().item()
+     num_local_experts = num_replicas // num_gpus
+     if num_active_ranks < num_gpus:
+         # Must fall back to global load-balance policy
+         # and fix some params
+         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+             weight,
+             num_local_experts * num_active_ranks,
+             1,
+             1,
+             num_active_ranks,
+         )
+     elif enable_hierarchical:
+         # use hierarchical load-balance policy
+         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+             weight, num_replicas, num_groups, num_nodes, num_gpus
+         )
+     else:
+         # use global load-balance policy
+         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+             weight, num_replicas, 1, 1, num_gpus
+         )
+     maxlogcnt = logcnt.max().item()
+     log2phy: torch.Tensor = torch.full(
+         (num_layers, num_logical_experts, maxlogcnt),
+         -1,
+         dtype=torch.int64,
+         device=logcnt.device,
+     )
+     log2phy.view(num_layers, -1).scatter_(
+         -1,
+         phy2log * maxlogcnt + phyrank,
+         torch.arange(
+             num_local_experts * num_active_ranks,
+             dtype=torch.int64,
+             device=log2phy.device,
+         ).expand(num_layers, -1),
+     )
+     if num_active_ranks < num_gpus:
+         phy2log_slices = list(
+             phy2log.view(num_layers, num_active_ranks, -1).unbind(dim=1)
+         )
+         active_ranks_list = active_ranks.tolist()
+         for idx, active_rank in enumerate(active_ranks_list):
+             if not active_rank:
+                 phy2log_slices.insert(idx, torch.zeros_like(phy2log_slices[0]))
+                 log2phy = torch.where(
+                     log2phy >= idx * num_local_experts,
+                     log2phy + num_local_experts,
+                     log2phy,
+                 )
+         phy2log = torch.stack(phy2log_slices, dim=1).contiguous().view(num_layers, -1)
+     return phy2log, log2phy, logcnt
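When some expert-parallel ranks are inactive, the function above balances over the active ranks only, then re-inserts a zero slice for each missing rank and shifts every logical-to-physical index that points at or beyond that rank's slots. A standalone toy illustration of that `torch.where` shift (made-up numbers, not part of the package):

    import torch

    num_local_experts = 2  # physical experts hosted per rank
    idx = 2                # rank position being re-inserted as inactive
    # replica indices computed for the 3 active ranks (slots 0..5)
    log2phy = torch.tensor([0, 2, 4, 5, 1, 3])
    # indices at or beyond the inserted rank's slots move up by one rank's worth
    log2phy = torch.where(
        log2phy >= idx * num_local_experts, log2phy + num_local_experts, log2phy
    )
    print(log2phy)  # tensor([0, 2, 6, 7, 1, 3])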
sglang/srt/eplb/expert_distribution.py CHANGED
@@ -16,21 +16,20 @@ from __future__ import annotations
  
  import logging
  import math
- import os
  import time
  from abc import ABC
  from collections import deque
  from contextlib import contextmanager
- from pathlib import Path
  from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
  
  import einops
  import torch
  import torch.distributed
  
+ from sglang.srt.environ import envs
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.server_args import ServerArgs
- from sglang.srt.utils import Withable, get_bool_env_var, is_npu
+ from sglang.srt.utils import Withable, is_npu
  
  _is_npu = is_npu()
  
@@ -839,7 +838,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
  
  
  def _dump_to_file(name, data):
-     save_dir = Path(os.environ.get("SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR", "/tmp"))
+     save_dir = envs.SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR.get()
      path_output = save_dir / name
      logger.info(f"Write expert distribution to {path_output}")
      if not save_dir.exists():
sglang/srt/eplb/expert_location.py CHANGED
@@ -231,6 +231,7 @@ class ExpertLocationMetadata:
              logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
              logical_to_rank_dispatch_physical_map=(
                  compute_logical_to_rank_dispatch_physical_map(
+                     server_args=server_args,
                      logical_to_all_physical_map=logical_to_all_physical_map,
                      num_gpus=ep_size,
                      num_physical_experts=num_physical_experts,
@@ -340,6 +341,7 @@ def _pad_nested_array(arr, pad_value):
  
  # TODO optimize performance (rewrite and/or run in separate process with overlap)
  def compute_logical_to_rank_dispatch_physical_map(
+     server_args: ServerArgs,
      logical_to_all_physical_map: torch.Tensor,
      num_gpus: int,
      num_physical_experts: int,
@@ -348,7 +350,9 @@ def compute_logical_to_rank_dispatch_physical_map(
  ):
      r = random.Random(seed)
  
-     num_local_physical_experts = num_physical_experts // num_gpus
+     num_local_gpu_physical_experts = num_physical_experts // num_gpus
+     num_gpus_per_node = server_args.ep_size // server_args.nnodes
+     num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
      num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
      dtype = logical_to_all_physical_map.dtype
  
@@ -372,13 +376,28 @@ def compute_logical_to_rank_dispatch_physical_map(
              physical_expert_id
              for physical_expert_id in candidate_physical_expert_ids
              if _compute_gpu_id_of_physical_expert(
-                 physical_expert_id, num_local_physical_experts
+                 physical_expert_id, num_local_gpu_physical_experts
              )
              == gpu_id
          ]
          if len(same_gpu_physical_expert_ids) > 0:
+             # 1. Prefer same-GPU experts
              output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
-
+         else:
+             # 2. Otherwise, prefer same-node experts
+             node_id = gpu_id // num_gpus_per_node
+             same_node_physical_expert_ids = [
+                 physical_expert_id
+                 for physical_expert_id in candidate_physical_expert_ids
+                 if _compute_node_id_of_physical_expert(
+                     physical_expert_id, num_local_node_physical_experts
+                 )
+                 == node_id
+             ]
+             if len(same_node_physical_expert_ids) > 0:
+                 output_partial[gpu_id] = same_node_physical_expert_ids[0]
+
+     # 3. Fill remaining slots with fair random choices
      num_remain = torch.sum(output_partial == -1).item()
      output_partial[output_partial == -1] = torch.tensor(
          _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
@@ -404,9 +423,15 @@ def _logical_to_all_physical_raw(
  
  
  def _compute_gpu_id_of_physical_expert(
-     physical_expert_id: int, num_local_physical_experts: int
+     physical_expert_id: int, num_local_gpu_physical_experts: int
+ ) -> int:
+     return physical_expert_id // num_local_gpu_physical_experts
+
+
+ def _compute_node_id_of_physical_expert(
+     physical_expert_id: int, num_local_host_physical_experts: int
  ) -> int:
-     return physical_expert_id // num_local_physical_experts
+     return physical_expert_id // num_local_host_physical_experts
  
  
  def _fair_choices(arr: List, k: int, r: random.Random) -> List:
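The dispatch change above prefers a replica on the same GPU, then one on the same node, before falling back to a fair random choice; the GPU and node of a physical expert are plain integer divisions, as the two helpers show. A worked example with assumed sizes (64 physical experts, 8 GPUs, 2 nodes):

    num_physical_experts = 64
    num_gpus = 8
    nnodes = 2

    num_local_gpu_physical_experts = num_physical_experts // num_gpus  # 8 experts per GPU
    num_gpus_per_node = num_gpus // nnodes  # 4
    num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node  # 32 per node

    physical_expert_id = 37
    gpu_id = physical_expert_id // num_local_gpu_physical_experts    # 4
    node_id = physical_expert_id // num_local_node_physical_experts  # 1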
sglang/srt/eplb/expert_location_dispatch.py CHANGED
@@ -18,7 +18,7 @@ from typing import Literal, Optional
  import torch
  
  from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
- from sglang.srt.managers.schedule_batch import global_server_args_dict
+ from sglang.srt.server_args import get_global_server_args
  
  
  @dataclass
@@ -34,7 +34,7 @@ class ExpertLocationDispatchInfo:
  
      @classmethod
      def init_new(cls, layer_id: int):
-         ep_dispatch_algorithm = global_server_args_dict["ep_dispatch_algorithm"]
+         ep_dispatch_algorithm = get_global_server_args().ep_dispatch_algorithm
          expert_location_metadata = get_global_expert_location_metadata()
          assert expert_location_metadata is not None
  
sglang/srt/eplb/expert_location_updater.py CHANGED
@@ -24,7 +24,7 @@ from sglang.srt.eplb.expert_location import (
      ExpertLocationMetadata,
      get_global_expert_location_metadata,
  )
- from sglang.srt.managers.schedule_batch import global_server_args_dict
+ from sglang.srt.server_args import get_global_server_args
  from sglang.srt.utils import get_bool_env_var
  
  logger = logging.getLogger(__name__)
@@ -97,7 +97,7 @@ def _update_expert_weights_with_canary(
      canary_tensor = (
          _get_canary_value(old_expert_location_metadata, layer_id)
          .clone()
-         .to(device=global_server_args_dict["device"], non_blocking=True)
+         .to(device=get_global_server_args().device, non_blocking=True)
      )
      routed_experts_weights_of_layer[layer_id].append(canary_tensor)
  
sglang/srt/function_call/base_format_detector.py CHANGED
@@ -3,6 +3,7 @@ import logging
  from abc import ABC, abstractmethod
  from typing import Any, Dict, List
  
+ import orjson
  from partial_json_parser.core.exceptions import MalformedJSON
  from partial_json_parser.core.options import Allow
  
@@ -96,7 +97,7 @@ class BaseFormatDetector(ABC):
          Parses the text in one go. Returns success=True if the format matches, otherwise False.
          Note that leftover_text here represents "content that this parser will not consume further".
          """
-         action = json.loads(text)
+         action = orjson.loads(text)
          return StreamingParseResult(calls=self.parse_base_json(action, tools))
  
      def _ends_with_partial_token(self, buffer: str, bot_token: str) -> int:
@@ -264,12 +265,6 @@ class BaseFormatDetector(ABC):
                  # Only remove the processed portion, keep unprocessed content
                  self._buffer = current_text[start_idx + end_idx :]
  
-                 if self.current_tool_id < len(self.prev_tool_call_arr):
-                     self.prev_tool_call_arr[self.current_tool_id].clear()
-                 self.current_tool_name_sent = False
-                 self.streamed_args_for_tool[self.current_tool_id] = ""
-                 self.current_tool_id += 1
-
              # If the tool is still being parsed, send incremental changes
              elif prev_arguments:
                  prev_args_json = json.dumps(prev_arguments)
@@ -277,6 +272,20 @@ class BaseFormatDetector(ABC):
                  prefix = _find_common_prefix(prev_args_json, cur_args_json)
                  argument_diff = prefix[sent:]
  
+                 # Update prev_tool_call_arr with current state
+                 if self.current_tool_id >= 0:
+                     # Ensure prev_tool_call_arr is large enough
+                     while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                         self.prev_tool_call_arr.append({})
+                     self.prev_tool_call_arr[self.current_tool_id] = (
+                         current_tool_call
+                     )
+
+                 # Advance to next tool if complete
+                 if is_current_complete:
+                     self.current_tool_name_sent = False
+                     self.current_tool_id += 1
+
                  # Send the argument diff if there's something new
                  if argument_diff is not None:
                      # Use the correct tool_index: completing_tool_id for completed tools, current_tool_id for ongoing
@@ -293,17 +302,7 @@ class BaseFormatDetector(ABC):
                              )
                          ],
                      )
-                     if not is_current_complete:
-                         self.streamed_args_for_tool[
-                             self.current_tool_id
-                         ] += argument_diff
-
-                     # Update prev_tool_call_arr with current state
-                     if self.current_tool_id >= 0:
-                         # Ensure prev_tool_call_arr is large enough
-                         while len(self.prev_tool_call_arr) <= self.current_tool_id:
-                             self.prev_tool_call_arr.append({})
-                         self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
+                     self.streamed_args_for_tool[tool_index_to_use] += argument_diff
  
          return res
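The one-shot parse path above now uses `orjson` instead of the standard-library `json`; for this call it is a drop-in swap, since `orjson.loads` accepts the same JSON text and returns the same Python objects. A quick standalone check (the payload is a made-up example):

    import json

    import orjson

    text = '{"name": "get_weather", "parameters": {"city": "Paris"}}'
    assert orjson.loads(text) == json.loads(text)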