sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  # Adapted from vLLM's OpenAIServingResponses
3
3
  """Handler for /v1/responses requests"""
4
+ from __future__ import annotations
4
5
 
5
6
  import asyncio
6
7
  import copy
@@ -9,7 +10,7 @@ import logging
9
10
  import time
10
11
  from contextlib import AsyncExitStack
11
12
  from http import HTTPStatus
12
- from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
13
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union
13
14
 
14
15
  import jinja2
15
16
  import openai.types.responses as openai_responses_types
@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
54
55
  from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
55
56
  from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
56
57
  from sglang.srt.managers.io_struct import GenerateReqInput
57
- from sglang.srt.managers.template_manager import TemplateManager
58
- from sglang.srt.managers.tokenizer_manager import TokenizerManager
59
58
  from sglang.srt.parser.reasoning_parser import ReasoningParser
60
59
  from sglang.srt.utils import random_uuid
61
60
 
61
+ if TYPE_CHECKING:
62
+ from sglang.srt.managers.template_manager import TemplateManager
63
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
64
+
62
65
  logger = logging.getLogger(__name__)
63
66
 
64
67
 
@@ -120,6 +123,39 @@ class OpenAIServingResponses(OpenAIServingChat):
120
123
 
121
124
  self.background_tasks: dict[str, asyncio.Task] = {}
122
125
 
126
+ # error helpers dedicated for v1/responses
127
def create_error_response(
    self,
    message: str,
    err_type: str = "invalid_request_error",
    status_code: int = 400,
    param: Optional[str] = None,
) -> ORJSONResponse:
    """Build the JSON error reply used by the v1/responses handler.

    The error details are nested under a top-level "error" key, and
    ``status_code`` is used both as the HTTP status of the reply and as the
    value of the nested "code" field.
    """
    body = {
        "error": {
            "message": message,
            "type": err_type,
            "param": param,
            "code": status_code,
        }
    }
    return ORJSONResponse(content=body, status_code=status_code)
141
+
142
def create_streaming_error_response(
    self,
    message: str,
    err_type: str = "BadRequestError",
    status_code: int = 400,
    param: Optional[str] = None,
) -> str:
    """Serialize an error for a streaming v1/responses reply.

    Args:
        message: Human-readable description of the failure.
        err_type: Value stored in the error's "type" field.
        status_code: Value stored in the error's "code" field.
        param: Name of the offending request parameter, if any. The default
            of None keeps the "param" field null, as it was before this
            argument existed.

    Returns:
        A JSON string with the details nested under a top-level "error" key,
        using the same field layout as ``create_error_response``.
    """
    error = {
        "message": message,
        "type": err_type,
        "param": param,
        "code": status_code,
    }
    return json.dumps({"error": error})
158
+
123
159
  def _request_id_prefix(self) -> str:
124
160
  return "resp_"
125
161
 
@@ -242,6 +278,7 @@ class OpenAIServingResponses(OpenAIServingChat):
242
278
  sampling_params=sampling_params,
243
279
  stream=request.stream,
244
280
  rid=request.request_id,
281
+ extra_key=self._compute_extra_key(request),
245
282
  background=request.background,
246
283
  )
247
284
 
@@ -830,6 +867,13 @@ class OpenAIServingResponses(OpenAIServingChat):
830
867
 
831
868
  async for ctx in result_generator:
832
869
 
870
+ # Only process context objects that implement the `is_expecting_start()` method,
871
+ # which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
872
+ # Contexts without this method are skipped, as they do not represent a new turn
873
+ # or are not compatible with per-turn handling in the /v1/responses endpoint.
874
+ if not hasattr(ctx, "is_expecting_start"):
875
+ continue
876
+
833
877
  if ctx.is_expecting_start():
834
878
  current_output_index += 1
835
879
  sent_output_item_added = False
@@ -1247,6 +1291,7 @@ class OpenAIServingResponses(OpenAIServingChat):
1247
1291
  sampling_params=sampling_params,
1248
1292
  stream=adapted_request.stream,
1249
1293
  rid=request_id,
1294
+ extra_key=adapted_request.extra_key,
1250
1295
  return_logprob=adapted_request.return_logprob,
1251
1296
  logprob_start_len=adapted_request.logprob_start_len,
1252
1297
  top_logprobs_num=adapted_request.top_logprobs_num,
@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
25
25
  def _convert_to_internal_request(
26
26
  self,
27
27
  request: ScoringRequest,
28
+ raw_request: Request = None,
28
29
  ) -> tuple[ScoringRequest, ScoringRequest]:
29
30
  """Convert OpenAI scoring request to internal format"""
30
31
  # For scoring, we pass the request directly as the tokenizer_manager
@@ -0,0 +1,144 @@
1
+ import logging
2
+ from http import HTTPStatus
3
+ from typing import List, Union
4
+
5
+ from fastapi import Request
6
+
7
+ from sglang.srt.entrypoints.openai.protocol import (
8
+ DetokenizeRequest,
9
+ DetokenizeResponse,
10
+ ErrorResponse,
11
+ TokenizeRequest,
12
+ TokenizeResponse,
13
+ )
14
+ from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class OpenAIServingTokenize(OpenAIServingBase):
    """Serves /v1/tokenize by encoding prompt text with the server tokenizer."""

    def _request_id_prefix(self) -> str:
        """Return the prefix used for request ids of this handler."""
        return "tok-"

    def _convert_to_internal_request(
        self, request: TokenizeRequest, raw_request: Request
    ) -> tuple[TokenizeRequest, TokenizeRequest]:
        """Return the request unchanged as both elements of the pair."""
        return request, request

    async def _handle_non_streaming_request(
        self,
        adapted_request: TokenizeRequest,
        request: TokenizeRequest,
        raw_request: Request,
    ) -> Union[TokenizeResponse, ErrorResponse]:
        """Encode ``request.prompt`` and report the token ids and their count.

        A string prompt produces one id list and an integer count. A list
        prompt produces one id list and one count per entry. Any other prompt
        type yields an error response, and any exception raised while
        encoding is logged and reported as an internal server error.
        """
        try:
            tokenizer = self.tokenizer_manager.tokenizer
            # -1 is reported when the tokenizer exposes no model_max_length.
            max_model_len = getattr(tokenizer, "model_max_length", -1)

            if isinstance(request.prompt, str):
                tokens = tokenizer.encode(
                    request.prompt,
                    add_special_tokens=request.add_special_tokens,
                )
                count = len(tokens)
            elif isinstance(request.prompt, list):
                tokens = []
                for text in request.prompt:
                    tokens.append(
                        tokenizer.encode(
                            text, add_special_tokens=request.add_special_tokens
                        )
                    )
                count = list(map(len, tokens))
            else:
                return self.create_error_response(
                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
                )

            return TokenizeResponse(
                tokens=tokens, count=count, max_model_len=max_model_len
            )
        except Exception as e:
            # logger.exception logs at ERROR level with the traceback attached.
            logger.exception("Error during tokenization")
            return self.create_error_response(
                f"Internal server error during tokenization: {e}",
                err_type="InternalServerError",
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
71
+
72
+
73
class OpenAIServingDetokenize(OpenAIServingBase):
    """Handler for /v1/detokenize requests"""

    def _request_id_prefix(self) -> str:
        """Return the prefix used for request ids of this handler."""
        return "detok-"

    def _convert_to_internal_request(
        self, request: DetokenizeRequest, raw_request: Request
    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
        """Return the request unchanged as both elements of the pair."""
        return request, request

    async def _handle_non_streaming_request(
        self,
        adapted_request: DetokenizeRequest,
        request: DetokenizeRequest,
        raw_request: Request,
    ) -> Union[DetokenizeResponse, ErrorResponse]:
        """Decode ``request.tokens`` back into text.

        A flat list of ints is decoded to a single string; a list of int
        lists is decoded to one string per sublist; an empty list yields an
        empty string. Input of any other shape yields an error response with
        the default (client-error) status. Exceptions raised while decoding
        are logged and reported as an error response.
        """
        try:
            tokenizer = self.tokenizer_manager.tokenizer

            if (
                isinstance(request.tokens, list)
                and request.tokens
                and isinstance(request.tokens[0], int)
            ):
                if not all(isinstance(t, int) for t in request.tokens):
                    return self.create_error_response(
                        "Invalid input: 'tokens' must be a list of integers."
                    )
                tokens_to_decode = [int(t) for t in request.tokens]
                text = tokenizer.decode(
                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
                )
                text_out: Union[str, List[str]] = text
            elif (
                isinstance(request.tokens, list)
                and request.tokens
                and isinstance(request.tokens[0], list)
            ):
                texts: List[str] = []
                for token_list in request.tokens:
                    # Reject non-list entries explicitly: iterating one (e.g.
                    # the 2 in [[1], 2]) would raise TypeError and be reported
                    # as an internal server error by the handler below.
                    if not isinstance(token_list, list):
                        return self.create_error_response(
                            f"Invalid input: every element of 'tokens' must be a list of integers. Found: {token_list}"
                        )
                    if not all(isinstance(t, int) for t in token_list):
                        return self.create_error_response(
                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
                        )
                    decoded_text = tokenizer.decode(
                        [int(t) for t in token_list],
                        skip_special_tokens=request.skip_special_tokens,
                    )
                    texts.append(decoded_text)
                text_out = texts
            elif isinstance(request.tokens, list) and not request.tokens:
                text_out = ""
            else:
                return self.create_error_response(
                    f"Invalid tokens type: {type(request.tokens)}. Expected List[int] or List[List[int]]."
                )

            return DetokenizeResponse(text=text_out)
        except Exception as e:
            logger.error("Error during detokenization", exc_info=True)
            # Heuristic: an exception whose message mentions "decode" is
            # reported as a bad request rather than an internal error.
            if "decode" in str(e).lower():
                return self.create_error_response(
                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
                    err_type="DecodeError",
                    status_code=HTTPStatus.BAD_REQUEST,
                )
            return self.create_error_response(
                f"Internal server error during detokenization: {e}",
                err_type="InternalServerError",
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
sglang/srt/environ.py ADDED
@@ -0,0 +1,289 @@
1
+ import os
2
+ import subprocess
3
+ import warnings
4
+ from contextlib import ExitStack, contextmanager
5
+ from typing import Any
6
+
7
+
8
class EnvField:
    """Typed accessor for one environment variable.

    Instances are meant to be assigned as class attributes: ``__set_name__``
    records the attribute name, which is then used as the environment
    variable name. Subclasses implement ``parse`` to convert the raw string.
    """

    def __init__(self, default: Any):
        """Store the value returned when the variable is not set.

        Args:
            default: Fallback returned by ``get`` when the variable is absent
                or when its raw value fails to parse.
        """
        self.default = default
        # NOTE: None (variable absent) normally means "not set". If the value
        # is explicitly set to None we must remember that separately in
        # ``_set_to_none``. Always use clear() to reset the value, which
        # restores the default fallback.
        self._set_to_none = False

    def __set_name__(self, owner, name):
        """Record the class-attribute name as the environment variable name."""
        self.name = name

    def parse(self, value: str) -> Any:
        """Convert the raw environment string; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def get(self) -> Any:
        """Return the current value.

        Returns:
            ``None`` if the field was explicitly set to None, ``default`` if
            the variable is absent, otherwise the parsed value. When
            ``parse`` raises ``ValueError`` a warning is emitted and
            ``default`` is returned.
        """
        value = os.getenv(self.name)
        if self._set_to_none:
            # set(None) removes the variable, so it must still be absent.
            assert value is None
            return None

        if value is None:
            return self.default

        try:
            return self.parse(value)
        except ValueError as e:
            warnings.warn(
                f'Invalid value for {self.name}: {e}, using default "{self.default}"'
            )
            return self.default

    def is_set(self):
        """Return True if the variable is present or was explicitly set to None."""
        # NOTE: If None is manually set, it is considered as set.
        return self.name in os.environ or self._set_to_none

    def get_set_value_or(self, or_value: Any):
        """Return ``get()`` if the field is set, otherwise ``or_value``."""
        # NOTE: Ugly usage, but only way to get custom default value.
        return self.get() if self.is_set() else or_value

    def set(self, value: Any):
        """Set the field.

        ``None`` removes the variable from ``os.environ`` and marks the field
        as explicitly None; any other value is stored as ``str(value)``.
        """
        if value is None:
            self._set_to_none = True
            os.environ.pop(self.name, None)
        else:
            self._set_to_none = False
            os.environ[self.name] = str(value)

    @contextmanager
    def override(self, value: Any):
        """Temporarily ``set(value)`` and restore the previous state on exit.

        The previous environment entry (or its absence) and the previous
        ``_set_to_none`` flag are restored even if the ``with`` body raises.
        """
        backup_present = self.name in os.environ
        backup_value = os.environ.get(self.name)
        backup_set_to_none = self._set_to_none
        self.set(value)
        try:
            yield
        finally:
            # Restore unconditionally so that a failing body (for example a
            # failed assertion in a test) cannot leak the override into the
            # rest of the process.
            if backup_present:
                os.environ[self.name] = backup_value
            else:
                os.environ.pop(self.name, None)
            self._set_to_none = backup_set_to_none

    def clear(self):
        """Remove the variable and the explicit-None marker."""
        os.environ.pop(self.name, None)
        self._set_to_none = False

    @property
    def value(self):
        """Shorthand for ``get()``."""
        return self.get()
75
+
76
+
77
class EnvStr(EnvField):
    """Environment field whose value is the raw string."""

    def parse(self, value: str) -> str:
        """Return ``value`` unchanged."""
        return value
80
+
81
+
82
class EnvBool(EnvField):
    """Environment field parsed as a case-insensitive boolean."""

    def parse(self, value: str) -> bool:
        """Map true/1/yes/y to True and false/0/no/n to False.

        Raises:
            ValueError: If the lower-cased value is not one of the accepted
                spellings.
        """
        lowered = value.lower()
        if lowered in ("true", "1", "yes", "y"):
            return True
        if lowered in ("false", "0", "no", "n"):
            return False
        # The message reports the lower-cased form of the input.
        raise ValueError(f'"{lowered}" is not a valid boolean value')
90
+
91
+
92
class EnvInt(EnvField):
    """Environment field parsed with ``int()``."""

    def parse(self, value: str) -> int:
        """Return ``int(value)``.

        Raises:
            ValueError: If ``value`` is not a valid integer literal. The
                original conversion error is attached as the cause.
        """
        try:
            return int(value)
        except ValueError as err:
            # Chain explicitly so the traceback shows a direct cause rather
            # than "another exception occurred during handling".
            raise ValueError(f'"{value}" is not a valid integer value') from err
98
+
99
+
100
class EnvFloat(EnvField):
    """Environment field parsed with ``float()``."""

    def parse(self, value: str) -> float:
        """Return ``float(value)``.

        Raises:
            ValueError: If ``value`` is not a valid float literal. The
                original conversion error is attached as the cause.
        """
        try:
            return float(value)
        except ValueError as err:
            # Chain explicitly so the traceback shows a direct cause rather
            # than "another exception occurred during handling".
            raise ValueError(f'"{value}" is not a valid float value') from err
106
+
107
+
108
class Envs:
    """Declarations of the environment variables read through this module.

    Each attribute is an ``EnvField`` subclass instance constructed with its
    default value; the attribute name is the environment variable name.
    """

    # fmt: off

    # Model & File Download
    SGLANG_USE_MODELSCOPE = EnvBool(False)

    # Test & Debug
    SGLANG_IS_IN_CI = EnvBool(False)
    SGLANG_AMD_CI = EnvBool(False)
    SGLANG_TEST_RETRACT = EnvBool(False)
    SGLANG_SET_CPU_AFFINITY = EnvBool(False)
    SGLANG_PROFILE_WITH_STACK = EnvBool(True)
    SGLANG_RECORD_STEP_TIME = EnvBool(False)
    SGLANG_GC_LOG = EnvBool(False)
    SGLANG_FORCE_SHUTDOWN = EnvBool(False)
    SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
    SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
    SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
    SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
    SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")

    # Test: pd-disaggregation
    SGLANG_TEST_PD_DISAGG_BACKEND = EnvStr("mooncake")
    SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None)

    # Model Parallel
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)

    # Constrained Decoding
    SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
    SGLANG_GRAMMAR_TIMEOUT = EnvFloat(300)

    # Hi-Cache
    SGLANG_HICACHE_HF3FS_CONFIG_PATH = EnvStr(None)

    # Mooncake KV Transfer
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvBool(False)
    ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False)

    # AMD & ROCm
    SGLANG_USE_AITER = EnvBool(False)
    SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)

    # Quantization
    SGLANG_INT4_WEIGHT = EnvBool(False)
    SGLANG_CPU_QUANTIZATION = EnvBool(False)
    SGLANG_USE_DYNAMIC_MXFP4_LINEAR = EnvBool(False)
    SGLANG_FORCE_FP8_MARLIN = EnvBool(False)

    # Flashinfer
    SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
    SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)

    # Triton
    SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)

    # Torch Compile
    SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)

    # EPLB
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
    SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)

    # TBO
    SGLANG_TBO_DEBUG = EnvBool(False)

    # DeepGemm
    SGLANG_ENABLE_JIT_DEEPGEMM = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_PRECOMPILE = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS = EnvInt(4)
    SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE = EnvBool(False)
    SGLANG_DG_CACHE_DIR = EnvStr(os.path.expanduser("~/.cache/deep_gemm"))
    SGLANG_DG_USE_NVRTC = EnvBool(False)
    SGLANG_USE_DEEPGEMM_BMM = EnvBool(False)

    # sgl-kernel
    SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)

    # vLLM dependencies
    USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
    USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)

    USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
    RETURN_ORIGINAL_LOGPROB = EnvBool(False)
    SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
    SGLANG_MOE_PADDING = EnvBool(False)
    SGLANG_CUTLASS_MOE = EnvBool(False)
    HF_HUB_DISABLE_XET = EnvBool(False)
    DISABLE_OPENAPI_DOC = EnvBool(False)
    SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
    SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
    SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
    SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
    SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)

    # Deterministic inference
    SGLANG_ENABLE_DETERMINISTIC_INFERENCE = EnvBool(False)
    SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE = EnvInt(4096)
    SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE = EnvInt(2048)
    SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)

    # fmt: on
215
+
216
+
217
+ envs = Envs()
218
+
219
+
220
+ def _convert_SGL_to_SGLANG():
221
+ for key, value in os.environ.items():
222
+ if key.startswith("SGL_"):
223
+ new_key = key.replace("SGL_", "SGLANG_", 1)
224
+ warnings.warn(
225
+ f"Environment variable {key} is deprecated, please use {new_key}"
226
+ )
227
+ os.environ[new_key] = value
228
+
229
+
230
+ _convert_SGL_to_SGLANG()
231
+
232
+
233
def example_with_exit_stack():
    """Show how to override a field through an explicit ``ExitStack``.

    The final assertion expects ``SGLANG_TEST_RETRACT`` to have been
    explicitly set to None before this function is called.
    """
    # Use this style of context manager in unit test
    retract = envs.SGLANG_TEST_RETRACT
    stack = ExitStack()
    stack.enter_context(retract.override(False))
    assert retract.value is False
    # Closing the stack leaves the override and restores the prior state.
    stack.close()
    assert retract.value is None
240
+
241
+
242
def example_with_subprocess():
    """Show that an override is visible to child processes while active.

    A child interpreter prints ``os.getenv('SGLANG_TEST_RETRACT')``. Inside
    the override it sees ``"True"``. The final assertion expects the
    variable to be absent from the environment once the override ends, so
    the child prints ``"None"``.
    """
    # Local import: only this example needs the interpreter path.
    import sys

    # Use the running interpreter; a bare "python" may not exist on PATH.
    command = [
        sys.executable,
        "-c",
        "import os; print(os.getenv('SGLANG_TEST_RETRACT'))",
    ]

    def run_child() -> str:
        # subprocess.run drains and closes both pipes and reaps the child,
        # avoiding the wait()-before-read deadlock hazard and leaked handles.
        completed = subprocess.run(command, capture_output=True)
        return completed.stdout.decode("utf-8").strip()

    with envs.SGLANG_TEST_RETRACT.override(True):
        output = run_child()
        assert output == "True"

    output = run_child()
    assert output == "None"
255
+
256
+
257
def examples():
    """Walk through the ``EnvField`` API using ``SGLANG_TEST_RETRACT``.

    The steps are order-dependent: each assertion checks the state left by
    the calls before it, and the two helper examples at the end run with the
    field explicitly set to None.
    """
    # Example usage for envs
    retract = envs.SGLANG_TEST_RETRACT

    # Cleared field falls back to its default.
    retract.clear()
    assert retract.value is False

    # Explicit None counts as "set".
    retract.set(None)
    assert retract.is_set()
    assert retract.value is None

    retract.clear()
    assert not retract.is_set()

    retract.set(True)
    assert retract.value is True

    # Overriding with None, then returning to the previous True.
    with retract.override(None):
        assert retract.is_set()
        assert retract.value is None

    assert retract.value is True

    # Overriding an explicit None, then returning to it.
    retract.set(None)
    with retract.override(True):
        assert retract.value is True

    assert retract.is_set()
    assert retract.value is None

    example_with_exit_stack()
    example_with_subprocess()
286
+
287
+
288
+ if __name__ == "__main__":
289
+ examples()
@@ -55,7 +55,7 @@ class EPLBManager:
55
55
  enable_timing = self._rebalance_layers_per_chunk is None
56
56
 
57
57
  if enable_timing:
58
- torch.cuda.synchronize()
58
+ torch.get_device_module().synchronize()
59
59
  time_start = time.time()
60
60
 
61
61
  dump_record_output = get_global_expert_distribution_recorder().dump_record(
@@ -85,7 +85,7 @@ class EPLBManager:
85
85
 
86
86
  msg = f"[EPLBManager] rebalance end"
87
87
  if enable_timing:
88
- torch.cuda.synchronize()
88
+ torch.get_device_module().synchronize()
89
89
  time_end = time.time()
90
90
  msg += f" time={time_end - time_start:.3f}s"
91
91
  logger.info(msg)
@@ -11,6 +11,9 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
+
15
+ from __future__ import annotations
16
+
14
17
  import logging
15
18
  import math
16
19
  import os
@@ -19,16 +22,20 @@ from abc import ABC
19
22
  from collections import deque
20
23
  from contextlib import contextmanager
21
24
  from pathlib import Path
22
- from typing import Any, Dict, List, Literal, Optional, Tuple, Type
25
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
23
26
 
24
27
  import einops
25
28
  import torch
26
29
  import torch.distributed
27
30
 
28
- from sglang.srt.eplb.expert_location import ExpertLocationMetadata
29
31
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
30
32
  from sglang.srt.server_args import ServerArgs
31
- from sglang.srt.utils import Withable, get_bool_env_var
33
+ from sglang.srt.utils import Withable, get_bool_env_var, is_npu
34
+
35
+ _is_npu = is_npu()
36
+
37
+ if TYPE_CHECKING:
38
+ from sglang.srt.eplb.expert_location import ExpertLocationMetadata
32
39
 
33
40
  logger = logging.getLogger(__name__)
34
41
 
@@ -43,7 +50,7 @@ class ExpertDistributionRecorder(ABC):
43
50
  @staticmethod
44
51
  def init_new(
45
52
  server_args: ServerArgs,
46
- expert_location_metadata: "ExpertLocationMetadata",
53
+ expert_location_metadata: ExpertLocationMetadata,
47
54
  rank: int,
48
55
  ):
49
56
  if server_args.expert_distribution_recorder_mode is not None:
@@ -118,7 +125,7 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
118
125
  def __init__(
119
126
  self,
120
127
  server_args: ServerArgs,
121
- expert_location_metadata: "ExpertLocationMetadata",
128
+ expert_location_metadata: ExpertLocationMetadata,
122
129
  rank: int,
123
130
  ):
124
131
  self._server_args = server_args
@@ -211,7 +218,9 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
211
218
  def _on_hook(self, hook_name: str, **kwargs):
212
219
  if self._disable_all:
213
220
  return
214
- if not (self._recording or torch.cuda.is_current_stream_capturing()):
221
+ if not (
222
+ self._recording or torch.get_device_module().is_current_stream_capturing()
223
+ ):
215
224
  return
216
225
  gatherer = self._single_pass_gatherers[
217
226
  self._accumulator.get_single_pass_gatherer_key(
@@ -279,7 +288,7 @@ class _SinglePassGatherer(ABC):
279
288
  @staticmethod
280
289
  def init_new(
281
290
  server_args: ServerArgs,
282
- expert_location_metadata: "ExpertLocationMetadata",
291
+ expert_location_metadata: ExpertLocationMetadata,
283
292
  rank: int,
284
293
  ) -> "_SinglePassGatherer":
285
294
  if server_args.expert_distribution_recorder_mode == "per_token":
@@ -307,7 +316,7 @@ class _SinglePassGatherer(ABC):
307
316
 
308
317
  return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
309
318
 
310
- def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
319
+ def __init__(self, expert_location_metadata: ExpertLocationMetadata, rank: int):
311
320
  self._expert_location_metadata = expert_location_metadata
312
321
  self._rank = rank
313
322
 
@@ -346,7 +355,7 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
346
355
  def __init__(
347
356
  self,
348
357
  server_args: ServerArgs,
349
- expert_location_metadata: "ExpertLocationMetadata",
358
+ expert_location_metadata: ExpertLocationMetadata,
350
359
  rank: int,
351
360
  ):
352
361
  super().__init__(expert_location_metadata, rank)
@@ -446,6 +455,10 @@ def _list_sum(a: List, b: List) -> List:
446
455
  class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
447
456
  def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
448
457
  super().__init__(*args, **kwargs)
458
+ if not _is_npu:
459
+ device = "cuda"
460
+ else:
461
+ device = "npu"
449
462
  self._enable_global_physical_experts = enable_global_physical_experts
450
463
  self._data = torch.zeros(
451
464
  (
@@ -457,7 +470,7 @@ class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
457
470
  ),
458
471
  ),
459
472
  dtype=torch.int,
460
- device="cuda",
473
+ device=device,
461
474
  )
462
475
 
463
476
  def reset(self):
@@ -561,7 +574,7 @@ class _Accumulator(ABC):
561
574
  @staticmethod
562
575
  def init_new(
563
576
  server_args: ServerArgs,
564
- expert_location_metadata: "ExpertLocationMetadata",
577
+ expert_location_metadata: ExpertLocationMetadata,
565
578
  rank: int,
566
579
  ) -> "_Accumulator":
567
580
  return _Accumulator.get_class(server_args)(
@@ -580,7 +593,7 @@ class _Accumulator(ABC):
580
593
  def __init__(
581
594
  self,
582
595
  server_args: ServerArgs,
583
- expert_location_metadata: "ExpertLocationMetadata",
596
+ expert_location_metadata: ExpertLocationMetadata,
584
597
  rank: int,
585
598
  ):
586
599
  self._server_args = server_args
@@ -779,7 +792,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
779
792
 
780
793
  if self._first_dump:
781
794
  self._first_dump = False
782
- torch.cuda.empty_cache()
795
+ torch.get_device_module().empty_cache()
783
796
 
784
797
  torch.distributed.all_reduce(
785
798
  logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM