sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ import torch
7
+
8
+ from sglang.srt.managers.schedule_batch import ServerArgs
9
+ from sglang.srt.utils import is_cpu, is_cuda
10
+
11
+
12
@dataclass
class ElasticEPState:
    """Bundle of rank-activity tensors tracked for elastic expert parallelism.

    ``ElasticEPStateManager`` in this file initialises all three fields from
    one all-ones ``int32`` tensor with one entry per rank. The exact meaning
    of each entry (presumably 1 = healthy rank) is defined by the code that
    updates ``active_ranks`` -- TODO confirm against the callers.
    """

    # Current activity tensor; the source of truth for the two copies below.
    active_ranks: Optional[torch.Tensor]
    # Copy of ``active_ranks`` taken by the last ``snapshot_active_to_last``.
    last_active_ranks: Optional[torch.Tensor]
    # CPU copy of ``active_ranks`` taken by the last ``sync_active_to_cpu``.
    active_ranks_cpu: Optional[torch.Tensor]

    def is_active_equal_last(self) -> bool:
        """Return True when ``active_ranks`` matches ``last_active_ranks``.

        Both fields are optional, and ``torch.equal`` raises ``TypeError``
        when handed ``None``. Two unset fields compare as equal (nothing has
        changed); exactly one unset field compares as different.
        """
        current, previous = self.active_ranks, self.last_active_ranks
        if current is None or previous is None:
            return current is None and previous is None
        return torch.equal(current, previous)

    def sync_active_to_cpu(self):
        """Refresh ``active_ranks_cpu`` with a detached CPU copy of ``active_ranks``.

        Does nothing when ``active_ranks`` is unset, so a previously stored
        CPU copy is left untouched in that case.
        """
        if self.active_ranks is not None:
            # ``.cpu()`` is a no-op for tensors already on the CPU, so the
            # explicit ``clone()`` guarantees an independent buffer.
            self.active_ranks_cpu = self.active_ranks.detach().cpu().clone()

    def snapshot_active_to_last(self):
        """Store a copy of ``active_ranks`` in ``last_active_ranks``.

        Does nothing when ``active_ranks`` is unset.
        """
        if self.active_ranks is not None:
            self.last_active_ranks = self.active_ranks.clone()
28
+
29
+
30
class ElasticEPStateManager:
    """Class-level holder for one shared ``ElasticEPState``.

    The state is stored on the class itself, so every caller in the process
    sees the same object once ``init`` has created it.
    """

    _instance: Optional[ElasticEPState] = None

    @classmethod
    def instance(cls) -> Optional[ElasticEPState]:
        """Return the shared state, or ``None`` if ``init`` has not created one."""
        return cls._instance

    @classmethod
    def init(cls, server_args: ServerArgs) -> Optional[ElasticEPState]:
        """Create the shared state on first use and return it.

        A state is built only when none exists yet and
        ``server_args.elastic_ep_backend`` is set; otherwise whatever is
        already stored (possibly ``None``) is returned unchanged.
        """
        if cls._instance is None and server_args.elastic_ep_backend is not None:
            # Passing ``None`` for both lets ``healthy_rank_state`` pick the
            # distributed world size and the default device.
            cls._instance = cls._build_state(ep_size=None, device=None)
        return cls._instance

    @staticmethod
    def _select_device() -> torch.device:
        """Pick the device for the rank tensors: CUDA first, then CPU.

        Raises:
            NotImplementedError: if neither ``is_cuda()`` nor ``is_cpu()``
                reports true.
        """
        if is_cuda():
            return torch.device("cuda")
        if is_cpu():
            return torch.device("cpu")
        raise NotImplementedError("Only CUDA and CPU support elastic ep now.")

    @classmethod
    def _build_state(
        cls, *, ep_size: Optional[int] = None, device: Optional[torch.device] = None
    ) -> ElasticEPState:
        """Build a fresh state whose three tensors all start as all-ones."""
        active = cls.healthy_rank_state(ep_size=ep_size, device=device)
        return ElasticEPState(
            active_ranks=active,
            # Independent copies, so later in-place updates to
            # ``active_ranks`` do not leak into the snapshot or CPU mirror.
            last_active_ranks=active.clone(),
            active_ranks_cpu=active.detach().cpu().clone(),
        )

    @classmethod
    def healthy_rank_state(
        cls, *, ep_size: Optional[int] = None, device: Optional[torch.device] = None
    ) -> torch.Tensor:
        """Return an all-ones ``int32`` tensor with one entry per rank.

        Args:
            ep_size: Number of entries; defaults to
                ``torch.distributed.get_world_size()``.
            device: Target device; defaults to ``_select_device()``.
        """
        size = ep_size if ep_size is not None else torch.distributed.get_world_size()
        dev = device if device is not None else cls._select_device()

        return torch.ones(size, dtype=torch.int32, device=dev)
@@ -1,10 +1,11 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  # Copied from vLLM
3
- import json
4
3
  import logging
5
4
  from abc import ABC, abstractmethod
6
5
  from typing import Union
7
6
 
7
+ import orjson
8
+
8
9
  logger = logging.getLogger(__name__)
9
10
 
10
11
  try:
@@ -148,7 +149,7 @@ class HarmonyContext(ConversationContext):
148
149
  if isinstance(tool_session, Tool):
149
150
  return await tool_session.get_result(self)
150
151
  tool_name = last_msg.recipient.split(".")[1]
151
- args = json.loads(last_msg.content[0].text)
152
+ args = orjson.loads(last_msg.content[0].text)
152
153
  result = await tool_session.call_tool(tool_name, args)
153
154
  result_str = result.content[0].text
154
155
  content = TextContent(text=result_str)
@@ -30,8 +30,6 @@ import time
30
30
  from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union
31
31
 
32
32
  import zmq
33
- import zmq.asyncio
34
- from PIL.Image import Image
35
33
 
36
34
  from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
37
35
 
@@ -47,6 +45,7 @@ from sglang.srt.managers.data_parallel_controller import (
47
45
  )
48
46
  from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
49
47
  from sglang.srt.managers.io_struct import (
48
+ DestroyWeightsUpdateGroupReqInput,
50
49
  EmbeddingReqInput,
51
50
  GenerateReqInput,
52
51
  GetWeightsByNameReqInput,
@@ -60,6 +59,7 @@ from sglang.srt.managers.io_struct import (
60
59
  UnloadLoRAAdapterReqInput,
61
60
  UpdateWeightFromDiskReqInput,
62
61
  UpdateWeightsFromDistributedReqInput,
62
+ UpdateWeightsFromIPCReqInput,
63
63
  UpdateWeightsFromTensorReqInput,
64
64
  )
65
65
  from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
@@ -67,7 +67,6 @@ from sglang.srt.managers.scheduler import run_scheduler_process
67
67
  from sglang.srt.managers.template_manager import TemplateManager
68
68
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
69
69
  from sglang.srt.server_args import PortArgs, ServerArgs
70
- from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
71
70
  from sglang.srt.utils import (
72
71
  MultiprocessingSerializer,
73
72
  assert_pkg_version,
@@ -77,10 +76,12 @@ from sglang.srt.utils import (
77
76
  is_cuda,
78
77
  kill_process_tree,
79
78
  launch_dummy_health_check_server,
79
+ maybe_reindex_device_id,
80
80
  prepare_model_and_tokenizer,
81
81
  set_prometheus_multiproc_dir,
82
82
  set_ulimit,
83
83
  )
84
+ from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
84
85
  from sglang.version import __version__
85
86
 
86
87
  logger = logging.getLogger(__name__)
@@ -146,6 +147,12 @@ class Engine(EngineBase):
146
147
  thread_label = "Tokenizer"
147
148
  trace_set_thread_info(thread_label)
148
149
 
150
+ try:
151
+ self.loop = asyncio.get_running_loop()
152
+ except RuntimeError:
153
+ self.loop = asyncio.new_event_loop()
154
+ asyncio.set_event_loop(self.loop)
155
+
149
156
  def generate(
150
157
  self,
151
158
  # The input prompt. It can be a single prompt or a batch of prompts.
@@ -209,7 +216,6 @@ class Engine(EngineBase):
209
216
  bootstrap_room=bootstrap_room,
210
217
  data_parallel_rank=data_parallel_rank,
211
218
  )
212
- loop = asyncio.get_event_loop()
213
219
  generator = self.tokenizer_manager.generate_request(obj, None)
214
220
 
215
221
  if stream:
@@ -217,14 +223,14 @@ class Engine(EngineBase):
217
223
  def generator_wrapper():
218
224
  while True:
219
225
  try:
220
- chunk = loop.run_until_complete(generator.__anext__())
226
+ chunk = self.loop.run_until_complete(generator.__anext__())
221
227
  yield chunk
222
228
  except StopAsyncIteration:
223
229
  break
224
230
 
225
231
  return generator_wrapper()
226
232
  else:
227
- ret = loop.run_until_complete(generator.__anext__())
233
+ ret = self.loop.run_until_complete(generator.__anext__())
228
234
  return ret
229
235
 
230
236
  async def async_generate(
@@ -316,9 +322,8 @@ class Engine(EngineBase):
316
322
  audio_data=audio_data,
317
323
  video_data=video_data,
318
324
  )
319
- loop = asyncio.get_event_loop()
320
325
  generator = self.tokenizer_manager.generate_request(obj, None)
321
- ret = loop.run_until_complete(generator.__anext__())
326
+ ret = self.loop.run_until_complete(generator.__anext__())
322
327
  return ret
323
328
 
324
329
  async def async_encode(
@@ -352,9 +357,8 @@ class Engine(EngineBase):
352
357
  Please refer to `EmbeddingReqInput` for the documentation.
353
358
  """
354
359
  obj = EmbeddingReqInput(text=prompt, is_cross_encoder_request=True)
355
- loop = asyncio.get_event_loop()
356
360
  generator = self.tokenizer_manager.generate_request(obj, None)
357
- ret = loop.run_until_complete(generator.__anext__())
361
+ ret = self.loop.run_until_complete(generator.__anext__())
358
362
  return ret
359
363
 
360
364
  def shutdown(self):
@@ -369,38 +373,31 @@ class Engine(EngineBase):
369
373
  return False
370
374
 
371
375
  def flush_cache(self):
372
- loop = asyncio.get_event_loop()
373
- return loop.run_until_complete(self.tokenizer_manager.flush_cache())
376
+ return self.loop.run_until_complete(self.tokenizer_manager.flush_cache())
374
377
 
375
378
  def start_profile(self, **kwargs):
376
- loop = asyncio.get_event_loop()
377
- loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
379
+ self.loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
378
380
 
379
381
  def stop_profile(self):
380
- loop = asyncio.get_event_loop()
381
- loop.run_until_complete(self.tokenizer_manager.stop_profile())
382
+ self.loop.run_until_complete(self.tokenizer_manager.stop_profile())
382
383
 
383
384
  def start_expert_distribution_record(self):
384
- loop = asyncio.get_event_loop()
385
- loop.run_until_complete(
385
+ self.loop.run_until_complete(
386
386
  self.tokenizer_manager.start_expert_distribution_record()
387
387
  )
388
388
 
389
389
  def stop_expert_distribution_record(self):
390
- loop = asyncio.get_event_loop()
391
- loop.run_until_complete(
390
+ self.loop.run_until_complete(
392
391
  self.tokenizer_manager.stop_expert_distribution_record()
393
392
  )
394
393
 
395
394
  def dump_expert_distribution_record(self):
396
- loop = asyncio.get_event_loop()
397
- loop.run_until_complete(
395
+ self.loop.run_until_complete(
398
396
  self.tokenizer_manager.dump_expert_distribution_record()
399
397
  )
400
398
 
401
399
  def get_server_info(self):
402
- loop = asyncio.get_event_loop()
403
- internal_states = loop.run_until_complete(
400
+ internal_states = self.loop.run_until_complete(
404
401
  self.tokenizer_manager.get_internal_state()
405
402
  )
406
403
  return {
@@ -428,11 +425,22 @@ class Engine(EngineBase):
428
425
  group_name=group_name,
429
426
  backend=backend,
430
427
  )
431
- loop = asyncio.get_event_loop()
432
- return loop.run_until_complete(
428
+ return self.loop.run_until_complete(
433
429
  self.tokenizer_manager.init_weights_update_group(obj, None)
434
430
  )
435
431
 
432
+ def destroy_weights_update_group(
433
+ self,
434
+ group_name: str,
435
+ ):
436
+ """Destroy parameter update group."""
437
+ obj = DestroyWeightsUpdateGroupReqInput(
438
+ group_name=group_name,
439
+ )
440
+ return self.loop.run_until_complete(
441
+ self.tokenizer_manager.destroy_weights_update_group(obj, None)
442
+ )
443
+
436
444
  def update_weights_from_distributed(
437
445
  self,
438
446
  names: list[str],
@@ -449,8 +457,7 @@ class Engine(EngineBase):
449
457
  group_name=group_name,
450
458
  flush_cache=flush_cache,
451
459
  )
452
- loop = asyncio.get_event_loop()
453
- return loop.run_until_complete(
460
+ return self.loop.run_until_complete(
454
461
  self.tokenizer_manager.update_weights_from_distributed(obj, None)
455
462
  )
456
463
 
@@ -474,9 +481,7 @@ class Engine(EngineBase):
474
481
  load_format=load_format,
475
482
  flush_cache=flush_cache,
476
483
  )
477
- loop = asyncio.get_event_loop()
478
-
479
- return loop.run_until_complete(
484
+ return self.loop.run_until_complete(
480
485
  self.tokenizer_manager.update_weights_from_tensor(obj, None)
481
486
  )
482
487
 
@@ -496,16 +501,14 @@ class Engine(EngineBase):
496
501
  load_format=load_format,
497
502
  )
498
503
 
499
- loop = asyncio.get_event_loop()
500
- return loop.run_until_complete(
504
+ return self.loop.run_until_complete(
501
505
  self.tokenizer_manager.update_weights_from_disk(obj, None)
502
506
  )
503
507
 
504
508
  def get_weights_by_name(self, name: str, truncate_size: int = 100):
505
509
  """Get weights by parameter name."""
506
510
  obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
507
- loop = asyncio.get_event_loop()
508
- return loop.run_until_complete(
511
+ return self.loop.run_until_complete(
509
512
  self.tokenizer_manager.get_weights_by_name(obj, None)
510
513
  )
511
514
 
@@ -518,8 +521,7 @@ class Engine(EngineBase):
518
521
  pinned=pinned,
519
522
  )
520
523
 
521
- loop = asyncio.get_event_loop()
522
- return loop.run_until_complete(
524
+ return self.loop.run_until_complete(
523
525
  self.tokenizer_manager.load_lora_adapter(obj, None)
524
526
  )
525
527
 
@@ -528,22 +530,19 @@ class Engine(EngineBase):
528
530
 
529
531
  obj = UnloadLoRAAdapterReqInput(lora_name=lora_name)
530
532
 
531
- loop = asyncio.get_event_loop()
532
- return loop.run_until_complete(
533
+ return self.loop.run_until_complete(
533
534
  self.tokenizer_manager.unload_lora_adapter(obj, None)
534
535
  )
535
536
 
536
537
  def release_memory_occupation(self, tags: Optional[List[str]] = None):
537
538
  obj = ReleaseMemoryOccupationReqInput(tags=tags)
538
- loop = asyncio.get_event_loop()
539
- return loop.run_until_complete(
539
+ return self.loop.run_until_complete(
540
540
  self.tokenizer_manager.release_memory_occupation(obj, None)
541
541
  )
542
542
 
543
543
  def resume_memory_occupation(self, tags: Optional[List[str]] = None):
544
544
  obj = ResumeMemoryOccupationReqInput(tags=tags)
545
- loop = asyncio.get_event_loop()
546
- return loop.run_until_complete(
545
+ return self.loop.run_until_complete(
547
546
  self.tokenizer_manager.resume_memory_occupation(obj, None)
548
547
  )
549
548
 
@@ -560,8 +559,7 @@ class Engine(EngineBase):
560
559
  collection.
561
560
  """
562
561
 
563
- loop = asyncio.get_event_loop()
564
- loop.run_until_complete(self.tokenizer_manager.freeze_gc())
562
+ self.loop.run_until_complete(self.tokenizer_manager.freeze_gc())
565
563
 
566
564
  """
567
565
  Execute an RPC call on all scheduler processes.
@@ -619,8 +617,7 @@ class Engine(EngineBase):
619
617
  ValueError: If query is not provided, or if items is not provided,
620
618
  or if token IDs are out of vocabulary, or if logprobs are not available for the specified tokens.
621
619
  """
622
- loop = asyncio.get_event_loop()
623
- return loop.run_until_complete(
620
+ return self.loop.run_until_complete(
624
621
  self.tokenizer_manager.score_request(
625
622
  query=query,
626
623
  items=items,
@@ -653,6 +650,21 @@ class Engine(EngineBase):
653
650
  request=None,
654
651
  )
655
652
 
653
+ def update_weights_from_ipc(
654
+ self,
655
+ zmq_handles: Dict[str, str],
656
+ flush_cache: bool = True,
657
+ ):
658
+ """Update weights from IPC for checkpoint-engine integration."""
659
+ obj = UpdateWeightsFromIPCReqInput(
660
+ zmq_handles=zmq_handles,
661
+ flush_cache=flush_cache,
662
+ )
663
+ loop = asyncio.get_event_loop()
664
+ return loop.run_until_complete(
665
+ self.tokenizer_manager.update_weights_from_ipc(obj, None)
666
+ )
667
+
656
668
 
657
669
  def _set_envs_and_config(server_args: ServerArgs):
658
670
  # Set global environments
@@ -666,6 +678,13 @@ def _set_envs_and_config(server_args: ServerArgs):
666
678
  if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
667
679
  os.environ["TRTLLM_ENABLE_PDL"] = "1"
668
680
 
681
+ if os.environ.get("CUTE_DSL_LOG_LEVEL") is None:
682
+ # Default to warning level, to avoid too many logs
683
+ os.environ["CUTE_DSL_LOG_LEVEL"] = "30"
684
+ if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None:
685
+ # Need to set log to console, otherwise the log level won't take effect
686
+ os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1"
687
+
669
688
  # Can also be passed as argument
670
689
  os.environ["SGLANG_RUN_ID"] = (
671
690
  f"sglang-run-{time.time()}-{random.randint(0, 100000000)}"
@@ -682,7 +701,7 @@ def _set_envs_and_config(server_args: ServerArgs):
682
701
  if server_args.attention_backend == "flashinfer":
683
702
  assert_pkg_version(
684
703
  "flashinfer_python",
685
- "0.3.1",
704
+ "0.4.1",
686
705
  "Please uninstall the old version and "
687
706
  "reinstall the latest version by following the instructions "
688
707
  "at https://docs.flashinfer.ai/installation.html.",
@@ -690,7 +709,7 @@ def _set_envs_and_config(server_args: ServerArgs):
690
709
  if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
691
710
  assert_pkg_version(
692
711
  "sgl-kernel",
693
- "0.3.9.post2",
712
+ "0.3.16.post3",
694
713
  "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
695
714
  )
696
715
 
@@ -780,23 +799,24 @@ def _launch_subprocesses(
780
799
  + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
781
800
  )
782
801
  moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
783
- proc = mp.Process(
784
- target=run_scheduler_process,
785
- args=(
786
- server_args,
787
- port_args,
788
- gpu_id,
789
- tp_rank,
790
- moe_ep_rank,
791
- pp_rank,
792
- None,
793
- writer,
794
- None,
795
- ),
796
- )
797
802
 
798
- with memory_saver_adapter.configure_subprocess():
799
- proc.start()
803
+ with maybe_reindex_device_id(gpu_id) as gpu_id:
804
+ proc = mp.Process(
805
+ target=run_scheduler_process,
806
+ args=(
807
+ server_args,
808
+ port_args,
809
+ gpu_id,
810
+ tp_rank,
811
+ moe_ep_rank,
812
+ pp_rank,
813
+ None,
814
+ writer,
815
+ ),
816
+ )
817
+ with memory_saver_adapter.configure_subprocess():
818
+ proc.start()
819
+
800
820
  scheduler_procs.append(proc)
801
821
  scheduler_pipe_readers.append(reader)
802
822
  else: