sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -30,8 +30,6 @@ import time
30
30
  from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union
31
31
 
32
32
  import zmq
33
- import zmq.asyncio
34
- from PIL.Image import Image
35
33
 
36
34
  from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
37
35
 
@@ -61,6 +59,7 @@ from sglang.srt.managers.io_struct import (
61
59
  UnloadLoRAAdapterReqInput,
62
60
  UpdateWeightFromDiskReqInput,
63
61
  UpdateWeightsFromDistributedReqInput,
62
+ UpdateWeightsFromIPCReqInput,
64
63
  UpdateWeightsFromTensorReqInput,
65
64
  )
66
65
  from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
@@ -68,7 +67,6 @@ from sglang.srt.managers.scheduler import run_scheduler_process
68
67
  from sglang.srt.managers.template_manager import TemplateManager
69
68
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
70
69
  from sglang.srt.server_args import PortArgs, ServerArgs
71
- from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
72
70
  from sglang.srt.utils import (
73
71
  MultiprocessingSerializer,
74
72
  assert_pkg_version,
@@ -78,10 +76,12 @@ from sglang.srt.utils import (
78
76
  is_cuda,
79
77
  kill_process_tree,
80
78
  launch_dummy_health_check_server,
79
+ maybe_reindex_device_id,
81
80
  prepare_model_and_tokenizer,
82
81
  set_prometheus_multiproc_dir,
83
82
  set_ulimit,
84
83
  )
84
+ from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
85
85
  from sglang.version import __version__
86
86
 
87
87
  logger = logging.getLogger(__name__)
@@ -147,6 +147,12 @@ class Engine(EngineBase):
147
147
  thread_label = "Tokenizer"
148
148
  trace_set_thread_info(thread_label)
149
149
 
150
+ try:
151
+ self.loop = asyncio.get_running_loop()
152
+ except RuntimeError:
153
+ self.loop = asyncio.new_event_loop()
154
+ asyncio.set_event_loop(self.loop)
155
+
150
156
  def generate(
151
157
  self,
152
158
  # The input prompt. It can be a single prompt or a batch of prompts.
@@ -210,7 +216,6 @@ class Engine(EngineBase):
210
216
  bootstrap_room=bootstrap_room,
211
217
  data_parallel_rank=data_parallel_rank,
212
218
  )
213
- loop = asyncio.get_event_loop()
214
219
  generator = self.tokenizer_manager.generate_request(obj, None)
215
220
 
216
221
  if stream:
@@ -218,14 +223,14 @@ class Engine(EngineBase):
218
223
  def generator_wrapper():
219
224
  while True:
220
225
  try:
221
- chunk = loop.run_until_complete(generator.__anext__())
226
+ chunk = self.loop.run_until_complete(generator.__anext__())
222
227
  yield chunk
223
228
  except StopAsyncIteration:
224
229
  break
225
230
 
226
231
  return generator_wrapper()
227
232
  else:
228
- ret = loop.run_until_complete(generator.__anext__())
233
+ ret = self.loop.run_until_complete(generator.__anext__())
229
234
  return ret
230
235
 
231
236
  async def async_generate(
@@ -317,9 +322,8 @@ class Engine(EngineBase):
317
322
  audio_data=audio_data,
318
323
  video_data=video_data,
319
324
  )
320
- loop = asyncio.get_event_loop()
321
325
  generator = self.tokenizer_manager.generate_request(obj, None)
322
- ret = loop.run_until_complete(generator.__anext__())
326
+ ret = self.loop.run_until_complete(generator.__anext__())
323
327
  return ret
324
328
 
325
329
  async def async_encode(
@@ -353,9 +357,8 @@ class Engine(EngineBase):
353
357
  Please refer to `EmbeddingReqInput` for the documentation.
354
358
  """
355
359
  obj = EmbeddingReqInput(text=prompt, is_cross_encoder_request=True)
356
- loop = asyncio.get_event_loop()
357
360
  generator = self.tokenizer_manager.generate_request(obj, None)
358
- ret = loop.run_until_complete(generator.__anext__())
361
+ ret = self.loop.run_until_complete(generator.__anext__())
359
362
  return ret
360
363
 
361
364
  def shutdown(self):
@@ -370,38 +373,31 @@ class Engine(EngineBase):
370
373
  return False
371
374
 
372
375
  def flush_cache(self):
373
- loop = asyncio.get_event_loop()
374
- return loop.run_until_complete(self.tokenizer_manager.flush_cache())
376
+ return self.loop.run_until_complete(self.tokenizer_manager.flush_cache())
375
377
 
376
378
  def start_profile(self, **kwargs):
377
- loop = asyncio.get_event_loop()
378
- loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
379
+ self.loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
379
380
 
380
381
  def stop_profile(self):
381
- loop = asyncio.get_event_loop()
382
- loop.run_until_complete(self.tokenizer_manager.stop_profile())
382
+ self.loop.run_until_complete(self.tokenizer_manager.stop_profile())
383
383
 
384
384
  def start_expert_distribution_record(self):
385
- loop = asyncio.get_event_loop()
386
- loop.run_until_complete(
385
+ self.loop.run_until_complete(
387
386
  self.tokenizer_manager.start_expert_distribution_record()
388
387
  )
389
388
 
390
389
  def stop_expert_distribution_record(self):
391
- loop = asyncio.get_event_loop()
392
- loop.run_until_complete(
390
+ self.loop.run_until_complete(
393
391
  self.tokenizer_manager.stop_expert_distribution_record()
394
392
  )
395
393
 
396
394
  def dump_expert_distribution_record(self):
397
- loop = asyncio.get_event_loop()
398
- loop.run_until_complete(
395
+ self.loop.run_until_complete(
399
396
  self.tokenizer_manager.dump_expert_distribution_record()
400
397
  )
401
398
 
402
399
  def get_server_info(self):
403
- loop = asyncio.get_event_loop()
404
- internal_states = loop.run_until_complete(
400
+ internal_states = self.loop.run_until_complete(
405
401
  self.tokenizer_manager.get_internal_state()
406
402
  )
407
403
  return {
@@ -429,8 +425,7 @@ class Engine(EngineBase):
429
425
  group_name=group_name,
430
426
  backend=backend,
431
427
  )
432
- loop = asyncio.get_event_loop()
433
- return loop.run_until_complete(
428
+ return self.loop.run_until_complete(
434
429
  self.tokenizer_manager.init_weights_update_group(obj, None)
435
430
  )
436
431
 
@@ -442,8 +437,7 @@ class Engine(EngineBase):
442
437
  obj = DestroyWeightsUpdateGroupReqInput(
443
438
  group_name=group_name,
444
439
  )
445
- loop = asyncio.get_event_loop()
446
- return loop.run_until_complete(
440
+ return self.loop.run_until_complete(
447
441
  self.tokenizer_manager.destroy_weights_update_group(obj, None)
448
442
  )
449
443
 
@@ -463,8 +457,7 @@ class Engine(EngineBase):
463
457
  group_name=group_name,
464
458
  flush_cache=flush_cache,
465
459
  )
466
- loop = asyncio.get_event_loop()
467
- return loop.run_until_complete(
460
+ return self.loop.run_until_complete(
468
461
  self.tokenizer_manager.update_weights_from_distributed(obj, None)
469
462
  )
470
463
 
@@ -488,9 +481,7 @@ class Engine(EngineBase):
488
481
  load_format=load_format,
489
482
  flush_cache=flush_cache,
490
483
  )
491
- loop = asyncio.get_event_loop()
492
-
493
- return loop.run_until_complete(
484
+ return self.loop.run_until_complete(
494
485
  self.tokenizer_manager.update_weights_from_tensor(obj, None)
495
486
  )
496
487
 
@@ -510,16 +501,14 @@ class Engine(EngineBase):
510
501
  load_format=load_format,
511
502
  )
512
503
 
513
- loop = asyncio.get_event_loop()
514
- return loop.run_until_complete(
504
+ return self.loop.run_until_complete(
515
505
  self.tokenizer_manager.update_weights_from_disk(obj, None)
516
506
  )
517
507
 
518
508
  def get_weights_by_name(self, name: str, truncate_size: int = 100):
519
509
  """Get weights by parameter name."""
520
510
  obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
521
- loop = asyncio.get_event_loop()
522
- return loop.run_until_complete(
511
+ return self.loop.run_until_complete(
523
512
  self.tokenizer_manager.get_weights_by_name(obj, None)
524
513
  )
525
514
 
@@ -532,8 +521,7 @@ class Engine(EngineBase):
532
521
  pinned=pinned,
533
522
  )
534
523
 
535
- loop = asyncio.get_event_loop()
536
- return loop.run_until_complete(
524
+ return self.loop.run_until_complete(
537
525
  self.tokenizer_manager.load_lora_adapter(obj, None)
538
526
  )
539
527
 
@@ -542,22 +530,19 @@ class Engine(EngineBase):
542
530
 
543
531
  obj = UnloadLoRAAdapterReqInput(lora_name=lora_name)
544
532
 
545
- loop = asyncio.get_event_loop()
546
- return loop.run_until_complete(
533
+ return self.loop.run_until_complete(
547
534
  self.tokenizer_manager.unload_lora_adapter(obj, None)
548
535
  )
549
536
 
550
537
  def release_memory_occupation(self, tags: Optional[List[str]] = None):
551
538
  obj = ReleaseMemoryOccupationReqInput(tags=tags)
552
- loop = asyncio.get_event_loop()
553
- return loop.run_until_complete(
539
+ return self.loop.run_until_complete(
554
540
  self.tokenizer_manager.release_memory_occupation(obj, None)
555
541
  )
556
542
 
557
543
  def resume_memory_occupation(self, tags: Optional[List[str]] = None):
558
544
  obj = ResumeMemoryOccupationReqInput(tags=tags)
559
- loop = asyncio.get_event_loop()
560
- return loop.run_until_complete(
545
+ return self.loop.run_until_complete(
561
546
  self.tokenizer_manager.resume_memory_occupation(obj, None)
562
547
  )
563
548
 
@@ -574,8 +559,7 @@ class Engine(EngineBase):
574
559
  collection.
575
560
  """
576
561
 
577
- loop = asyncio.get_event_loop()
578
- loop.run_until_complete(self.tokenizer_manager.freeze_gc())
562
+ self.loop.run_until_complete(self.tokenizer_manager.freeze_gc())
579
563
 
580
564
  """
581
565
  Execute an RPC call on all scheduler processes.
@@ -633,8 +617,7 @@ class Engine(EngineBase):
633
617
  ValueError: If query is not provided, or if items is not provided,
634
618
  or if token IDs are out of vocabulary, or if logprobs are not available for the specified tokens.
635
619
  """
636
- loop = asyncio.get_event_loop()
637
- return loop.run_until_complete(
620
+ return self.loop.run_until_complete(
638
621
  self.tokenizer_manager.score_request(
639
622
  query=query,
640
623
  items=items,
@@ -667,6 +650,21 @@ class Engine(EngineBase):
667
650
  request=None,
668
651
  )
669
652
 
653
+ def update_weights_from_ipc(
654
+ self,
655
+ zmq_handles: Dict[str, str],
656
+ flush_cache: bool = True,
657
+ ):
658
+ """Update weights from IPC for checkpoint-engine integration."""
659
+ obj = UpdateWeightsFromIPCReqInput(
660
+ zmq_handles=zmq_handles,
661
+ flush_cache=flush_cache,
662
+ )
663
+ loop = asyncio.get_event_loop()
664
+ return loop.run_until_complete(
665
+ self.tokenizer_manager.update_weights_from_ipc(obj, None)
666
+ )
667
+
670
668
 
671
669
  def _set_envs_and_config(server_args: ServerArgs):
672
670
  # Set global environments
@@ -703,7 +701,7 @@ def _set_envs_and_config(server_args: ServerArgs):
703
701
  if server_args.attention_backend == "flashinfer":
704
702
  assert_pkg_version(
705
703
  "flashinfer_python",
706
- "0.4.0rc3",
704
+ "0.4.1",
707
705
  "Please uninstall the old version and "
708
706
  "reinstall the latest version by following the instructions "
709
707
  "at https://docs.flashinfer.ai/installation.html.",
@@ -711,7 +709,7 @@ def _set_envs_and_config(server_args: ServerArgs):
711
709
  if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
712
710
  assert_pkg_version(
713
711
  "sgl-kernel",
714
- "0.3.14",
712
+ "0.3.16.post3",
715
713
  "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
716
714
  )
717
715
 
@@ -801,22 +799,24 @@ def _launch_subprocesses(
801
799
  + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
802
800
  )
803
801
  moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
804
- proc = mp.Process(
805
- target=run_scheduler_process,
806
- args=(
807
- server_args,
808
- port_args,
809
- gpu_id,
810
- tp_rank,
811
- moe_ep_rank,
812
- pp_rank,
813
- None,
814
- writer,
815
- ),
816
- )
817
802
 
818
- with memory_saver_adapter.configure_subprocess():
819
- proc.start()
803
+ with maybe_reindex_device_id(gpu_id) as gpu_id:
804
+ proc = mp.Process(
805
+ target=run_scheduler_process,
806
+ args=(
807
+ server_args,
808
+ port_args,
809
+ gpu_id,
810
+ tp_rank,
811
+ moe_ep_rank,
812
+ pp_rank,
813
+ None,
814
+ writer,
815
+ ),
816
+ )
817
+ with memory_saver_adapter.configure_subprocess():
818
+ proc.start()
819
+
820
820
  scheduler_procs.append(proc)
821
821
  scheduler_pipe_readers.append(reader)
822
822
  else: