sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -51,6 +51,7 @@ import logging
 import multiprocessing
 import os
 import time
+from types import SimpleNamespace
 from typing import Tuple

 import numpy as np
@@ -71,7 +72,10 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
     configure_logger,
     get_bool_env_var,
+    is_cuda_alike,
+    is_xpu,
     kill_process_tree,
+    maybe_reindex_device_id,
     require_mlp_sync,
     require_mlp_tp_gather,
     set_gpu_proc_affinity,
@@ -79,6 +83,15 @@ from sglang.srt.utils import (
 )
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer

+profile_activities = [torch.profiler.ProfilerActivity.CPU] + [
+    profiler_activity
+    for available, profiler_activity in [
+        (is_cuda_alike(), torch.profiler.ProfilerActivity.CUDA),
+        (is_xpu(), torch.profiler.ProfilerActivity.XPU),
+    ]
+    if available
+]
+

 @dataclasses.dataclass
 class BenchArgs:
@@ -147,7 +160,7 @@ class BenchArgs:
         )


-def load_model(server_args, port_args, tp_rank):
+def load_model(server_args, port_args, gpu_id, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
     moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
@@ -156,7 +169,7 @@ def load_model(server_args, port_args, tp_rank):
     model_runner = ModelRunner(
         model_config=model_config,
         mem_fraction_static=server_args.mem_fraction_static,
-        gpu_id=tp_rank,
+        gpu_id=gpu_id,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
         moe_ep_rank=moe_ep_rank,
@@ -204,7 +217,6 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_prompts):
             origin_input_ids=tmp_input_ids,
             sampling_params=sampling_params,
         )
-        req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
         req.logprob_start_len = len(req.origin_input_ids) - 1
@@ -248,7 +260,6 @@ def prepare_synthetic_inputs_for_latency_test(
             origin_input_ids=list(input_ids[i]),
             sampling_params=sampling_params,
         )
-        req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
         req.logprob_start_len = len(req.origin_input_ids) - 1
@@ -259,11 +270,18 @@

 @torch.no_grad
 def extend(reqs, model_runner):
+    # Create dummy tree_cache for benchmarks (no prefix caching, just allocation)
+    dummy_tree_cache = SimpleNamespace(
+        page_size=model_runner.server_args.page_size,
+        device=model_runner.device,
+        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
+    )
+
     batch = ScheduleBatch.init_new(
         reqs=reqs,
         req_to_token_pool=model_runner.req_to_token_pool,
         token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
-        tree_cache=None,
+        tree_cache=dummy_tree_cache,
         model_config=model_runner.model_config,
         enable_overlap=False,
         spec_algorithm=SpeculativeAlgorithm.NONE,
@@ -302,6 +320,7 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
         speculative_num_draft_tokens=None,
         require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
         disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
+        offload_tags=set(),
     )


@@ -333,6 +352,7 @@ def correctness_test(
     server_args,
     port_args,
     bench_args,
+    gpu_id,
     tp_rank,
 ):
     # Configure the logger
@@ -340,7 +360,7 @@ def correctness_test(
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

     # Load the model
-    model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
+    model_runner, tokenizer = load_model(server_args, port_args, gpu_id, tp_rank)

     # Prepare inputs
     custom_prompts = _read_prompts_from_file(bench_args.prompt_filename, rank_print)
@@ -418,10 +438,7 @@ def latency_test_run_once(
     profiler = None
     if profile:
         profiler = torch.profiler.profile(
-            activities=[
-                torch.profiler.ProfilerActivity.CPU,
-                torch.profiler.ProfilerActivity.CUDA,
-            ],
+            activities=profile_activities,
             with_stack=True,
             record_shapes=profile_record_shapes,
         )
@@ -454,10 +471,7 @@ def latency_test_run_once(
         if profile and i == output_len / 2:
             profiler = None
             profiler = torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                    torch.profiler.ProfilerActivity.CUDA,
-                ],
+                activities=profile_activities,
                 with_stack=True,
                 record_shapes=profile_record_shapes,
             )
@@ -506,20 +520,23 @@ def latency_test(
     server_args,
     port_args,
     bench_args,
+    gpu_id,
     tp_rank,
 ):
     initialize_moe_config(server_args)

     # Set CPU affinity
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
-        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
+        set_gpu_proc_affinity(
+            server_args.pp_size, server_args.tp_size, server_args.nnodes, tp_rank
+        )

     # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

     # Load the model
-    model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
+    model_runner, tokenizer = load_model(server_args, port_args, gpu_id, tp_rank)

     # Prepare inputs for warm up
     reqs = prepare_synthetic_inputs_for_latency_test(
@@ -621,21 +638,23 @@ def main(server_args, bench_args):
     port_args = PortArgs.init_new(server_args)

     if server_args.tp_size == 1:
-        work_func(server_args, port_args, bench_args, 0)
+        work_func(server_args, port_args, bench_args, 0, 0)
     else:
         workers = []
         for tp_rank in range(server_args.tp_size):
-            proc = multiprocessing.Process(
-                target=work_func,
-                args=(
-                    server_args,
-                    port_args,
-                    bench_args,
-                    tp_rank,
-                ),
-            )
-            proc.start()
-            workers.append(proc)
+            with maybe_reindex_device_id(tp_rank) as gpu_id:
+                proc = multiprocessing.Process(
+                    target=work_func,
+                    args=(
+                        server_args,
+                        port_args,
+                        bench_args,
+                        gpu_id,
+                        tp_rank,
+                    ),
+                )
+                proc.start()
+                workers.append(proc)

         for proc in workers:
             proc.join()
sglang/bench_one_batch_server.py CHANGED
@@ -16,6 +16,7 @@ import argparse
 import dataclasses
 import itertools
 import json
+import logging
 import multiprocessing
 import os
 import random
@@ -25,8 +26,10 @@ from typing import List, Optional, Tuple
 import numpy as np
 import requests
 from pydantic import BaseModel
+from transformers import AutoProcessor, PreTrainedTokenizer

 from sglang.bench_serving import (
+    get_processor,
     get_tokenizer,
     sample_mmmu_requests,
     sample_random_requests,
@@ -37,6 +40,8 @@ from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import is_blackwell, kill_process_tree
 from sglang.test.test_utils import is_in_ci, write_github_step_summary

+logger = logging.getLogger(__name__)
+

 class ProfileLinks(BaseModel):
     """Pydantic model for profile trace links."""
@@ -104,8 +109,14 @@ Note: To view the traces through perfetto-ui, please:
         if self.profile_links.extend or self.profile_links.decode:
             # Create a combined link or use the first available one
             trace_files = [self.profile_links.extend, self.profile_links.decode]
+            if any(trace_file is None for trace_file in trace_files):
+                logger.error("Some trace files are None", f"{trace_files=}")
             trace_files_relay_links = [
-                f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                (
+                    f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                    if trace_file
+                    else "N/A"
+                )
                 for trace_file in trace_files
             ]

@@ -114,30 +125,29 @@ Note: To view the traces through perfetto-ui, please:
         # Build the row
         return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"

-    @classmethod
-    def generate_markdown_report(
-        cls, trace_dir, results: List["BenchmarkResult"]
-    ) -> str:
-        """Generate a markdown report from a list of BenchmarkResult object from a single run."""
-        import os

-        summary = f"### {results[0].model_path}\n"
+def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
+    """Generate a markdown report from a list of BenchmarkResult object from a single run."""
+    import os

-        # summary += (
-        #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
-        # )
-        summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
-        summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
+    summary = f"### {results[0].model_path}\n"

-        # all results should share the same isl & osl
-        for result in results:
-            base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
-            relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
-            relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
-            # base_url = "https://github.com/sgl-project/ci-data/traces"
-            summary += result.to_markdown_row(trace_dir, base_url, relay_base)
+    # summary += (
+    #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
+    # )
+    summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
+    summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"

-        return summary
+    # all results should share the same isl & osl
+    for result in results:
+        base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
+        relay_base = os.getenv(
+            "PERFETTO_RELAY_URL",
+            "",
+        ).rstrip("/")
+        summary += result.to_markdown_row(trace_dir, base_url, relay_base)
+
+    return summary


 @dataclasses.dataclass
@@ -288,7 +298,7 @@ def run_one_case(
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
-    tokenizer,
+    tokenizer: PreTrainedTokenizer | AutoProcessor,
     dataset_name="",
     profile: bool = False,
     profile_steps: int = 3,
@@ -302,9 +312,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         input_requests = sample_mmmu_requests(
             num_requests=batch_size,
-            tokenizer=tokenizer,
+            processor=tokenizer,
             fixed_output_len=output_len,
-            apply_chat_template=True,
             random_sample=False,
         )
     elif dataset_name == "random":
@@ -364,6 +373,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         # vlm
         input_ids = []
+        # for vlms, tokenizer is an instance of AutoProcessor
+        tokenizer = tokenizer.tokenizer
         for input_req in input_requests:
             input_ids += [tokenizer.encode(input_req.prompt)]
         payload["image_data"] = [req.image_data for req in input_requests]
@@ -609,7 +620,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         tokenizer_path = server_info["tokenizer_path"]
     elif "prefill" in server_info:
         tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
-    tokenizer = get_tokenizer(tokenizer_path)
+
+    if bench_args.dataset_name == "mmmu":
+        # mmmu implies this is a MLLM
+        tokenizer = get_processor(tokenizer_path)
+    else:
+        tokenizer = get_tokenizer(tokenizer_path)

     # warmup
     if not bench_args.skip_warmup: