sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419) hide show
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,74 @@
1
+ import unittest
2
+
3
+ from sglang.srt.utils import kill_process_tree
4
+ from sglang.test.test_deterministic import BenchArgs, test_deterministic
5
+ from sglang.test.test_utils import (
6
+ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
7
+ DEFAULT_URL_FOR_TEST,
8
+ CustomTestCase,
9
+ popen_launch_server,
10
+ )
11
+
12
+ DEFAULT_MODEL = "Qwen/Qwen3-8B"
13
+ COMMON_SERVER_ARGS = [
14
+ "--trust-remote-code",
15
+ "--cuda-graph-max-bs",
16
+ "32",
17
+ "--enable-deterministic-inference",
18
+ ]
19
+
20
+
21
+ class TestDeterministicBase(CustomTestCase):
22
+ @classmethod
23
+ def get_server_args(cls):
24
+ return COMMON_SERVER_ARGS
25
+
26
+ @classmethod
27
+ def get_model(cls):
28
+ return DEFAULT_MODEL
29
+
30
+ @classmethod
31
+ def setUpClass(cls):
32
+ cls.model = cls.get_model()
33
+ cls.base_url = DEFAULT_URL_FOR_TEST
34
+ if "--attention-backend" not in cls.get_server_args():
35
+ raise unittest.SkipTest("Skip the base test class")
36
+
37
+ cls.process = popen_launch_server(
38
+ cls.model,
39
+ cls.base_url,
40
+ timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
41
+ other_args=cls.get_server_args(),
42
+ )
43
+
44
+ @classmethod
45
+ def tearDownClass(cls):
46
+ kill_process_tree(cls.process.pid)
47
+
48
+ def _extract_host_and_port(self, url):
49
+ return url.split("://")[-1].split(":")[0], int(url.split(":")[-1])
50
+
51
+ def test_single(self):
52
+ args = BenchArgs()
53
+ url = DEFAULT_URL_FOR_TEST
54
+ args.host, args.port = self._extract_host_and_port(url)
55
+ args.test_mode = "single"
56
+ args.n_start = 10
57
+ args.n_trials = 20
58
+ results = test_deterministic(args)
59
+ args.temperature = 0.5 # test for deterministic sampling
60
+ for result in results:
61
+ assert result == 1
62
+
63
+ def test_prefix_with_logprobs(self):
64
+ args = BenchArgs()
65
+ url = DEFAULT_URL_FOR_TEST
66
+ args.host, args.port = self._extract_host_and_port(url)
67
+ args.test_mode = "prefix"
68
+ args.n_start = 10
69
+ args.n_trials = 10
70
+ args.temperature = 0.5 # test for deterministic sampling
71
+ args.return_logprob = True # Enable logprobs comparison
72
+ results = test_deterministic(args)
73
+ for result in results:
74
+ assert result == 1
@@ -1,16 +1,23 @@
1
+ import logging
2
+ import os
1
3
  import time
4
+ import warnings
2
5
  from urllib.parse import urlparse
3
6
 
4
7
  import requests
5
8
 
9
+ from sglang.srt.environ import envs
6
10
  from sglang.srt.utils import kill_process_tree
7
11
  from sglang.test.test_utils import (
8
12
  DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
9
13
  DEFAULT_URL_FOR_TEST,
10
14
  CustomTestCase,
15
+ is_in_ci,
11
16
  popen_with_error_check,
12
17
  )
13
18
 
19
+ logger = logging.getLogger(__name__)
20
+
14
21
 
15
22
  class TestDisaggregationBase(CustomTestCase):
16
23
  @classmethod
@@ -27,6 +34,24 @@ class TestDisaggregationBase(CustomTestCase):
27
34
  print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
28
35
  cls.process_lb, cls.process_decode, cls.process_prefill = None, None, None
29
36
 
37
+ # config transfer backend and rdma devices
38
+ if is_in_ci():
39
+ cls.transfer_backend = ["--disaggregation-transfer-backend", "mooncake"]
40
+ cls.rdma_devices = ["--disaggregation-ib-device", get_rdma_devices_args()]
41
+ else:
42
+ cls.transfer_backend = [
43
+ "--disaggregation-transfer-backend",
44
+ envs.SGLANG_TEST_PD_DISAGG_BACKEND.get(),
45
+ ]
46
+ cls.rdma_devices = [
47
+ "--disaggregation-ib-device",
48
+ envs.SGLANG_TEST_PD_DISAGG_DEVICES.get(),
49
+ ]
50
+ if cls.rdma_devices[1] is None:
51
+ cls.rdma_devices = []
52
+ msg = "No RDMA devices specified for disaggregation test, using default settings."
53
+ warnings.warn(msg)
54
+
30
55
  @classmethod
31
56
  def launch_lb(cls):
32
57
  lb_command = [
@@ -75,3 +100,59 @@ class TestDisaggregationBase(CustomTestCase):
75
100
 
76
101
  # wait for 5 seconds
77
102
  time.sleep(5)
103
+
104
+
105
+ def get_rdma_devices_args():
106
+ def _parse_list_env(var_name: str):
107
+ val = os.getenv(var_name)
108
+ if not val:
109
+ return None
110
+ items = [x.strip() for x in val.split(",") if x.strip()]
111
+ return items or None
112
+
113
+ def _pick_default_pair(rdma_all_devices):
114
+ return [rdma_all_devices[0], rdma_all_devices[len(rdma_all_devices) // 2]]
115
+
116
+ rdma_all_devices = _parse_list_env("SGLANG_CI_RDMA_ALL_DEVICES") or [
117
+ f"mlx5_roce{i}" for i in range(8)
118
+ ]
119
+ logger.info("Resolved rdma_all_devices=%s", rdma_all_devices)
120
+
121
+ n_rdma = len(rdma_all_devices)
122
+
123
+ # 1. Get visible GPU indices
124
+ cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
125
+ if not cuda_visible_devices:
126
+ warnings.warn("CUDA_VISIBLE_DEVICES is not set. Using default RDMA devices.")
127
+ return ",".join(_pick_default_pair(rdma_all_devices))
128
+
129
+ try:
130
+ # Convert to list of integers (handling possible spaces and empty strings)
131
+ gpu_indices = [
132
+ int(idx.strip()) for idx in cuda_visible_devices.split(",") if idx.strip()
133
+ ]
134
+ if not gpu_indices or len(gpu_indices) > 4:
135
+ return ",".join(_pick_default_pair(rdma_all_devices))
136
+ except ValueError:
137
+ warnings.warn(f"Invalid CUDA_VISIBLE_DEVICES format: {cuda_visible_devices}")
138
+ return ",".join(_pick_default_pair(rdma_all_devices))
139
+
140
+ # 2. Calculate base RDMA index group (each group of 4 GPUs uses consecutive devices)
141
+ base_rdma_group = (min(gpu_indices) // 4) * 4
142
+ for gpu_idx in gpu_indices:
143
+ if not (base_rdma_group <= gpu_idx < base_rdma_group + 4):
144
+ warnings.warn(
145
+ f"GPU index {gpu_idx} is outside expected group "
146
+ f"{base_rdma_group}-{base_rdma_group+3}"
147
+ )
148
+
149
+ # 3. Generate RDMA device names
150
+ rdma_devices = []
151
+ for gpu_idx in gpu_indices:
152
+ nic_index = gpu_idx // (8 // n_rdma)
153
+ rdma_devices.append(rdma_all_devices[nic_index])
154
+
155
+ if not rdma_devices:
156
+ return ",".join(_pick_default_pair(rdma_all_devices))
157
+
158
+ return ",".join(rdma_devices)
@@ -1,4 +1,3 @@
1
- import types
2
1
  from typing import Optional
3
2
 
4
3
  import pytest
sglang/test/test_utils.py CHANGED
@@ -16,11 +16,10 @@ import unittest
16
16
  from concurrent.futures import ThreadPoolExecutor
17
17
  from dataclasses import dataclass
18
18
  from datetime import datetime
19
- from functools import partial
19
+ from functools import partial, wraps
20
20
  from pathlib import Path
21
21
  from types import SimpleNamespace
22
22
  from typing import Any, Awaitable, Callable, List, Optional, Tuple
23
- from urllib.parse import quote
24
23
 
25
24
  import aiohttp
26
25
  import numpy as np
@@ -76,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
76
75
  DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
77
76
  DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
78
77
 
78
+ # INT4 models
79
+ DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
80
+ "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
81
+ )
82
+
79
83
  # EAGLE
80
84
  DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
81
85
  DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
@@ -122,7 +126,12 @@ def is_in_ci():
122
126
 
123
127
  def is_in_amd_ci():
124
128
  """Return whether it is in an AMD CI runner."""
125
- return get_bool_env_var("SGLANG_AMD_CI")
129
+ return get_bool_env_var("SGLANG_IS_IN_CI_AMD")
130
+
131
+
132
+ def is_blackwell_system():
133
+ """Return whether it is running on a Blackwell (B200) system."""
134
+ return get_bool_env_var("IS_BLACKWELL")
126
135
 
127
136
 
128
137
  def _use_cached_default_models(model_repo: str):
@@ -136,17 +145,20 @@ def _use_cached_default_models(model_repo: str):
136
145
 
137
146
  if is_in_ci():
138
147
  DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
139
- 5000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
148
+ 10000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 1000
140
149
  )
141
150
  else:
142
151
  DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
143
- 7000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
152
+ 20000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 1000
144
153
  )
145
154
  DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
146
155
 
147
156
  if is_in_amd_ci():
148
157
  DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000
149
158
 
159
+ if is_blackwell_system():
160
+ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000
161
+
150
162
 
151
163
  def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
152
164
  assert url is not None
@@ -397,8 +409,6 @@ def _get_call_generate(args: argparse.Namespace):
397
409
  return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
398
410
  elif args.backend == "srt-raw":
399
411
  return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
400
- elif args.backend == "gserver":
401
- return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
402
412
  elif args.backend == "outlines":
403
413
  return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
404
414
  elif args.backend == "guidance":
@@ -504,11 +514,12 @@ def popen_launch_server(
504
514
  base_url: str,
505
515
  timeout: float,
506
516
  api_key: Optional[str] = None,
507
- other_args: list[str] = [],
517
+ other_args: Optional[list[str]] = None,
508
518
  env: Optional[dict] = None,
509
519
  return_stdout_stderr: Optional[tuple] = None,
510
520
  device: str = "auto",
511
521
  pd_separated: bool = False,
522
+ num_replicas: Optional[int] = None,
512
523
  ):
513
524
  """Launch a server process with automatic device detection.
514
525
 
@@ -516,17 +527,19 @@ def popen_launch_server(
516
527
  device: Device type ("auto", "cuda", "rocm" or "cpu").
517
528
  If "auto", will detect available platforms automatically.
518
529
  """
530
+ other_args = other_args or []
531
+
519
532
  # Auto-detect device if needed
520
533
  if device == "auto":
521
534
  device = auto_config_device()
522
- print(f"Auto-configed device: {device}", flush=True)
523
535
  other_args = list(other_args)
524
536
  other_args += ["--device", str(device)]
525
537
 
526
538
  _, host, port = base_url.split(":")
527
539
  host = host[2:]
528
540
 
529
- if pd_separated:
541
+ use_mixed_pd_engine = not pd_separated and num_replicas is not None
542
+ if pd_separated or use_mixed_pd_engine:
530
543
  command = "sglang.launch_pd_server"
531
544
  else:
532
545
  command = "sglang.launch_server"
@@ -540,7 +553,7 @@ def popen_launch_server(
540
553
  *[str(x) for x in other_args],
541
554
  ]
542
555
 
543
- if pd_separated:
556
+ if pd_separated or use_mixed_pd_engine:
544
557
  command.extend(
545
558
  [
546
559
  "--lb-host",
@@ -559,6 +572,15 @@ def popen_launch_server(
559
572
  ]
560
573
  )
561
574
 
575
+ if use_mixed_pd_engine:
576
+ command.extend(
577
+ [
578
+ "--mixed",
579
+ "--num-replicas",
580
+ str(num_replicas),
581
+ ]
582
+ )
583
+
562
584
  if api_key:
563
585
  command += ["--api-key", api_key]
564
586
 
@@ -597,7 +619,6 @@ def popen_launch_server(
597
619
  start_time = time.perf_counter()
598
620
  with requests.Session() as session:
599
621
  while time.perf_counter() - start_time < timeout:
600
-
601
622
  return_code = process.poll()
602
623
  if return_code is not None:
603
624
  # Server failed to start (non-zero exit code) or crashed
@@ -1149,7 +1170,7 @@ def run_bench_offline_throughput(model, other_args):
1149
1170
  *[str(x) for x in other_args],
1150
1171
  ]
1151
1172
 
1152
- print(f"{command=}")
1173
+ print(f"command={' '.join(command)}")
1153
1174
  process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1154
1175
 
1155
1176
  try:
@@ -1608,6 +1629,9 @@ class CustomTestCase(unittest.TestCase):
1608
1629
  max_retry=max_retry,
1609
1630
  )
1610
1631
 
1632
+ def setUp(self):
1633
+ print(f"[Test Method] {self._testMethodName}", flush=True)
1634
+
1611
1635
 
1612
1636
  def dump_bench_raw_result(
1613
1637
  path: str,
@@ -1641,15 +1665,26 @@ def _ensure_remove_suffix(text: str, suffix: str):
1641
1665
  return text.removesuffix(suffix)
1642
1666
 
1643
1667
 
1644
- class ModelDeploySetup:
1645
- def __init__(self, model_path: str, extra_args: List[str] = []):
1668
+ class ModelLaunchSettings:
1669
+ def __init__(
1670
+ self,
1671
+ model_path: str,
1672
+ tp_size: int = 1,
1673
+ extra_args: Optional[List[str]] = None,
1674
+ env: Optional[dict] = None,
1675
+ ):
1646
1676
  self.model_path = model_path
1647
- if "--enable-multimodal" not in extra_args:
1648
- extra_args.append("--enable-multimodal")
1649
- if "--trust-remote-code" not in extra_args:
1650
- extra_args.append("--trust-remote-code")
1677
+ self.tp_size = tp_size
1678
+ self.extra_args = list(extra_args) if extra_args else []
1679
+ self.env = env
1680
+
1681
+ if self.tp_size > 1 and "--tp" not in self.extra_args:
1682
+ self.extra_args.extend(["--tp", str(self.tp_size)])
1651
1683
 
1652
- self.extra_args = extra_args
1684
+ fixed_args = ["--enable-multimodal", "--trust-remote-code"]
1685
+ for fixed_arg in fixed_args:
1686
+ if fixed_arg not in self.extra_args:
1687
+ self.extra_args.append(fixed_arg)
1653
1688
 
1654
1689
 
1655
1690
  class ModelEvalMetrics:
@@ -1782,3 +1817,33 @@ def write_results_to_json(model, metrics, mode="a"):
1782
1817
 
1783
1818
  with open("results.json", "w") as f:
1784
1819
  json.dump(existing_results, f, indent=2)
1820
+
1821
+
1822
+ def intel_amx_benchmark(extra_args=None, min_throughput=None):
1823
+ def decorator(test_func):
1824
+ @wraps(test_func)
1825
+ def wrapper(self):
1826
+ common_args = [
1827
+ "--attention-backend",
1828
+ "intel_amx",
1829
+ "--disable-radix",
1830
+ "--trust-remote-code",
1831
+ ]
1832
+ full_args = common_args + (extra_args or [])
1833
+
1834
+ model = test_func(self)
1835
+ prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
1836
+ model, full_args
1837
+ )
1838
+
1839
+ print(f"{model=}")
1840
+ print(f"{prefill_latency=}")
1841
+ print(f"{decode_throughput=}")
1842
+ print(f"{decode_latency=}")
1843
+
1844
+ if is_in_ci() and min_throughput is not None:
1845
+ self.assertGreater(decode_throughput, min_throughput)
1846
+
1847
+ return wrapper
1848
+
1849
+ return decorator
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.3rc2"
1
+ __version__ = "0.5.4.post1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.5.3rc2
3
+ Version: 0.5.4.post1
4
4
  Summary: SGLang is a fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -218,10 +218,12 @@ Requires-Dist: blobfile==3.0.0
218
218
  Requires-Dist: build
219
219
  Requires-Dist: compressed-tensors
220
220
  Requires-Dist: cuda-python
221
+ Requires-Dist: decord2
221
222
  Requires-Dist: datasets
222
223
  Requires-Dist: einops
223
224
  Requires-Dist: fastapi
224
- Requires-Dist: flashinfer_python==0.4.0rc3
225
+ Requires-Dist: flashinfer_python==0.4.1
226
+ Requires-Dist: gguf
225
227
  Requires-Dist: hf_transfer
226
228
  Requires-Dist: huggingface_hub
227
229
  Requires-Dist: interegular
@@ -243,31 +245,33 @@ Requires-Dist: psutil
243
245
  Requires-Dist: py-spy
244
246
  Requires-Dist: pybase64
245
247
  Requires-Dist: pydantic
246
- Requires-Dist: pynvml
248
+ Requires-Dist: nvidia-ml-py
247
249
  Requires-Dist: python-multipart
248
250
  Requires-Dist: pyzmq>=25.1.2
249
251
  Requires-Dist: requests
250
252
  Requires-Dist: scipy
251
253
  Requires-Dist: sentencepiece
252
254
  Requires-Dist: setproctitle
253
- Requires-Dist: sgl-kernel==0.3.14.post1
255
+ Requires-Dist: sgl-kernel==0.3.16.post4
254
256
  Requires-Dist: soundfile==0.13.1
255
257
  Requires-Dist: tiktoken
256
258
  Requires-Dist: timm==1.0.16
257
259
  Requires-Dist: torch==2.8.0
258
- Requires-Dist: torch_memory_saver==0.0.9rc2
260
+ Requires-Dist: torch_memory_saver==0.0.9
259
261
  Requires-Dist: torchao==0.9.0
260
262
  Requires-Dist: torchaudio==2.8.0
261
263
  Requires-Dist: torchvision
262
264
  Requires-Dist: tqdm
263
- Requires-Dist: transformers==4.57.0
265
+ Requires-Dist: transformers==4.57.1
264
266
  Requires-Dist: uvicorn
265
267
  Requires-Dist: uvloop
266
- Requires-Dist: xgrammar==0.1.24
268
+ Requires-Dist: xgrammar==0.1.25
267
269
  Requires-Dist: grpcio==1.75.1
268
270
  Requires-Dist: grpcio-tools==1.75.1
269
- Provides-Extra: decord
270
- Requires-Dist: decord; extra == "decord"
271
+ Requires-Dist: grpcio-reflection==1.75.1
272
+ Requires-Dist: grpcio-health-checking==1.75.1
273
+ Provides-Extra: modelopt
274
+ Requires-Dist: nvidia-modelopt; extra == "modelopt"
271
275
  Provides-Extra: test
272
276
  Requires-Dist: accelerate; extra == "test"
273
277
  Requires-Dist: expecttest; extra == "test"
@@ -278,24 +282,28 @@ Requires-Dist: peft; extra == "test"
278
282
  Requires-Dist: pytest; extra == "test"
279
283
  Requires-Dist: sentence_transformers; extra == "test"
280
284
  Requires-Dist: tabulate; extra == "test"
285
+ Provides-Extra: checkpoint-engine
286
+ Requires-Dist: checkpoint-engine==0.1.2; extra == "checkpoint-engine"
287
+ Provides-Extra: all
288
+ Provides-Extra: dev
289
+ Requires-Dist: sglang[test]; extra == "dev"
290
+ Provides-Extra: cu130
291
+ Requires-Dist: torch==2.9.0; extra == "cu130"
292
+ Requires-Dist: torchaudio==2.9.0; extra == "cu130"
293
+ Requires-Dist: torchvision==0.24.0; extra == "cu130"
294
+ Provides-Extra: cu130-all
295
+ Requires-Dist: sglang[test]; extra == "cu130-all"
296
+ Requires-Dist: sglang[decord]; extra == "cu130-all"
297
+ Requires-Dist: sglang[cu130]; extra == "cu130-all"
281
298
  Provides-Extra: tracing
282
299
  Requires-Dist: opentelemetry-api; extra == "tracing"
283
300
  Requires-Dist: opentelemetry-exporter-otlp; extra == "tracing"
284
301
  Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
285
302
  Requires-Dist: opentelemetry-sdk; extra == "tracing"
286
- Provides-Extra: all
287
- Requires-Dist: sglang[test]; extra == "all"
288
- Requires-Dist: sglang[decord]; extra == "all"
289
- Provides-Extra: all-aarch64
290
- Requires-Dist: sglang[test]; extra == "all-aarch64"
291
- Provides-Extra: dev
292
- Requires-Dist: sglang[test]; extra == "dev"
293
- Requires-Dist: sglang[decord]; extra == "dev"
294
303
  Provides-Extra: blackwell
295
- Requires-Dist: sglang[test]; extra == "blackwell"
296
- Requires-Dist: sglang[decord]; extra == "blackwell"
304
+ Requires-Dist: sglang[dev]; extra == "blackwell"
297
305
  Provides-Extra: blackwell-aarch64
298
- Requires-Dist: sglang[test]; extra == "blackwell-aarch64"
306
+ Requires-Dist: sglang[dev]; extra == "blackwell-aarch64"
299
307
  Dynamic: license-file
300
308
 
301
309
  <div align="center" id="sglangtop">
@@ -312,7 +320,7 @@ Dynamic: license-file
312
320
 
313
321
  --------------------------------------------------------------------------------
314
322
 
315
- | [**Blog**](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
323
+ | [**Blog**](https://lmsys.org/blog/)
316
324
  | [**Documentation**](https://docs.sglang.ai/)
317
325
  | [**Join Slack**](https://slack.sglang.ai/)
318
326
  | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
@@ -320,18 +328,21 @@ Dynamic: license-file
320
328
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
321
329
 
322
330
  ## News
323
- - [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
324
- - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
325
- - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
326
- - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
327
- - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
328
- - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
331
+ - [2025/10] 🔥 AMD AI Dev Day 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_amd_ai_devday_2025.pdf)), PyTorch Conference 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_pytorch_2025.pdf)).
332
+ - [2025/09] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part II): 3.8x Prefill, 4.8x Decode Throughput ([blog](https://lmsys.org/blog/2025-09-25-gb200-part-2/)).
333
+ - [2025/09] SGLang Day 0 Support for DeepSeek-V3.2 with Sparse Attention ([blog](https://lmsys.org/blog/2025-09-29-deepseek-V32/)).
334
+ - [2025/08] SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
335
+ - [2025/08] SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
336
+ - [2025/05] Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
329
337
  - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
330
338
  - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
331
339
 
332
340
  <details>
333
341
  <summary>More</summary>
334
342
 
343
+ - [2025/06] SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
344
+ - [2025/06] Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
345
+ - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
335
346
  - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
336
347
  - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
337
348
  - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -344,14 +355,15 @@ Dynamic: license-file
344
355
  </details>
345
356
 
346
357
  ## About
347
- SGLang is a fast serving framework for large language models and vision language models.
348
- It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
349
- The core features include:
358
+ SGLang is a high-performance serving framework for large language models and vision-language models.
359
+ It is designed to deliver low-latency and high-throughput inference across a wide range of setups, from a single GPU to large distributed clusters.
360
+ Its core features include:
350
361
 
351
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
352
- - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
353
- - **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
354
- - **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
362
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, a zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-LoRA batching.
363
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GLM, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse), and reward models (Skywork), with easy extensibility for integrating new models. Compatible with most Hugging Face models and OpenAI APIs.
364
+ - **Extensive Hardware Support**: Runs on NVIDIA GPUs (GB200/B300/H100/A100/Spark), AMD GPUs (MI355/MI300), Intel Xeon CPUs, Google TPUs, Ascend NPUs, and more.
365
+ - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, supporting chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
366
+ - **Active Community**: SGLang is open-source and supported by a vibrant community with widespread industry adoption, powering over 300,000 GPUs worldwide.
355
367
 
356
368
  ## Getting Started
357
369
  - [Install SGLang](https://docs.sglang.ai/get_started/install.html)
@@ -367,7 +379,8 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
367
379
  [Development Roadmap (2025 H2)](https://github.com/sgl-project/sglang/issues/7736)
368
380
 
369
381
  ## Adoption and Sponsorship
370
- SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
382
+ SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 300,000 GPUs worldwide.
383
+ SGLang is currently hosted under the non-profit open-source organization [LMSYS](https://lmsys.org/about/).
371
384
 
372
385
  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
373
386