sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py CHANGED
@@ -9,15 +9,17 @@ import os
  import random
  import re
  import subprocess
+ import sys
  import threading
  import time
  import unittest
  from concurrent.futures import ThreadPoolExecutor
  from dataclasses import dataclass
- from functools import partial
+ from datetime import datetime
+ from functools import partial, wraps
  from pathlib import Path
  from types import SimpleNamespace
- from typing import Awaitable, Callable, List, Optional, Tuple
+ from typing import Any, Awaitable, Callable, List, Optional, Tuple

  import aiohttp
  import numpy as np
@@ -41,6 +43,7 @@ from sglang.utils import get_exception_traceback
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
+ DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
  DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
  DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
@@ -72,14 +75,21 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
  DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
  DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"

+ # INT4 models
+ DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
+     "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
+ )
+
  # EAGLE
  DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
  DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
- DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
+ DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST_EAGLE3 = "meta-llama/Llama-3.1-8B-Instruct"
+ DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B"
  DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
      "meta-llama/Llama-3.1-8B-Instruct"
  )
  DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+ DEFAULT_NGRAM_SPECULATIVE_TARGET_MODEL_FOR_TEST = "Qwen/Qwen2.5-Coder-7B-Instruct"

  # Other use cases
  DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
@@ -116,7 +126,12 @@ def is_in_ci():

  def is_in_amd_ci():
      """Return whether it is in an AMD CI runner."""
-     return get_bool_env_var("SGLANG_AMD_CI")
+     return get_bool_env_var("SGLANG_IS_IN_CI_AMD")
+
+
+ def is_blackwell_system():
+     """Return whether it is running on a Blackwell (B200) system."""
+     return get_bool_env_var("IS_BLACKWELL")


  def _use_cached_default_models(model_repo: str):
@@ -130,17 +145,20 @@ def _use_cached_default_models(model_repo: str):

  if is_in_ci():
      DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
-         5000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+         10000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 1000
      )
  else:
      DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
-         7000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+         20000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 1000
      )
  DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"

  if is_in_amd_ci():
      DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000

+ if is_blackwell_system():
+     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000
+

  def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
      assert url is not None
@@ -391,8 +409,6 @@ def _get_call_generate(args: argparse.Namespace):
          return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
      elif args.backend == "srt-raw":
          return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-     elif args.backend == "gserver":
-         return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
      elif args.backend == "outlines":
          return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
      elif args.backend == "guidance":
@@ -498,11 +514,12 @@ def popen_launch_server(
      base_url: str,
      timeout: float,
      api_key: Optional[str] = None,
-     other_args: list[str] = [],
+     other_args: Optional[list[str]] = None,
      env: Optional[dict] = None,
      return_stdout_stderr: Optional[tuple] = None,
      device: str = "auto",
      pd_separated: bool = False,
+     num_replicas: Optional[int] = None,
  ):
      """Launch a server process with automatic device detection.

@@ -510,17 +527,19 @@
          device: Device type ("auto", "cuda", "rocm" or "cpu").
              If "auto", will detect available platforms automatically.
      """
+     other_args = other_args or []
+
      # Auto-detect device if needed
      if device == "auto":
          device = auto_config_device()
-         print(f"Auto-configed device: {device}", flush=True)
      other_args = list(other_args)
      other_args += ["--device", str(device)]

      _, host, port = base_url.split(":")
      host = host[2:]

-     if pd_separated:
+     use_mixed_pd_engine = not pd_separated and num_replicas is not None
+     if pd_separated or use_mixed_pd_engine:
          command = "sglang.launch_pd_server"
      else:
          command = "sglang.launch_server"
@@ -534,7 +553,7 @@
          *[str(x) for x in other_args],
      ]

-     if pd_separated:
+     if pd_separated or use_mixed_pd_engine:
          command.extend(
              [
                  "--lb-host",
@@ -553,6 +572,15 @@
              ]
          )

+     if use_mixed_pd_engine:
+         command.extend(
+             [
+                 "--mixed",
+                 "--num-replicas",
+                 str(num_replicas),
+             ]
+         )
+
      if api_key:
          command += ["--api-key", api_key]

@@ -561,18 +589,36 @@
      if return_stdout_stderr:
          process = subprocess.Popen(
              command,
-             stdout=return_stdout_stderr[0],
-             stderr=return_stdout_stderr[1],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
              env=env,
              text=True,
+             bufsize=1,
          )
+
+         def _dump(src, sinks):
+             for line in iter(src.readline, ""):
+                 for sink in sinks:
+                     sink.write(line)
+                     sink.flush()
+             src.close()
+
+         threading.Thread(
+             target=_dump,
+             args=(process.stdout, [return_stdout_stderr[0], sys.stdout]),
+             daemon=True,
+         ).start()
+         threading.Thread(
+             target=_dump,
+             args=(process.stderr, [return_stdout_stderr[1], sys.stderr]),
+             daemon=True,
+         ).start()
      else:
          process = subprocess.Popen(command, stdout=None, stderr=None, env=env)

      start_time = time.perf_counter()
      with requests.Session() as session:
          while time.perf_counter() - start_time < timeout:
-
              return_code = process.poll()
              if return_code is not None:
                  # Server failed to start (non-zero exit code) or crashed
@@ -869,6 +915,154 @@ def run_bench_serving(
      return res


+ def run_score_benchmark(
+     model,
+     num_requests=100,
+     batch_size=5,
+     other_server_args=None,
+     need_warmup=False,
+     device="auto",
+ ):
+     """Score API benchmark function compatible with run_bench_serving pattern"""
+     if other_server_args is None:
+         other_server_args = []
+
+     if device == "auto":
+         device = auto_config_device()
+
+     # Launch the server (consistent with run_bench_serving)
+     base_url = DEFAULT_URL_FOR_TEST
+     process = popen_launch_server(
+         model,
+         base_url,
+         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+         other_args=other_server_args,
+     )
+
+     async def _run_benchmark():
+
+         # Load tokenizer for generating test data
+         from sglang.srt.utils.hf_transformers_utils import get_tokenizer
+
+         tokenizer = get_tokenizer(model)
+
+         # Score API configuration
+         score_query_tokens = 120
+         score_item_tokens = 180
+         score_label_token_ids = [9454, 2753]  # Yes/No token IDs
+         special_token = "<|im_start|>"
+
+         def generate_text_with_token_count(num_tokens):
+             """Generate text with precise token count using replicated token."""
+             text = special_token * num_tokens
+             actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
+             if actual_tokens != num_tokens:
+                 text = special_token * (
+                     num_tokens
+                     // len(tokenizer.encode(special_token, add_special_tokens=False))
+                 )
+             return text
+
+         if need_warmup:
+             warmup_data = {
+                 "query": generate_text_with_token_count(score_query_tokens),
+                 "items": [
+                     generate_text_with_token_count(score_item_tokens) for _ in range(3)
+                 ],
+                 "label_token_ids": score_label_token_ids,
+                 "model": model,
+                 "apply_softmax": True,
+             }
+
+             async with aiohttp.ClientSession() as session:
+                 try:
+                     await session.post(
+                         f"{base_url}/v1/score",
+                         json=warmup_data,
+                         timeout=aiohttp.ClientTimeout(total=30),
+                     )
+                 except:
+                     pass  # Ignore warmup errors
+
+         test_requests = []
+         for i in range(num_requests):
+             query = generate_text_with_token_count(score_query_tokens)
+             items = [
+                 generate_text_with_token_count(score_item_tokens)
+                 for _ in range(batch_size)
+             ]
+
+             score_data = {
+                 "query": query,
+                 "items": items,
+                 "label_token_ids": score_label_token_ids,
+                 "model": model,
+                 "apply_softmax": True,
+             }
+             test_requests.append(score_data)
+
+         start_time = time.monotonic()
+         successful_requests = 0
+         total_latency = 0
+         latencies = []
+
+         async with aiohttp.ClientSession() as session:
+             for request_data in test_requests:
+                 try:
+                     request_start = time.monotonic()
+                     async with session.post(
+                         f"{base_url}/v1/score",
+                         json=request_data,
+                         timeout=aiohttp.ClientTimeout(total=30),
+                     ) as response:
+                         if response.status == 200:
+                             response_data = await response.json()
+                             request_end = time.monotonic()
+
+                             if "scores" in response_data or "logprobs" in response_data:
+                                 latency_ms = (request_end - request_start) * 1000
+                                 latencies.append(latency_ms)
+                                 total_latency += latency_ms
+                                 successful_requests += 1
+                 except Exception:
+                     continue
+
+         end_time = time.monotonic()
+         total_time = end_time - start_time
+
+         if successful_requests > 0:
+             throughput = successful_requests / total_time
+             avg_latency = total_latency / successful_requests
+             latencies.sort()
+             p95_latency = latencies[int(len(latencies) * 0.95)] if latencies else 0
+
+             return {
+                 "completed": successful_requests,
+                 "total_requests": num_requests,
+                 "throughput": throughput,
+                 "avg_latency_ms": avg_latency,
+                 "p95_latency_ms": p95_latency,
+                 "successful_requests": successful_requests,
+             }
+         else:
+             return {
+                 "completed": 0,
+                 "total_requests": num_requests,
+                 "throughput": 0,
+                 "avg_latency_ms": 0,
+                 "p95_latency_ms": 0,
+                 "successful_requests": 0,
+             }
+
+     try:
+         res = asyncio.run(_run_benchmark())
+     finally:
+         kill_process_tree(process.pid)
+
+     assert res["completed"] == res["successful_requests"]
+     return res
+
+
  def run_bench_serving_multi(
      model,
      base_url,
@@ -976,7 +1170,7 @@ def run_bench_offline_throughput(model, other_args):
          *[str(x) for x in other_args],
      ]

-     print(f"{command=}")
+     print(f"command={' '.join(command)}")
      process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

      try:
@@ -1390,6 +1584,41 @@ async def send_concurrent_generate_requests(
      return await asyncio.gather(*tasks)


+ async def send_concurrent_generate_requests_with_custom_params(
+     base_url: str,
+     custom_params: List[dict[str, Any]],
+ ) -> Tuple[int, Any]:
+     """Sends generate request concurrently with custom parameters and returns status code and response json tuple. Max concurrency is num_requests."""
+
+     base_payload = {
+         "text": """
+         System: You are a helpful assistant.
+         User: What is the capital of France?
+         Assistant: The capital of France is
+         """,
+         "sampling_params": {
+             "temperature": 0,
+             "max_new_tokens": 50,
+         },
+     }
+
+     async def async_generate_with_priority(req):
+         async with aiohttp.ClientSession() as session:
+             async with session.post(
+                 f"{base_url}/generate",
+                 json=req,
+             ) as response:
+                 resp_json = await response.json()
+                 return (response.status, resp_json)
+
+     tasks = []
+     for c in custom_params:
+         req = base_payload.copy()
+         req.update(c)
+         tasks.append(asyncio.create_task(async_generate_with_priority(req)))
+     return await asyncio.gather(*tasks)
+
+
  class CustomTestCase(unittest.TestCase):
      def _callTestMethod(self, method):
          max_retry = int(
@@ -1400,6 +1629,9 @@ class CustomTestCase(unittest.TestCase):
              max_retry=max_retry,
          )

+     def setUp(self):
+         print(f"[Test Method] {self._testMethodName}", flush=True)
+

  def dump_bench_raw_result(
      path: str,
@@ -1431,3 +1663,187 @@ def dump_bench_raw_result(
  def _ensure_remove_suffix(text: str, suffix: str):
      assert text.endswith(suffix)
      return text.removesuffix(suffix)
+
+
+ class ModelLaunchSettings:
+     def __init__(
+         self,
+         model_path: str,
+         tp_size: int = 1,
+         extra_args: Optional[List[str]] = None,
+         env: Optional[dict] = None,
+     ):
+         self.model_path = model_path
+         self.tp_size = tp_size
+         self.extra_args = list(extra_args) if extra_args else []
+         self.env = env
+
+         if self.tp_size > 1 and "--tp" not in self.extra_args:
+             self.extra_args.extend(["--tp", str(self.tp_size)])
+
+         fixed_args = ["--enable-multimodal", "--trust-remote-code"]
+         for fixed_arg in fixed_args:
+             if fixed_arg not in self.extra_args:
+                 self.extra_args.append(fixed_arg)
+
+
+ class ModelEvalMetrics:
+     def __init__(self, accuracy: float, eval_time: float):
+         self.accuracy = accuracy
+         self.eval_time = eval_time
+
+
+ def extract_trace_link_from_bench_one_batch_server_output(output: str) -> str:
+     match = re.search(r"\[Profile\]\((.*?)\)", output)
+     if match:
+         trace_link = match.group(1)
+         return trace_link
+     return None
+
+
+ def parse_models(model_string: str):
+     return [model.strip() for model in model_string.split(",") if model.strip()]
+
+
+ def check_evaluation_test_results(
+     results,
+     test_name,
+     model_accuracy_thresholds,
+     model_latency_thresholds=None,
+     model_count=None,
+ ):
+     """
+     results: list of tuple of (model_path, accuracy, latency)
+     """
+     failed_models = []
+     if model_latency_thresholds is not None:
+         summary = " | model | status | score | score_threshold | latency | latency_threshold | \n"
+         summary += "| ----- | ------ | ----- | --------------- | ------- | ----------------- | \n"
+     else:
+         summary = " | model | status | score | score_threshold | \n"
+         summary += "| ----- | ------ | ----- | --------------- | \n"
+
+     results_dict = {res[0]: (res[1], res[2]) for res in results}
+
+     for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
+         latency_threshold = (
+             model_latency_thresholds.get(model)
+             if model_latency_thresholds is not None
+             else 1e9
+         )
+
+         if model in results_dict:
+             accuracy, latency = results_dict[model]
+             is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
+             status_emoji = "✅" if is_success else "❌"
+
+             if not is_success:
+                 if accuracy < accuracy_threshold:
+                     failed_models.append(
+                         f"\nScore Check Failed: {model}\n"
+                         f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
+                     )
+                 if latency > latency_threshold:
+                     failed_models.append(
+                         f"\nLatency Check Failed: {model}\n"
+                         f"Model {model} latency ({latency:.4f}) is above threshold ({latency_threshold:.4f})"
+                     )
+
+             if model_latency_thresholds is not None:
+                 line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
+             else:
+                 line = (
+                     f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
+                 )
+         else:
+             status_emoji = "❌"
+             failed_models.append(f"Model failed to launch or be evaluated: {model}")
+             if model_latency_thresholds is not None:
+                 line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
+             else:
+                 line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"
+
+         summary += line
+
+     print(summary)
+
+     if is_in_ci():
+         write_github_step_summary(f"## {test_name}\n{summary}")
+
+     if failed_models:
+         print("Some models failed the evaluation.")
+         raise AssertionError("\n".join(failed_models))
+
+
+ # Bench knobs for bench_one_batch_server (override by env)
+ def _parse_int_list_env(name: str, default_val: str):
+     val = os.environ.get(name, default_val)
+     return [int(x) for x in val.split(",") if x]
+
+
+ # Return filenames
+ def find_traces_under_path(path: str) -> List[str]:
+     results = []
+     for _, dirs, files in os.walk(path):
+         for file in files:
+             if file.endswith(".trace.json.gz"):
+                 results.append(f"{file}")
+     return results
+
+
+ def write_results_to_json(model, metrics, mode="a"):
+     result = {
+         "timestamp": datetime.now().isoformat(),
+         "model": model,
+         "metrics": metrics,
+         "score": metrics["score"],
+     }
+
+     if "latency" in metrics:
+         result["latency"] = (metrics.get("latency"),)
+
+     existing_results = []
+     if mode == "a" and os.path.exists("results.json"):
+         try:
+             with open("results.json", "r") as f:
+                 existing_results = json.load(f)
+         except json.JSONDecodeError:
+             existing_results = []
+
+     if isinstance(existing_results, list):
+         existing_results.append(result)
+     else:
+         existing_results = [result]
+
+     with open("results.json", "w") as f:
+         json.dump(existing_results, f, indent=2)
+
+
+ def intel_amx_benchmark(extra_args=None, min_throughput=None):
+     def decorator(test_func):
+         @wraps(test_func)
+         def wrapper(self):
+             common_args = [
+                 "--attention-backend",
+                 "intel_amx",
+                 "--disable-radix",
+                 "--trust-remote-code",
+             ]
+             full_args = common_args + (extra_args or [])
+
+             model = test_func(self)
+             prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
+                 model, full_args
+             )
+
+             print(f"{model=}")
+             print(f"{prefill_latency=}")
+             print(f"{decode_throughput=}")
+             print(f"{decode_latency=}")
+
+             if is_in_ci() and min_throughput is not None:
+                 self.assertGreater(decode_throughput, min_throughput)
+
+         return wrapper
+
+     return decorator
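
One notable behavioral change above: when `return_stdout_stderr` is provided, `popen_launch_server` now pipes the child's output and uses daemon threads to tee each line to both the caller's sinks and the test runner's own console, instead of handing the sinks directly to `subprocess.Popen`. A minimal standalone sketch of that teeing pattern is below; the `tee_process_output` name, the example command, and the `server.log` path are illustrative and not part of sglang.

```python
import subprocess
import sys
import threading


def tee_process_output(command, stdout_sink, stderr_sink):
    """Run `command`, mirroring its stdout/stderr to both a sink and the console."""
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,  # line-buffered so lines appear as they are produced
    )

    def _dump(src, sinks):
        # Copy each line from the child's stream to every sink.
        for line in iter(src.readline, ""):
            for sink in sinks:
                sink.write(line)
                sink.flush()
        src.close()

    threading.Thread(
        target=_dump, args=(process.stdout, [stdout_sink, sys.stdout]), daemon=True
    ).start()
    threading.Thread(
        target=_dump, args=(process.stderr, [stderr_sink, sys.stderr]), daemon=True
    ).start()
    return process


# Illustrative usage: capture output to a file while still seeing it live.
if __name__ == "__main__":
    with open("server.log", "w") as log:
        proc = tee_process_output(
            [sys.executable, "-c", "print('hello from child')"], log, log
        )
        proc.wait()
```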
sglang/utils.py CHANGED
@@ -6,6 +6,7 @@ import logging
  import os
  import random
  import socket
+ import ssl
  import subprocess
  import sys
  import time
@@ -155,7 +156,15 @@ def http_request(
          data = bytes(dumps(json), encoding="utf-8")

      try:
-         resp = urllib.request.urlopen(req, data=data, cafile=verify)
+         if sys.version_info >= (3, 13):
+             # Python 3.13+: Use SSL context (cafile removed)
+             if verify and isinstance(verify, str):
+                 context = ssl.create_default_context(cafile=verify)
+             else:
+                 context = ssl.create_default_context()
+             resp = urllib.request.urlopen(req, data=data, context=context)
+         else:
+             resp = urllib.request.urlopen(req, data=data, cafile=verify)
          return HttpResponse(resp)
      except urllib.error.HTTPError as e:
          return HttpResponse(e)
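
For context on the `http_request` change above: Python 3.13 removed the `cafile`, `capath`, and `cadefault` parameters from `urllib.request.urlopen`, so callers must pass an `ssl.SSLContext` instead. A minimal sketch of the same version-dependent branch in isolation; the `fetch` helper and the example URL/CA path are illustrative, not sglang APIs.

```python
import ssl
import sys
import urllib.request


def fetch(url: str, verify=None) -> bytes:
    """GET `url`, honoring an optional CA bundle path in `verify`."""
    if sys.version_info >= (3, 13):
        # Python 3.13 removed the `cafile` argument; pass an SSL context instead.
        if verify and isinstance(verify, str):
            context = ssl.create_default_context(cafile=verify)
        else:
            context = ssl.create_default_context()
        resp = urllib.request.urlopen(url, context=context)
    else:
        resp = urllib.request.urlopen(url, cafile=verify)
    return resp.read()


# Illustrative usage:
# fetch("https://example.com", verify="/path/to/ca-bundle.crt")
```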
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.5.3rc0"
+ __version__ = "0.5.4"