sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396) hide show
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py CHANGED
@@ -9,15 +9,17 @@ import os
9
9
  import random
10
10
  import re
11
11
  import subprocess
12
+ import sys
12
13
  import threading
13
14
  import time
14
15
  import unittest
15
16
  from concurrent.futures import ThreadPoolExecutor
16
17
  from dataclasses import dataclass
18
+ from datetime import datetime
17
19
  from functools import partial
18
20
  from pathlib import Path
19
21
  from types import SimpleNamespace
20
- from typing import Awaitable, Callable, List, Optional, Tuple
22
+ from typing import Any, Awaitable, Callable, List, Optional, Tuple
21
23
 
22
24
  import aiohttp
23
25
  import numpy as np
@@ -41,8 +43,10 @@ from sglang.utils import get_exception_traceback
41
43
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
42
44
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
43
45
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
46
+ DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
44
47
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
45
- DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
48
+ DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
49
+ DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
46
50
 
47
51
  # MLA test models
48
52
  DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
@@ -52,6 +56,9 @@ DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instru
52
56
  DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
53
57
  DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
54
58
 
59
+ # NVFP4 models
60
+ DEFAULT_DEEPSEEK_NVFP4_MODEL_FOR_TEST = "nvidia/DeepSeek-R1-0528-FP4"
61
+
55
62
  # FP8 models
56
63
  DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
57
64
  DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
@@ -71,7 +78,13 @@ DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
71
78
  # EAGLE
72
79
  DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
73
80
  DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
74
- DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
81
+ DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST_EAGLE3 = "meta-llama/Llama-3.1-8B-Instruct"
82
+ DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B"
83
+ DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
84
+ "meta-llama/Llama-3.1-8B-Instruct"
85
+ )
86
+ DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
87
+ DEFAULT_NGRAM_SPECULATIVE_TARGET_MODEL_FOR_TEST = "Qwen/Qwen2.5-Coder-7B-Instruct"
75
88
 
76
89
  # Other use cases
77
90
  DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
@@ -466,6 +479,25 @@ def try_cached_model(model_repo: str):
466
479
  return model_dir if model_dir else model_repo
467
480
 
468
481
 
482
+ def popen_with_error_check(command: list[str], allow_exit: bool = False):
483
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
484
+
485
+ def _run_and_check():
486
+ stdout, stderr = process.communicate()
487
+
488
+ while process.poll() is None:
489
+ time.sleep(5)
490
+
491
+ if not allow_exit or process.returncode != 0:
492
+ raise Exception(
493
+ f"{command} exited with code {process.returncode}\n{stdout=}\n{stderr=}"
494
+ )
495
+
496
+ t = threading.Thread(target=_run_and_check)
497
+ t.start()
498
+ return process
499
+
500
+
469
501
  def popen_launch_server(
470
502
  model: str,
471
503
  base_url: str,
@@ -476,6 +508,7 @@ def popen_launch_server(
476
508
  return_stdout_stderr: Optional[tuple] = None,
477
509
  device: str = "auto",
478
510
  pd_separated: bool = False,
511
+ num_replicas: Optional[int] = None,
479
512
  ):
480
513
  """Launch a server process with automatic device detection.
481
514
 
@@ -493,7 +526,8 @@ def popen_launch_server(
493
526
  _, host, port = base_url.split(":")
494
527
  host = host[2:]
495
528
 
496
- if pd_separated:
529
+ use_mixed_pd_engine = not pd_separated and num_replicas is not None
530
+ if pd_separated or use_mixed_pd_engine:
497
531
  command = "sglang.launch_pd_server"
498
532
  else:
499
533
  command = "sglang.launch_server"
@@ -507,7 +541,7 @@ def popen_launch_server(
507
541
  *[str(x) for x in other_args],
508
542
  ]
509
543
 
510
- if pd_separated:
544
+ if pd_separated or use_mixed_pd_engine:
511
545
  command.extend(
512
546
  [
513
547
  "--lb-host",
@@ -526,6 +560,15 @@ def popen_launch_server(
526
560
  ]
527
561
  )
528
562
 
563
+ if use_mixed_pd_engine:
564
+ command.extend(
565
+ [
566
+ "--mixed",
567
+ "--num-replicas",
568
+ str(num_replicas),
569
+ ]
570
+ )
571
+
529
572
  if api_key:
530
573
  command += ["--api-key", api_key]
531
574
 
@@ -534,11 +577,30 @@ def popen_launch_server(
534
577
  if return_stdout_stderr:
535
578
  process = subprocess.Popen(
536
579
  command,
537
- stdout=return_stdout_stderr[0],
538
- stderr=return_stdout_stderr[1],
580
+ stdout=subprocess.PIPE,
581
+ stderr=subprocess.PIPE,
539
582
  env=env,
540
583
  text=True,
584
+ bufsize=1,
541
585
  )
586
+
587
+ def _dump(src, sinks):
588
+ for line in iter(src.readline, ""):
589
+ for sink in sinks:
590
+ sink.write(line)
591
+ sink.flush()
592
+ src.close()
593
+
594
+ threading.Thread(
595
+ target=_dump,
596
+ args=(process.stdout, [return_stdout_stderr[0], sys.stdout]),
597
+ daemon=True,
598
+ ).start()
599
+ threading.Thread(
600
+ target=_dump,
601
+ args=(process.stderr, [return_stdout_stderr[1], sys.stderr]),
602
+ daemon=True,
603
+ ).start()
542
604
  else:
543
605
  process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
544
606
 
@@ -842,6 +904,154 @@ def run_bench_serving(
842
904
  return res
843
905
 
844
906
 
907
def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Launch a server, benchmark its ``/v1/score`` endpoint, and tear it down.

    Requests are sent one after another; only HTTP 200 responses whose JSON
    body contains ``"scores"`` or ``"logprobs"`` count as successful.

    Args:
        model: Model path; used to launch the server, load the tokenizer and
            fill the ``"model"`` field of every request.
        num_requests: Number of measured score requests.
        batch_size: Number of items per measured request.
        other_server_args: Extra CLI arguments for the server (default: none).
        need_warmup: If true, send one best-effort 3-item request first.
        device: ``"auto"`` resolves the device via ``auto_config_device()``.

    Returns:
        dict with ``completed``, ``total_requests``, ``throughput``
        (successful requests per second), ``avg_latency_ms``,
        ``p95_latency_ms`` and ``successful_requests``. All metrics are 0
        when no request succeeded.
    """
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        # NOTE(review): the resolved device is not forwarded to
        # popen_launch_server below; kept as in the original -- confirm intent.
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"
        score_url = f"{base_url}/v1/score"

        def generate_text_with_token_count(num_tokens):
            """Generate text with precise token count using replicated token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                # The marker spans several tokens for this tokenizer; repeat
                # it fewer times so the total stays near the target.
                text = special_token * (
                    num_tokens
                    // len(tokenizer.encode(special_token, add_special_tokens=False))
                )
            return text

        def build_payload(num_items):
            """Build one score request with ``num_items`` synthetic items."""
            return {
                "query": generate_text_with_token_count(score_query_tokens),
                "items": [
                    generate_text_with_token_count(score_item_tokens)
                    for _ in range(num_items)
                ],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

        if need_warmup:
            warmup_data = build_payload(3)
            async with aiohttp.ClientSession() as session:
                try:
                    # ``async with`` releases the response; it used to be
                    # left unread.
                    async with session.post(
                        score_url,
                        json=warmup_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ):
                        pass
                except Exception:
                    # Warmup is best effort. ``Exception`` (not a bare
                    # ``except``) lets Ctrl-C and task cancellation through.
                    pass

        test_requests = [build_payload(batch_size) for _ in range(num_requests)]

        start_time = time.monotonic()
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        score_url,
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status == 200:
                            response_data = await response.json()
                            request_end = time.monotonic()

                            if "scores" in response_data or "logprobs" in response_data:
                                latencies.append((request_end - request_start) * 1000)
                except Exception:
                    # A failed request is simply not counted.
                    continue

        total_time = time.monotonic() - start_time
        successful_requests = len(latencies)

        if successful_requests == 0:
            return {
                "completed": 0,
                "total_requests": num_requests,
                "throughput": 0,
                "avg_latency_ms": 0,
                "p95_latency_ms": 0,
                "successful_requests": 0,
            }

        avg_latency = sum(latencies) / successful_requests
        latencies.sort()
        return {
            "completed": successful_requests,
            "total_requests": num_requests,
            "throughput": successful_requests / total_time,
            "avg_latency_ms": avg_latency,
            "p95_latency_ms": latencies[int(len(latencies) * 0.95)],
            "successful_requests": successful_requests,
        }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        # Always stop the server, even if the benchmark raised.
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res
1053
+
1054
+
845
1055
  def run_bench_serving_multi(
846
1056
  model,
847
1057
  base_url,
@@ -949,7 +1159,7 @@ def run_bench_offline_throughput(model, other_args):
949
1159
  *[str(x) for x in other_args],
950
1160
  ]
951
1161
 
952
- print(f"{command=}")
1162
+ print(f"command={' '.join(command)}")
953
1163
  process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
954
1164
 
955
1165
  try:
@@ -1363,6 +1573,41 @@ async def send_concurrent_generate_requests(
1363
1573
  return await asyncio.gather(*tasks)
1364
1574
 
1365
1575
 
1576
async def send_concurrent_generate_requests_with_custom_params(
    base_url: str,
    custom_params: List[dict[str, Any]],
) -> List[Tuple[int, Any]]:
    """Send one ``/generate`` request per entry of ``custom_params``, concurrently.

    Each request is the base payload below with the entry's keys applied on
    top (top-level keys are replaced, not merged). All requests are started
    at once, so the concurrency equals ``len(custom_params)``.

    Returns:
        A list of ``(status_code, response_json)`` tuples in the same order
        as ``custom_params``; empty when ``custom_params`` is empty.
    """
    # NOTE(review): any indentation inside the original triple-quoted prompt
    # is not recoverable from this view; the visible text is reproduced.
    base_payload = {
        "text": (
            "\n"
            "System: You are a helpful assistant.\n"
            "User: What is the capital of France?\n"
            "Assistant: The capital of France is\n"
        ),
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 50,
        },
    }

    async def _post_generate(payload):
        # One session per request, as in the original.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{base_url}/generate",
                json=payload,
            ) as response:
                resp_json = await response.json()
                return (response.status, resp_json)

    tasks = [
        asyncio.create_task(_post_generate({**base_payload, **params}))
        for params in custom_params
    ]
    return await asyncio.gather(*tasks)
1609
+
1610
+
1366
1611
  class CustomTestCase(unittest.TestCase):
1367
1612
  def _callTestMethod(self, method):
1368
1613
  max_retry = int(
@@ -1404,3 +1649,157 @@ def dump_bench_raw_result(
1404
1649
  def _ensure_remove_suffix(text: str, suffix: str):
1405
1650
  assert text.endswith(suffix)
1406
1651
  return text.removesuffix(suffix)
1652
+
1653
+
1654
class ModelLaunchSettings:
    """Launch configuration for one model under test.

    Attributes set by ``__init__``:
        model_path: Model path as given.
        tp_size: Tensor-parallel size.
        extra_args: A private copy of the extra CLI arguments, extended with
            a tensor-parallel flag (when ``tp_size > 1`` and none is present)
            and with the always-on flags ``--enable-multimodal`` and
            ``--trust-remote-code``.
        env: Optional environment mapping, stored unchanged.
    """

    # Spellings under which a tensor-parallel size may already be present.
    # NOTE(review): "--tp-size" / "--tensor-parallel-size" are assumed to be
    # the launcher's long forms -- confirm against the server argument parser.
    _TP_FLAGS = ("--tp", "--tp-size", "--tensor-parallel-size")
    _FIXED_ARGS = ("--enable-multimodal", "--trust-remote-code")

    def __init__(
        self,
        model_path: str,
        tp_size: int = 1,
        extra_args: Optional[List[str]] = None,
        env: Optional[dict] = None,
    ):
        self.model_path = model_path
        self.tp_size = tp_size
        # Copy so the caller's list is never modified.
        self.extra_args = list(extra_args) if extra_args else []
        self.env = env

        if self.tp_size > 1 and not self._has_tp_flag():
            self.extra_args.extend(["--tp", str(self.tp_size)])

        for fixed_arg in self._FIXED_ARGS:
            if fixed_arg not in self.extra_args:
                self.extra_args.append(fixed_arg)

    def _has_tp_flag(self) -> bool:
        """Return True if extra_args already sets a tensor-parallel size.

        Both ``--flag value`` and ``--flag=value`` forms are recognised.
        """
        return any(
            str(arg).split("=", 1)[0] in self._TP_FLAGS for arg in self.extra_args
        )

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(model_path={self.model_path!r}, "
            f"tp_size={self.tp_size!r}, extra_args={self.extra_args!r}, "
            f"env={self.env!r})"
        )
1674
+
1675
+
1676
class ModelEvalMetrics:
    """Result of evaluating one model: its accuracy and the evaluation time."""

    def __init__(self, accuracy: float, eval_time: float):
        self.accuracy = accuracy
        self.eval_time = eval_time

    def __repr__(self) -> str:
        # Readable form for test logs and assertion messages.
        return (
            f"{type(self).__name__}(accuracy={self.accuracy!r}, "
            f"eval_time={self.eval_time!r})"
        )
1680
+
1681
+
1682
def extract_trace_link_from_bench_one_batch_server_output(output: str) -> Optional[str]:
    """Return the target of the first ``[Profile](...)`` markdown link in ``output``.

    Returns:
        The text between the parentheses of the first ``[Profile](...)``
        occurrence, or ``None`` when ``output`` contains no such link.
    """
    match = re.search(r"\[Profile\]\((.*?)\)", output)
    return match.group(1) if match else None
1688
+
1689
+
1690
def parse_models(model_string: str):
    """Split a comma-separated model list into trimmed, non-empty names."""
    names = []
    for chunk in model_string.split(","):
        name = chunk.strip()
        if name:
            names.append(name)
    return names
1692
+
1693
+
1694
def check_evaluation_test_results(
    results,
    test_name,
    model_accuracy_thresholds,
    model_latency_thresholds=None,
    model_count=None,
):
    """Compare evaluation results against thresholds and fail on any miss.

    Builds a markdown summary table with one row per model in
    ``model_accuracy_thresholds`` (sorted by name), prints it and, in CI,
    writes it to the GitHub step summary.

    Args:
        results: list of tuple of (model_path, accuracy, latency)
        test_name: Heading used for the step summary.
        model_accuracy_thresholds: Mapping model -> minimum accepted accuracy.
        model_latency_thresholds: Optional mapping model -> maximum accepted
            latency. A model without an entry has no latency limit.
        model_count: Not used by this function.

    Raises:
        AssertionError: if a model is below its accuracy threshold, above its
            latency threshold, or missing from ``results``.
    """
    # Stand-in limit meaning "no latency requirement".
    no_latency_limit = 1e9
    with_latency = model_latency_thresholds is not None

    failed_models = []
    if with_latency:
        summary = " | model | status | score | score_threshold | latency | latency_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | ------- | ----------------- | \n"
    else:
        summary = " | model | status | score | score_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | \n"

    results_dict = {res[0]: (res[1], res[2]) for res in results}

    for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
        latency_threshold = (
            model_latency_thresholds.get(model) if with_latency else no_latency_limit
        )
        # A missing latency entry used to surface as None and crash the
        # ``latency <= None`` comparison; treat it as "no limit" instead.
        latency_limit = (
            no_latency_limit if latency_threshold is None else latency_threshold
        )

        if model in results_dict:
            accuracy, latency = results_dict[model]
            is_success = accuracy >= accuracy_threshold and latency <= latency_limit
            status_emoji = "✅" if is_success else "❌"

            if not is_success:
                if accuracy < accuracy_threshold:
                    failed_models.append(
                        f"\nScore Check Failed: {model}\n"
                        f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
                    )
                if latency > latency_limit:
                    failed_models.append(
                        f"\nLatency Check Failed: {model}\n"
                        f"Model {model} latency ({latency:.4f}) is above threshold ({latency_limit:.4f})"
                    )

            if with_latency:
                line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
            else:
                line = (
                    f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
                )
        else:
            status_emoji = "❌"
            failed_models.append(f"Model failed to launch or be evaluated: {model}")
            if with_latency:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
            else:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"

        summary += line

    print(summary)

    if is_in_ci():
        write_github_step_summary(f"## {test_name}\n{summary}")

    if failed_models:
        print("Some models failed the evaluation.")
        raise AssertionError("\n".join(failed_models))
1762
+
1763
+
1764
+ # Bench knobs for bench_one_batch_server (override by env)
1765
+ def _parse_int_list_env(name: str, default_val: str):
1766
+ val = os.environ.get(name, default_val)
1767
+ return [int(x) for x in val.split(",") if x]
1768
+
1769
+
1770
+ # Return filenames
1771
def find_traces_under_path(path: str) -> List[str]:
    """Return the file names of all ``*.trace.json.gz`` files under ``path``.

    The directory tree is walked recursively with ``os.walk``. Only bare file
    names are returned (no directory component), in walk order.
    """
    return [
        filename
        for _, _, filenames in os.walk(path)
        for filename in filenames
        if filename.endswith(".trace.json.gz")
    ]
1778
+
1779
+
1780
def write_results_to_json(model, metrics, mode="a"):
    """Record one evaluation result in ``results.json`` in the working directory.

    Args:
        model: Model identifier stored under ``"model"``.
        metrics: Mapping of metric values; must contain ``"score"``. If it
            contains ``"latency"``, that value is also stored at top level.
        mode: ``"a"`` appends to the results already in the file (if it
            exists and holds a JSON list); any other value starts a new list.

    Raises:
        KeyError: if ``metrics`` has no ``"score"`` entry.
    """
    result = {
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "metrics": metrics,
        "score": metrics["score"],
    }

    if "latency" in metrics:
        # Store the plain value. A stray trailing comma used to wrap it in a
        # 1-tuple, which serialised as a single-element JSON list.
        result["latency"] = metrics["latency"]

    existing_results = []
    if mode == "a" and os.path.exists("results.json"):
        try:
            with open("results.json", "r") as f:
                existing_results = json.load(f)
        except json.JSONDecodeError:
            # An unreadable file is replaced rather than failing the run.
            existing_results = []

    if isinstance(existing_results, list):
        existing_results.append(result)
    else:
        existing_results = [result]

    with open("results.json", "w") as f:
        json.dump(existing_results, f, indent=2)
sglang/utils.py CHANGED
@@ -6,6 +6,7 @@ import logging
6
6
  import os
7
7
  import random
8
8
  import socket
9
+ import ssl
9
10
  import subprocess
10
11
  import sys
11
12
  import time
@@ -155,7 +156,15 @@ def http_request(
155
156
  data = bytes(dumps(json), encoding="utf-8")
156
157
 
157
158
  try:
158
- resp = urllib.request.urlopen(req, data=data, cafile=verify)
159
+ if sys.version_info >= (3, 13):
160
+ # Python 3.13+: Use SSL context (cafile removed)
161
+ if verify and isinstance(verify, str):
162
+ context = ssl.create_default_context(cafile=verify)
163
+ else:
164
+ context = ssl.create_default_context()
165
+ resp = urllib.request.urlopen(req, data=data, context=context)
166
+ else:
167
+ resp = urllib.request.urlopen(req, data=data, cafile=verify)
159
168
  return HttpResponse(resp)
160
169
  except urllib.error.HTTPError as e:
161
170
  return HttpResponse(e)
@@ -472,11 +481,22 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
class TypeBasedDispatcher:
    """Route an object to the handler registered for its type.

    Handlers are tried in registration order; the first ``(type, fn)`` pair
    whose type matches via ``isinstance`` wins. An optional fallback handles
    objects that match no registered type.
    """

    def __init__(self, mapping: List[Tuple[Type, Callable]]):
        # Copy so that a later ``+=`` never mutates the caller's list.
        self._mapping = list(mapping)
        self._fallback_fn = None

    def add_fallback_fn(self, fallback_fn: Callable):
        """Set the handler used when no registered type matches."""
        self._fallback_fn = fallback_fn

    def __iadd__(self, other: "TypeBasedDispatcher"):
        """Append ``other``'s handlers after this dispatcher's own.

        ``other``'s fallback handler, if any, is not taken over.
        """
        self._mapping.extend(other._mapping)
        return self

    def __call__(self, obj: Any):
        """Dispatch ``obj``.

        Raises:
            ValueError: if no type matches and no fallback is set.
        """
        for ty, fn in self._mapping:
            if isinstance(obj, ty):
                return fn(obj)

        if self._fallback_fn is not None:
            return self._fallback_fn(obj)
        raise ValueError(f"Invalid object: {obj}")
481
501
 
482
502
 
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.2rc2"
1
+ __version__ = "0.5.3.post1"