sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py CHANGED
@@ -9,15 +9,18 @@ import os
9
9
  import random
10
10
  import re
11
11
  import subprocess
12
+ import sys
12
13
  import threading
13
14
  import time
14
15
  import unittest
15
16
  from concurrent.futures import ThreadPoolExecutor
16
17
  from dataclasses import dataclass
18
+ from datetime import datetime
17
19
  from functools import partial
18
20
  from pathlib import Path
19
21
  from types import SimpleNamespace
20
- from typing import Awaitable, Callable, List, Optional, Tuple
22
+ from typing import Any, Awaitable, Callable, List, Optional, Tuple
23
+ from urllib.parse import quote
21
24
 
22
25
  import aiohttp
23
26
  import numpy as np
@@ -41,8 +44,10 @@ from sglang.utils import get_exception_traceback
41
44
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
42
45
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
43
46
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
47
+ DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
44
48
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
45
- DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
49
+ DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
50
+ DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
46
51
 
47
52
  # MLA test models
48
53
  DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
@@ -52,6 +57,9 @@ DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instru
52
57
  DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
53
58
  DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
54
59
 
60
+ # NVFP4 models
61
+ DEFAULT_DEEPSEEK_NVFP4_MODEL_FOR_TEST = "nvidia/DeepSeek-R1-0528-FP4"
62
+
55
63
  # FP8 models
56
64
  DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
57
65
  DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
@@ -71,7 +79,13 @@ DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
71
79
  # EAGLE
72
80
  DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
73
81
  DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
74
- DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
82
+ DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST_EAGLE3 = "meta-llama/Llama-3.1-8B-Instruct"
83
+ DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B"
84
+ DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
85
+ "meta-llama/Llama-3.1-8B-Instruct"
86
+ )
87
+ DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
88
+ DEFAULT_NGRAM_SPECULATIVE_TARGET_MODEL_FOR_TEST = "Qwen/Qwen2.5-Coder-7B-Instruct"
75
89
 
76
90
  # Other use cases
77
91
  DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
@@ -466,6 +480,25 @@ def try_cached_model(model_repo: str):
466
480
  return model_dir if model_dir else model_repo
467
481
 
468
482
 
483
def popen_with_error_check(command: list[str], allow_exit: bool = False):
    """Start ``command`` and watch it for an unexpected exit in the background.

    The child's stdout and stderr are captured through pipes. A watcher
    thread blocks until the child terminates and then raises ``Exception``
    when the exit is considered an error:

    * ``allow_exit=False`` (default): any termination is an error, whatever
      the return code.
    * ``allow_exit=True``: only a non-zero return code is an error.

    The exception is raised inside the watcher thread, so it is reported
    through the thread's exception hook and does NOT propagate to the caller
    of this function.

    Args:
        command: Argument vector passed to ``subprocess.Popen``.
        allow_exit: Whether a clean (return code 0) exit is acceptable.

    Returns:
        The ``subprocess.Popen`` handle, returned immediately after the
        watcher thread has been started.
    """
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    def _run_and_check():
        # communicate() drains both pipes and only returns once the child has
        # exited, so ``returncode`` is already set here; no extra polling loop
        # is required afterwards.
        stdout, stderr = process.communicate()

        if not allow_exit or process.returncode != 0:
            raise Exception(
                f"{command} exited with code {process.returncode}\n{stdout=}\n{stderr=}"
            )

    # Non-daemon on purpose (as before): the watcher lives until the child exits.
    watcher = threading.Thread(target=_run_and_check)
    watcher.start()
    return process
500
+
501
+
469
502
  def popen_launch_server(
470
503
  model: str,
471
504
  base_url: str,
@@ -534,11 +567,30 @@ def popen_launch_server(
534
567
  if return_stdout_stderr:
535
568
  process = subprocess.Popen(
536
569
  command,
537
- stdout=return_stdout_stderr[0],
538
- stderr=return_stdout_stderr[1],
570
+ stdout=subprocess.PIPE,
571
+ stderr=subprocess.PIPE,
539
572
  env=env,
540
573
  text=True,
574
+ bufsize=1,
541
575
  )
576
+
577
+ def _dump(src, sinks):
578
+ for line in iter(src.readline, ""):
579
+ for sink in sinks:
580
+ sink.write(line)
581
+ sink.flush()
582
+ src.close()
583
+
584
+ threading.Thread(
585
+ target=_dump,
586
+ args=(process.stdout, [return_stdout_stderr[0], sys.stdout]),
587
+ daemon=True,
588
+ ).start()
589
+ threading.Thread(
590
+ target=_dump,
591
+ args=(process.stderr, [return_stdout_stderr[1], sys.stderr]),
592
+ daemon=True,
593
+ ).start()
542
594
  else:
543
595
  process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
544
596
 
@@ -842,6 +894,154 @@ def run_bench_serving(
842
894
  return res
843
895
 
844
896
 
897
def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Launch a server for ``model`` and benchmark its ``/v1/score`` endpoint.

    Follows the same launch/teardown pattern as ``run_bench_serving``: the
    server is started, ``num_requests`` score requests are sent one after
    another, and the server process tree is always killed afterwards.

    Args:
        model: Model path/name; used to launch the server, to load the
            tokenizer and as the ``model`` field of every request.
        num_requests: Number of benchmark requests to send.
        batch_size: Number of items scored per request.
        other_server_args: Extra arguments forwarded to the server launch
            (``None`` means no extra arguments).
        need_warmup: When true, send one best-effort 3-item request first;
            its outcome is ignored and it is not included in the timings.
        device: ``"auto"`` triggers ``auto_config_device()``.

    Returns:
        A dict with ``completed``, ``total_requests``, ``throughput``
        (successful requests per second of wall time), ``avg_latency_ms``,
        ``p95_latency_ms`` and ``successful_requests``. All metrics are ``0``
        when no request succeeded.
    """
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        # NOTE(review): the resolved device is not forwarded to
        # popen_launch_server below; the call is kept to preserve the
        # original behaviour. Confirm whether it should be passed on.
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"
        score_url = f"{base_url}/v1/score"

        def generate_text_with_token_count(num_tokens):
            """Generate text with precise token count using replicated token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                # The marker is not a single token for this tokenizer: scale
                # the repeat count down by the marker's own token length.
                text = special_token * (
                    num_tokens
                    // len(tokenizer.encode(special_token, add_special_tokens=False))
                )
            return text

        def build_score_payload(num_items):
            """Build one score request body with ``num_items`` items."""
            return {
                "query": generate_text_with_token_count(score_query_tokens),
                "items": [
                    generate_text_with_token_count(score_item_tokens)
                    for _ in range(num_items)
                ],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

        if need_warmup:
            warmup_data = build_score_payload(3)

            async with aiohttp.ClientSession() as session:
                try:
                    # ``async with`` releases the response/connection promptly.
                    async with session.post(
                        score_url,
                        json=warmup_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ):
                        pass
                except Exception:
                    # Warmup is best-effort. Only ``Exception`` is swallowed so
                    # that KeyboardInterrupt / task cancellation still propagate.
                    pass

        test_requests = [build_score_payload(batch_size) for _ in range(num_requests)]

        start_time = time.monotonic()
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        score_url,
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status != 200:
                            continue
                        response_data = await response.json()
                        request_end = time.monotonic()

                        # A request only counts when the body carries results.
                        if "scores" in response_data or "logprobs" in response_data:
                            latencies.append((request_end - request_start) * 1000)
                except Exception:
                    # Failed requests are simply not counted.
                    continue

        total_time = time.monotonic() - start_time
        successful_requests = len(latencies)

        if successful_requests > 0:
            throughput = successful_requests / total_time
            avg_latency = sum(latencies) / successful_requests
            latencies.sort()
            p95_latency = latencies[int(len(latencies) * 0.95)]
        else:
            throughput = 0
            avg_latency = 0
            p95_latency = 0

        return {
            "completed": successful_requests,
            "total_requests": num_requests,
            "throughput": throughput,
            "avg_latency_ms": avg_latency,
            "p95_latency_ms": p95_latency,
            "successful_requests": successful_requests,
        }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        # Always tear the server down, even if the benchmark raised.
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res
1043
+
1044
+
845
1045
  def run_bench_serving_multi(
846
1046
  model,
847
1047
  base_url,
@@ -1363,6 +1563,41 @@ async def send_concurrent_generate_requests(
1363
1563
  return await asyncio.gather(*tasks)
1364
1564
 
1365
1565
 
1566
async def send_concurrent_generate_requests_with_custom_params(
    base_url: str,
    custom_params: List[dict[str, Any]],
) -> List[Tuple[int, Any]]:
    """Send one ``/generate`` request per entry of ``custom_params``, concurrently.

    Every request starts from the same base payload (a short fixed prompt,
    temperature 0, at most 50 new tokens). The entry's keys are merged over
    it, replacing any top-level key of the same name.

    Args:
        base_url: Server root URL; ``/generate`` is appended to it.
        custom_params: Per-request top-level payload overrides. All requests
            are started at once, so the concurrency equals
            ``len(custom_params)``.

    Returns:
        A list of ``(status_code, response_json)`` tuples, in the same order
        as ``custom_params``.
    """

    base_payload = {
        "text": """
        System: You are a helpful assistant.
        User: What is the capital of France?
        Assistant: The capital of France is
        """,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 50,
        },
    }

    async def async_generate_with_priority(req):
        # Each request uses its own session, so requests never share a
        # connection pool.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{base_url}/generate",
                json=req,
            ) as response:
                resp_json = await response.json()
                return (response.status, resp_json)

    tasks = []
    for overrides in custom_params:
        # Shallow merge: the overrides replace whole top-level keys and the
        # shared base payload itself is never mutated.
        req = {**base_payload, **overrides}
        tasks.append(asyncio.create_task(async_generate_with_priority(req)))
    return await asyncio.gather(*tasks)
1599
+
1600
+
1366
1601
  class CustomTestCase(unittest.TestCase):
1367
1602
  def _callTestMethod(self, method):
1368
1603
  max_retry = int(
@@ -1404,3 +1639,146 @@ def dump_bench_raw_result(
1404
1639
  def _ensure_remove_suffix(text: str, suffix: str):
1405
1640
  assert text.endswith(suffix)
1406
1641
  return text.removesuffix(suffix)
1642
+
1643
+
1644
class ModelDeploySetup:
    """A model path together with the launch arguments used to deploy it.

    ``--enable-multimodal`` and ``--trust-remote-code`` are always present in
    ``extra_args``; they are appended when the caller did not supply them.
    """

    def __init__(self, model_path: str, extra_args: "List[str] | None" = None):
        """
        Args:
            model_path: Path or identifier of the model.
            extra_args: Additional launch arguments. The list is copied, so
                the caller's list is never modified.
        """
        self.model_path = model_path

        # Work on a private copy: a shared default list (or the caller's own
        # list) must not accumulate flags across instances.
        args = list(extra_args) if extra_args is not None else []
        for required_flag in ("--enable-multimodal", "--trust-remote-code"):
            if required_flag not in args:
                args.append(required_flag)

        self.extra_args = args
1653
+
1654
+
1655
class ModelEvalMetrics:
    """Outcome of one model evaluation: an accuracy value and the time it took."""

    def __init__(self, accuracy: float, eval_time: float):
        # Both values are stored exactly as given; nothing is validated or
        # converted here.
        self.accuracy, self.eval_time = accuracy, eval_time
1659
+
1660
+
1661
def extract_trace_link_from_bench_one_batch_server_output(output: str) -> "str | None":
    """Extract the target of the first ``[Profile](...)`` markdown link in ``output``.

    Args:
        output: Text to search.

    Returns:
        The text between the parentheses of the first ``[Profile](...)``
        occurrence, or ``None`` when ``output`` contains no such link.
    """
    match = re.search(r"\[Profile\]\((.*?)\)", output)
    return match.group(1) if match else None
1667
+
1668
+
1669
def parse_models(model_string: str):
    """Split a comma-separated model list into trimmed, non-empty names."""
    names = []
    for piece in model_string.split(","):
        trimmed = piece.strip()
        # Drop empty entries such as those produced by ",," or a trailing comma.
        if trimmed:
            names.append(trimmed)
    return names
1671
+
1672
+
1673
def check_evaluation_test_results(
    results,
    test_name,
    model_accuracy_thresholds,
    model_latency_thresholds=None,
    model_count=None,
):
    """Compare evaluation results against thresholds, report, and fail on any miss.

    A markdown table with one row per model in ``model_accuracy_thresholds``
    (sorted by model name) is printed and, when running in CI, also written
    to the GitHub step summary.

    Args:
        results: List of ``(model_path, accuracy, latency)`` tuples.
        test_name: Heading used for the GitHub step summary.
        model_accuracy_thresholds: Mapping of model to minimum accepted score.
        model_latency_thresholds: Optional mapping of model to maximum
            accepted latency. Latency columns are shown only when it is
            given. A model without an entry (or with a ``None`` entry) gets
            no latency limit.
        model_count: Not used by this function.

    Raises:
        AssertionError: If any model misses a threshold or has no result;
            the message lists every failure.
    """
    # Sentinel threshold meaning "no latency limit".
    no_latency_limit = 1e9
    with_latency = model_latency_thresholds is not None

    failed_models = []
    if with_latency:
        summary = " | model | status | score | score_threshold | latency | latency_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | ------- | ----------------- | \n"
    else:
        summary = " | model | status | score | score_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | \n"

    results_dict = {res[0]: (res[1], res[2]) for res in results}

    for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
        # A model may be missing from the latency mapping; comparing against
        # ``None`` would raise TypeError, so fall back to the sentinel.
        latency_threshold = no_latency_limit
        if with_latency:
            configured = model_latency_thresholds.get(model)
            if configured is not None:
                latency_threshold = configured

        if model in results_dict:
            accuracy, latency = results_dict[model]
            is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
            status_emoji = "✅" if is_success else "❌"

            if not is_success:
                if accuracy < accuracy_threshold:
                    failed_models.append(
                        f"\nScore Check Failed: {model}\n"
                        f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
                    )
                if latency > latency_threshold:
                    failed_models.append(
                        f"\nLatency Check Failed: {model}\n"
                        f"Model {model} latency ({latency:.4f}) is above threshold ({latency_threshold:.4f})"
                    )

            if with_latency:
                line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
            else:
                line = (
                    f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
                )
        else:
            # No result at all for a model that has a threshold.
            status_emoji = "❌"
            failed_models.append(f"Model failed to launch or be evaluated: {model}")
            if with_latency:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
            else:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"

        summary += line

    print(summary)

    if is_in_ci():
        write_github_step_summary(f"## {test_name}\n{summary}")

    if failed_models:
        print("Some models failed the evaluation.")
        raise AssertionError("\n".join(failed_models))
1741
+
1742
+
1743
+ # Bench knobs for bench_one_batch_server (override by env)
1744
+ def _parse_int_list_env(name: str, default_val: str):
1745
+ val = os.environ.get(name, default_val)
1746
+ return [int(x) for x in val.split(",") if x]
1747
+
1748
+
1749
+ # Return filenames
1750
def find_traces_under_path(path: str) -> List[str]:
    """Collect the names of all ``*.trace.json.gz`` files anywhere below ``path``.

    Only bare file names are returned, without their directory, in the order
    ``os.walk`` visits them.
    """
    return [
        filename
        for _root, _subdirs, filenames in os.walk(path)
        for filename in filenames
        if filename.endswith(".trace.json.gz")
    ]
1757
+
1758
+
1759
def write_results_to_json(model, metrics, mode="a"):
    """Record one evaluation result in ``results.json`` in the working directory.

    The file holds a JSON list of result objects and is rewritten in full on
    every call.

    Args:
        model: Model identifier stored with the result.
        metrics: Mapping of metric values. It must contain ``"score"``; a
            ``"latency"`` entry is copied to the top level when present.
        mode: ``"a"`` appends to the results already in the file; any other
            value starts a fresh list.

    Raises:
        KeyError: If ``metrics`` has no ``"score"`` entry.
    """
    result = {
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "metrics": metrics,
        "score": metrics["score"],
    }

    if "latency" in metrics:
        # Store the value itself. A trailing comma here would wrap it in a
        # 1-tuple, which JSON serializes as a one-element array.
        result["latency"] = metrics["latency"]

    existing_results = []
    if mode == "a" and os.path.exists("results.json"):
        try:
            with open("results.json", "r") as f:
                existing_results = json.load(f)
        except json.JSONDecodeError:
            # An unreadable file is treated as empty rather than aborting.
            existing_results = []

    if isinstance(existing_results, list):
        existing_results.append(result)
    else:
        # The file held valid JSON that was not a list; start over.
        existing_results = [result]

    with open("results.json", "w") as f:
        json.dump(existing_results, f, indent=2)
sglang/utils.py CHANGED
@@ -6,6 +6,7 @@ import logging
6
6
  import os
7
7
  import random
8
8
  import socket
9
+ import ssl
9
10
  import subprocess
10
11
  import sys
11
12
  import time
@@ -155,7 +156,15 @@ def http_request(
155
156
  data = bytes(dumps(json), encoding="utf-8")
156
157
 
157
158
  try:
158
- resp = urllib.request.urlopen(req, data=data, cafile=verify)
159
+ if sys.version_info >= (3, 13):
160
+ # Python 3.13+: Use SSL context (cafile removed)
161
+ if verify and isinstance(verify, str):
162
+ context = ssl.create_default_context(cafile=verify)
163
+ else:
164
+ context = ssl.create_default_context()
165
+ resp = urllib.request.urlopen(req, data=data, context=context)
166
+ else:
167
+ resp = urllib.request.urlopen(req, data=data, cafile=verify)
159
168
  return HttpResponse(resp)
160
169
  except urllib.error.HTTPError as e:
161
170
  return HttpResponse(e)
@@ -472,11 +481,22 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
472
481
class TypeBasedDispatcher:
    """Route an object to the handler registered for its type.

    Handlers are tried in registration order; the first entry whose type
    matches via ``isinstance`` wins. An optional fallback handles objects
    that match no entry.
    """

    def __init__(self, mapping: List[Tuple[Type, Callable]]):
        # Copy the list so that ``+=`` on this dispatcher never mutates the
        # list object the caller passed in.
        self._mapping = list(mapping)
        self._fallback_fn = None

    def add_fallback_fn(self, fallback_fn: Callable):
        """Set the handler called when no registered type matches."""
        self._fallback_fn = fallback_fn

    def __iadd__(self, other: "TypeBasedDispatcher"):
        """Append ``other``'s type/handler pairs after this dispatcher's own.

        Only the mapping is merged; ``other``'s fallback is not taken over.
        """
        self._mapping.extend(other._mapping)
        return self

    def __call__(self, obj: Any):
        """Dispatch ``obj`` and return the chosen handler's result.

        Raises:
            ValueError: If no type matches and no fallback is set.
        """
        for ty, fn in self._mapping:
            if isinstance(obj, ty):
                return fn(obj)

        if self._fallback_fn is not None:
            return self._fallback_fn(obj)
        raise ValueError(f"Invalid object: {obj}")
481
501
 
482
502
 
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.2rc2"
1
+ __version__ = "0.5.3rc2"