sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/environ.py CHANGED
@@ -113,8 +113,7 @@ class Envs:
113
113
 
114
114
  # Test & Debug
115
115
  SGLANG_IS_IN_CI = EnvBool(False)
116
- SGLANG_AMD_CI = EnvBool(False)
117
- SGLANG_TEST_RETRACT = EnvBool(False)
116
+ SGLANG_IS_IN_CI_AMD = EnvBool(False)
118
117
  SGLANG_SET_CPU_AFFINITY = EnvBool(False)
119
118
  SGLANG_PROFILE_WITH_STACK = EnvBool(True)
120
119
  SGLANG_RECORD_STEP_TIME = EnvBool(False)
@@ -128,8 +127,26 @@ class Envs:
128
127
  SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
129
128
  SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
130
129
 
130
+ # Scheduler: memory leak test
131
+ SGLANG_TEST_RETRACT = EnvBool(False)
132
+ SGLANG_TEST_RETRACT_INTERVAL = EnvInt(3)
133
+ SGLANG_ENABLE_RUNTIME_MEM_LEAK_CHECK = EnvBool(False)
134
+
135
+ # Scheduler: new token ratio hyperparameters
136
+ SGLANG_INIT_NEW_TOKEN_RATIO = EnvFloat(0.7)
137
+ SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR = EnvFloat(0.14)
138
+ SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS = EnvInt(600)
139
+ SGLANG_RETRACT_DECODE_STEPS = EnvInt(20)
140
+
141
+ # Scheduler: others:
142
+ SGLANG_EMPTY_CACHE_INTERVAL = EnvFloat(-1) # in seconds. Set if you observe high memory accumulation over a long serving period.
143
+ # Test: pd-disaggregation
144
+ SGLANG_TEST_PD_DISAGG_BACKEND = EnvStr("mooncake")
145
+ SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None)
146
+
131
147
  # Model Parallel
132
148
  SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)
149
+ SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS = EnvBool(False)
133
150
 
134
151
  # Constrained Decoding
135
152
  SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
@@ -145,6 +162,7 @@ class Envs:
145
162
  # AMD & ROCm
146
163
  SGLANG_USE_AITER = EnvBool(False)
147
164
  SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)
165
+ SGLANG_ROCM_DISABLE_LINEARQUANT = EnvBool(False)
148
166
 
149
167
  # Quantization
150
168
  SGLANG_INT4_WEIGHT = EnvBool(False)
@@ -155,6 +173,7 @@ class Envs:
155
173
  # Flashinfer
156
174
  SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
157
175
  SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)
176
+ SGLANG_FLASHINFER_WORKSPACE_SIZE = EnvInt(384 * 1024 * 1024)
158
177
 
159
178
  # Triton
160
179
  SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
@@ -167,6 +186,7 @@ class Envs:
167
186
  SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
168
187
  SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
169
188
  SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)
189
+ SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR = EnvStr("/tmp")
170
190
 
171
191
  # TBO
172
192
  SGLANG_TBO_DEBUG = EnvBool(False)
@@ -183,12 +203,12 @@ class Envs:
183
203
  # sgl-kernel
184
204
  SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)
185
205
 
186
- # vLLM dependencies
206
+ # vLLM dependencies (TODO: they have been deprecated, we can remove them safely)
187
207
  USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
188
208
  USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)
189
209
 
190
210
  USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
191
- RETURN_ORIGINAL_LOGPROB = EnvBool(False)
211
+ SGLANG_RETURN_ORIGINAL_LOGPROB = EnvBool(False)
192
212
  SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
193
213
  SGLANG_MOE_PADDING = EnvBool(False)
194
214
  SGLANG_CUTLASS_MOE = EnvBool(False)
@@ -207,6 +227,24 @@ class Envs:
207
227
  SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
208
228
  SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
209
229
 
230
+ # Overlap Spec V2
231
+ SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False)
232
+
233
+ # VLM
234
+ SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
235
+ SGLANG_RESIZE_RESAMPLE = EnvStr("")
236
+
237
+ # Ktransformers
238
+ SGLANG_KT_MOE_NUM_GPU_EXPERTS = EnvInt(None)
239
+ SGLANG_KT_MOE_CPUINFER = EnvInt(None)
240
+ SGLANG_KT_THREADPOOL_COUNT = EnvInt(None)
241
+ SGLANG_KT_MOE_AMX_WEIGHT_PATH = EnvStr(None)
242
+ SGLANG_KT_AMX_METHOD = EnvStr(None)
243
+ SGLANG_KT_MOE_CHUNKED_PREFILL_SIZE = EnvInt(None)
244
+
245
+ # Sparse Embeddings
246
+ SGLANG_EMBEDDINGS_SPARSE_HEAD = EnvStr(None)
247
+
210
248
  # fmt: on
211
249
 
212
250
 
sglang/srt/eplb/eplb_algorithms/__init__.py CHANGED
@@ -3,7 +3,8 @@ from typing import Optional
3
3
 
4
4
  import torch
5
5
 
6
- from sglang.srt.eplb.eplb_algorithms import deepseek, deepseek_vec
6
+ from sglang.srt.elastic_ep.elastic_ep import ElasticEPStateManager
7
+ from sglang.srt.eplb.eplb_algorithms import deepseek, deepseek_vec, elasticity_aware
7
8
 
8
9
 
9
10
  class EplbAlgorithm(Enum):
@@ -11,6 +12,7 @@ class EplbAlgorithm(Enum):
11
12
  deepseek_hierarchical = auto()
12
13
  deepseek_vec = auto()
13
14
  deepseek_vec_hierarchical = auto()
15
+ elasticity_aware = auto()
14
16
  # TODO may have more algorithm later
15
17
 
16
18
 
@@ -45,6 +47,21 @@ def rebalance_experts(
45
47
  enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
46
48
  )
47
49
 
50
+ if algorithm == EplbAlgorithm.elasticity_aware:
51
+ return elasticity_aware.rebalance_experts(
52
+ weight=tokens_per_expert.sum(dim=0),
53
+ num_replicas=num_physical_experts,
54
+ num_groups=num_groups,
55
+ num_nodes=num_nodes,
56
+ num_gpus=num_physical_experts // num_local_physical_experts,
57
+ enable_hierarchical=True,
58
+ active_ranks=(
59
+ ElasticEPStateManager.instance().active_ranks
60
+ if ElasticEPStateManager.instance() is not None
61
+ else ElasticEPStateManager.healthy_rank_state()
62
+ ),
63
+ )
64
+
48
65
  raise NotImplementedError
49
66
 
50
67
 
sglang/srt/eplb/eplb_algorithms/deepseek.py CHANGED
@@ -3,8 +3,6 @@ from typing import Tuple
3
3
 
4
4
  import torch
5
5
 
6
- from sglang.srt.utils import get_bool_env_var
7
-
8
6
 
9
7
  def balanced_packing(
10
8
  weight: torch.Tensor, num_packs: int
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py ADDED
@@ -0,0 +1,87 @@
1
+ from typing import Tuple
2
+
3
+ import torch
4
+
5
+ from sglang.srt.eplb.eplb_algorithms.deepseek import rebalance_experts_hierarchical
6
+
7
+
8
+ def rebalance_experts(
9
+ weight: torch.Tensor,
10
+ num_replicas: int,
11
+ num_groups: int,
12
+ num_nodes: int,
13
+ num_gpus: int,
14
+ enable_hierarchical: bool,
15
+ active_ranks: torch.Tensor,
16
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
17
+ """
18
+ Entry point for expert-parallelism load balancer.
19
+
20
+ Parameters:
21
+ weight: [layers, num_logical_experts], the load statistics for all logical experts
22
+ num_replicas: number of physical experts, must be a multiple of `num_gpus`
23
+ num_groups: number of expert groups
24
+ num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
25
+ num_gpus: number of GPUs, must be a multiple of `num_nodes`
26
+
27
+ Returns:
28
+ physical_to_logical_map: [layers, num_replicas], the expert index of each replica
29
+ logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
30
+ expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
31
+ """
32
+
33
+ num_layers, num_logical_experts = weight.shape
34
+ weight = weight.float().cpu()
35
+ num_active_ranks = active_ranks.sum().item()
36
+ num_local_experts = num_replicas // num_gpus
37
+ if num_active_ranks < num_gpus:
38
+ # Must fall back to global load-balance policy
39
+ # and fix some params
40
+ phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
41
+ weight,
42
+ num_local_experts * num_active_ranks,
43
+ 1,
44
+ 1,
45
+ num_active_ranks,
46
+ )
47
+ elif enable_hierarchical:
48
+ # use hierarchical load-balance policy
49
+ phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
50
+ weight, num_replicas, num_groups, num_nodes, num_gpus
51
+ )
52
+ else:
53
+ # use global load-balance policy
54
+ phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
55
+ weight, num_replicas, 1, 1, num_gpus
56
+ )
57
+ maxlogcnt = logcnt.max().item()
58
+ log2phy: torch.Tensor = torch.full(
59
+ (num_layers, num_logical_experts, maxlogcnt),
60
+ -1,
61
+ dtype=torch.int64,
62
+ device=logcnt.device,
63
+ )
64
+ log2phy.view(num_layers, -1).scatter_(
65
+ -1,
66
+ phy2log * maxlogcnt + phyrank,
67
+ torch.arange(
68
+ num_local_experts * num_active_ranks,
69
+ dtype=torch.int64,
70
+ device=log2phy.device,
71
+ ).expand(num_layers, -1),
72
+ )
73
+ if num_active_ranks < num_gpus:
74
+ phy2log_slices = list(
75
+ phy2log.view(num_layers, num_active_ranks, -1).unbind(dim=1)
76
+ )
77
+ active_ranks_list = active_ranks.tolist()
78
+ for idx, active_rank in enumerate(active_ranks_list):
79
+ if not active_rank:
80
+ phy2log_slices.insert(idx, torch.zeros_like(phy2log_slices[0]))
81
+ log2phy = torch.where(
82
+ log2phy >= idx * num_local_experts,
83
+ log2phy + num_local_experts,
84
+ log2phy,
85
+ )
86
+ phy2log = torch.stack(phy2log_slices, dim=1).contiguous().view(num_layers, -1)
87
+ return phy2log, log2phy, logcnt
@@ -16,21 +16,20 @@ from __future__ import annotations
16
16
 
17
17
  import logging
18
18
  import math
19
- import os
20
19
  import time
21
20
  from abc import ABC
22
21
  from collections import deque
23
22
  from contextlib import contextmanager
24
- from pathlib import Path
25
23
  from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
26
24
 
27
25
  import einops
28
26
  import torch
29
27
  import torch.distributed
30
28
 
29
+ from sglang.srt.environ import envs
31
30
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
32
31
  from sglang.srt.server_args import ServerArgs
33
- from sglang.srt.utils import Withable, get_bool_env_var, is_npu
32
+ from sglang.srt.utils import Withable, is_npu
34
33
 
35
34
  _is_npu = is_npu()
36
35
 
@@ -839,7 +838,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
839
838
 
840
839
 
841
840
  def _dump_to_file(name, data):
842
- save_dir = Path(os.environ.get("SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR", "/tmp"))
841
+ save_dir = envs.SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR.get()
843
842
  path_output = save_dir / name
844
843
  logger.info(f"Write expert distribution to {path_output}")
845
844
  if not save_dir.exists():
@@ -18,7 +18,7 @@ from typing import Literal, Optional
18
18
  import torch
19
19
 
20
20
  from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
21
- from sglang.srt.managers.schedule_batch import global_server_args_dict
21
+ from sglang.srt.server_args import get_global_server_args
22
22
 
23
23
 
24
24
  @dataclass
@@ -34,7 +34,7 @@ class ExpertLocationDispatchInfo:
34
34
 
35
35
  @classmethod
36
36
  def init_new(cls, layer_id: int):
37
- ep_dispatch_algorithm = global_server_args_dict["ep_dispatch_algorithm"]
37
+ ep_dispatch_algorithm = get_global_server_args().ep_dispatch_algorithm
38
38
  expert_location_metadata = get_global_expert_location_metadata()
39
39
  assert expert_location_metadata is not None
40
40
 
@@ -24,7 +24,7 @@ from sglang.srt.eplb.expert_location import (
24
24
  ExpertLocationMetadata,
25
25
  get_global_expert_location_metadata,
26
26
  )
27
- from sglang.srt.managers.schedule_batch import global_server_args_dict
27
+ from sglang.srt.server_args import get_global_server_args
28
28
  from sglang.srt.utils import get_bool_env_var
29
29
 
30
30
  logger = logging.getLogger(__name__)
@@ -97,7 +97,7 @@ def _update_expert_weights_with_canary(
97
97
  canary_tensor = (
98
98
  _get_canary_value(old_expert_location_metadata, layer_id)
99
99
  .clone()
100
- .to(device=global_server_args_dict["device"], non_blocking=True)
100
+ .to(device=get_global_server_args().device, non_blocking=True)
101
101
  )
102
102
  routed_experts_weights_of_layer[layer_id].append(canary_tensor)
103
103
 
@@ -3,6 +3,7 @@ import logging
3
3
  from abc import ABC, abstractmethod
4
4
  from typing import Any, Dict, List
5
5
 
6
+ import orjson
6
7
  from partial_json_parser.core.exceptions import MalformedJSON
7
8
  from partial_json_parser.core.options import Allow
8
9
 
@@ -96,7 +97,7 @@ class BaseFormatDetector(ABC):
96
97
  Parses the text in one go. Returns success=True if the format matches, otherwise False.
97
98
  Note that leftover_text here represents "content that this parser will not consume further".
98
99
  """
99
- action = json.loads(text)
100
+ action = orjson.loads(text)
100
101
  return StreamingParseResult(calls=self.parse_base_json(action, tools))
101
102
 
102
103
  def _ends_with_partial_token(self, buffer: str, bot_token: str) -> int:
@@ -264,12 +265,6 @@ class BaseFormatDetector(ABC):
264
265
  # Only remove the processed portion, keep unprocessed content
265
266
  self._buffer = current_text[start_idx + end_idx :]
266
267
 
267
- if self.current_tool_id < len(self.prev_tool_call_arr):
268
- self.prev_tool_call_arr[self.current_tool_id].clear()
269
- self.current_tool_name_sent = False
270
- self.streamed_args_for_tool[self.current_tool_id] = ""
271
- self.current_tool_id += 1
272
-
273
268
  # If the tool is still being parsed, send incremental changes
274
269
  elif prev_arguments:
275
270
  prev_args_json = json.dumps(prev_arguments)
@@ -277,6 +272,20 @@ class BaseFormatDetector(ABC):
277
272
  prefix = _find_common_prefix(prev_args_json, cur_args_json)
278
273
  argument_diff = prefix[sent:]
279
274
 
275
+ # Update prev_tool_call_arr with current state
276
+ if self.current_tool_id >= 0:
277
+ # Ensure prev_tool_call_arr is large enough
278
+ while len(self.prev_tool_call_arr) <= self.current_tool_id:
279
+ self.prev_tool_call_arr.append({})
280
+ self.prev_tool_call_arr[self.current_tool_id] = (
281
+ current_tool_call
282
+ )
283
+
284
+ # Advance to next tool if complete
285
+ if is_current_complete:
286
+ self.current_tool_name_sent = False
287
+ self.current_tool_id += 1
288
+
280
289
  # Send the argument diff if there's something new
281
290
  if argument_diff is not None:
282
291
  # Use the correct tool_index: completing_tool_id for completed tools, current_tool_id for ongoing
@@ -293,17 +302,7 @@ class BaseFormatDetector(ABC):
293
302
  )
294
303
  ],
295
304
  )
296
- if not is_current_complete:
297
- self.streamed_args_for_tool[
298
- self.current_tool_id
299
- ] += argument_diff
300
-
301
- # Update prev_tool_call_arr with current state
302
- if self.current_tool_id >= 0:
303
- # Ensure prev_tool_call_arr is large enough
304
- while len(self.prev_tool_call_arr) <= self.current_tool_id:
305
- self.prev_tool_call_arr.append({})
306
- self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
305
+ self.streamed_args_for_tool[tool_index_to_use] += argument_diff
307
306
 
308
307
  return res
309
308
 
@@ -1,10 +1,11 @@
1
1
  import logging
2
- from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Type, Union
2
+ from typing import Dict, List, Literal, Optional, Set, Tuple, Type, Union
3
3
 
4
4
  from sglang.srt.entrypoints.openai.protocol import (
5
- StructuralTagResponseFormat,
5
+ LegacyStructuralTagResponseFormat,
6
6
  StructuresResponseFormat,
7
7
  Tool,
8
+ ToolCallConstraint,
8
9
  ToolChoice,
9
10
  )
10
11
  from sglang.srt.function_call.base_format_detector import BaseFormatDetector
@@ -35,21 +36,22 @@ class FunctionCallParser:
35
36
  """
36
37
 
37
38
  ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
38
- "llama3": Llama32Detector,
39
- "qwen25": Qwen25Detector,
40
- "mistral": MistralDetector,
41
39
  "deepseekv3": DeepSeekV3Detector,
42
40
  "deepseekv31": DeepSeekV31Detector,
43
- "pythonic": PythonicDetector,
41
+ "glm": Glm4MoeDetector,
42
+ "glm45": Glm4MoeDetector,
43
+ "gpt-oss": GptOssDetector,
44
44
  "kimi_k2": KimiK2Detector,
45
+ "llama3": Llama32Detector,
46
+ "mistral": MistralDetector,
47
+ "pythonic": PythonicDetector,
48
+ "qwen": Qwen25Detector,
49
+ "qwen25": Qwen25Detector,
45
50
  "qwen3_coder": Qwen3CoderDetector,
46
- "glm45": Glm4MoeDetector,
47
51
  "step3": Step3Detector,
48
- "gpt-oss": GptOssDetector,
49
52
  }
50
53
 
51
54
  def __init__(self, tools: List[Tool], tool_call_parser: str):
52
- detector: Type[BaseFormatDetector] = None
53
55
  detector_class = self.ToolCallParserEnum.get(tool_call_parser)
54
56
  if detector_class:
55
57
  detector = detector_class()
@@ -121,7 +123,7 @@ class FunctionCallParser:
121
123
 
122
124
  return final_normal_text, final_calls
123
125
 
124
- def get_structure_tag(self) -> StructuralTagResponseFormat:
126
+ def get_structure_tag(self) -> LegacyStructuralTagResponseFormat:
125
127
  """
126
128
  Generate a structural tag response format for all available tools.
127
129
 
@@ -149,7 +151,9 @@ class FunctionCallParser:
149
151
  )
150
152
  tool_trigger_set.add(info.trigger)
151
153
 
152
- return StructuralTagResponseFormat(
154
+ # TODO(dark): move this into new structural tag format
155
+ # This requires all grammar backend support the new format
156
+ return LegacyStructuralTagResponseFormat(
153
157
  type="structural_tag",
154
158
  structures=tool_structures,
155
159
  triggers=list(tool_trigger_set),
@@ -157,7 +161,7 @@ class FunctionCallParser:
157
161
 
158
162
  def get_structure_constraint(
159
163
  self, tool_choice: Union[ToolChoice, Literal["auto", "required"]]
160
- ) -> Optional[Tuple[str, Any]]:
164
+ ) -> Optional[ToolCallConstraint]:
161
165
  """
162
166
  Returns the appropriate structure constraint for tool calls based on the tool_choice.
163
167
  The constraint is used to guide the model's output format.
@@ -176,8 +180,8 @@ class FunctionCallParser:
176
180
  and tool_choice == "auto"
177
181
  and any(tool.function.strict for tool in self.tools)
178
182
  ):
179
- strict_tag = self.get_structure_tag()
180
- return ("structural_tag", strict_tag)
183
+ tag = self.get_structure_tag()
184
+ return ("structural_tag", tag)
181
185
  elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
182
186
  json_schema = get_json_schema_constraint(self.tools, tool_choice)
183
187
  return ("json_schema", json_schema)
@@ -6,11 +6,7 @@ from typing import List
6
6
 
7
7
  from sglang.srt.entrypoints.openai.protocol import Tool
8
8
  from sglang.srt.function_call.base_format_detector import BaseFormatDetector
9
- from sglang.srt.function_call.core_types import (
10
- StreamingParseResult,
11
- StructureInfo,
12
- _GetInfoFunc,
13
- )
9
+ from sglang.srt.function_call.core_types import StreamingParseResult, _GetInfoFunc
14
10
  from sglang.srt.function_call.ebnf_composer import EBNFComposer
15
11
 
16
12
  logger = logging.getLogger(__name__)
@@ -31,7 +31,7 @@ class GptOssDetector(BaseFormatDetector):
31
31
 
32
32
  # Pattern to extract function name and JSON from tool_call event content
33
33
  self.tool_extract_pattern = re.compile(
34
- r"to=([a-zA-Z_][a-zA-Z0-9_.]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
34
+ r"to=([a-zA-Z_][a-zA-Z0-9_.-]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
35
35
  re.DOTALL,
36
36
  )
37
37
 
@@ -1,5 +1,3 @@
1
- import json
2
- import re
3
1
  from typing import List
4
2
 
5
3
  from sglang.srt.entrypoints.openai.protocol import Tool
@@ -1,8 +1,8 @@
1
- import json
2
1
  from json import JSONDecodeError, JSONDecoder
3
2
  from json.decoder import WHITESPACE
4
3
  from typing import Any, List, Literal, Optional, Tuple, Union
5
4
 
5
+ import orjson
6
6
  import partial_json_parser
7
7
  from partial_json_parser.core.options import Allow
8
8
 
@@ -51,7 +51,7 @@ def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
51
51
 
52
52
  def _is_complete_json(input_str: str) -> bool:
53
53
  try:
54
- json.loads(input_str)
54
+ orjson.loads(input_str)
55
55
  return True
56
56
  except JSONDecodeError:
57
57
  return False
@@ -16,7 +16,7 @@ Options:
16
16
  --proto-file Specify proto file (default: sglang_scheduler.proto)
17
17
 
18
18
  ### Install Dependencies
19
- pip install "grpcio==1.74.0" "grpcio-tools==1.74.0"
19
+ pip install "grpcio==1.75.1" "grpcio-tools==1.75.1"
20
20
 
21
21
  ### Run Script
22
22
  cd python/sglang/srt/grpc
@@ -30,7 +30,7 @@ import sys
30
30
  from importlib.metadata import version
31
31
  from pathlib import Path
32
32
 
33
- GRPC_VERSION = "1.74.0"
33
+ GRPC_VERSION = "1.75.1"
34
34
 
35
35
 
36
36
  def get_file_mtime(path: Path) -> float:
@@ -70,7 +70,7 @@ def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> b
70
70
 
71
71
  # Check if grpc_tools is available
72
72
  try:
73
- import grpc_tools.protoc
73
+ import grpc_tools.protoc # noqa: F401
74
74
  except ImportError:
75
75
  print("Error: grpcio-tools not installed")
76
76
  print(