sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,11 @@
1
1
  import logging
2
- from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Type, Union
2
+ from typing import Dict, List, Literal, Optional, Set, Tuple, Type, Union
3
3
 
4
4
  from sglang.srt.entrypoints.openai.protocol import (
5
- StructuralTagResponseFormat,
5
+ LegacyStructuralTagResponseFormat,
6
6
  StructuresResponseFormat,
7
7
  Tool,
8
+ ToolCallConstraint,
8
9
  ToolChoice,
9
10
  )
10
11
  from sglang.srt.function_call.base_format_detector import BaseFormatDetector
@@ -20,6 +21,7 @@ from sglang.srt.function_call.pythonic_detector import PythonicDetector
20
21
  from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
21
22
  from sglang.srt.function_call.qwen25_detector import Qwen25Detector
22
23
  from sglang.srt.function_call.step3_detector import Step3Detector
24
+ from sglang.srt.function_call.utils import get_json_schema_constraint
23
25
 
24
26
  logger = logging.getLogger(__name__)
25
27
 
@@ -34,21 +36,22 @@ class FunctionCallParser:
34
36
  """
35
37
 
36
38
  ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
37
- "llama3": Llama32Detector,
38
- "qwen25": Qwen25Detector,
39
- "mistral": MistralDetector,
40
39
  "deepseekv3": DeepSeekV3Detector,
41
40
  "deepseekv31": DeepSeekV31Detector,
42
- "pythonic": PythonicDetector,
41
+ "glm": Glm4MoeDetector,
42
+ "glm45": Glm4MoeDetector,
43
+ "gpt-oss": GptOssDetector,
43
44
  "kimi_k2": KimiK2Detector,
45
+ "llama3": Llama32Detector,
46
+ "mistral": MistralDetector,
47
+ "pythonic": PythonicDetector,
48
+ "qwen": Qwen25Detector,
49
+ "qwen25": Qwen25Detector,
44
50
  "qwen3_coder": Qwen3CoderDetector,
45
- "glm45": Glm4MoeDetector,
46
51
  "step3": Step3Detector,
47
- "gpt-oss": GptOssDetector,
48
52
  }
49
53
 
50
54
  def __init__(self, tools: List[Tool], tool_call_parser: str):
51
- detector: Type[BaseFormatDetector] = None
52
55
  detector_class = self.ToolCallParserEnum.get(tool_call_parser)
53
56
  if detector_class:
54
57
  detector = detector_class()
@@ -120,7 +123,7 @@ class FunctionCallParser:
120
123
 
121
124
  return final_normal_text, final_calls
122
125
 
123
- def get_structure_tag(self) -> StructuralTagResponseFormat:
126
+ def get_structure_tag(self) -> LegacyStructuralTagResponseFormat:
124
127
  """
125
128
  Generate a structural tag response format for all available tools.
126
129
 
@@ -148,7 +151,9 @@ class FunctionCallParser:
148
151
  )
149
152
  tool_trigger_set.add(info.trigger)
150
153
 
151
- return StructuralTagResponseFormat(
154
+ # TODO(dark): move this into new structural tag format
155
+ # This requires all grammar backend support the new format
156
+ return LegacyStructuralTagResponseFormat(
152
157
  type="structural_tag",
153
158
  structures=tool_structures,
154
159
  triggers=list(tool_trigger_set),
@@ -156,7 +161,7 @@ class FunctionCallParser:
156
161
 
157
162
  def get_structure_constraint(
158
163
  self, tool_choice: Union[ToolChoice, Literal["auto", "required"]]
159
- ) -> Optional[Tuple[str, Any]]:
164
+ ) -> Optional[ToolCallConstraint]:
160
165
  """
161
166
  Returns the appropriate structure constraint for tool calls based on the tool_choice.
162
167
  The constraint is used to guide the model's output format.
@@ -175,11 +180,11 @@ class FunctionCallParser:
175
180
  and tool_choice == "auto"
176
181
  and any(tool.function.strict for tool in self.tools)
177
182
  ):
178
- strict_tag = self.get_structure_tag()
179
- return ("structural_tag", strict_tag)
183
+ tag = self.get_structure_tag()
184
+ return ("structural_tag", tag)
180
185
  elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
181
- ebnf = self.get_ebnf(tool_choice)
182
- return ("ebnf", ebnf) if ebnf is not None else None
186
+ json_schema = get_json_schema_constraint(self.tools, tool_choice)
187
+ return ("json_schema", json_schema)
183
188
 
184
189
  def get_ebnf(
185
190
  self, tool_choice: Union[ToolChoice, Literal["required"]]
@@ -6,11 +6,7 @@ from typing import List
6
6
 
7
7
  from sglang.srt.entrypoints.openai.protocol import Tool
8
8
  from sglang.srt.function_call.base_format_detector import BaseFormatDetector
9
- from sglang.srt.function_call.core_types import (
10
- StreamingParseResult,
11
- StructureInfo,
12
- _GetInfoFunc,
13
- )
9
+ from sglang.srt.function_call.core_types import StreamingParseResult, _GetInfoFunc
14
10
  from sglang.srt.function_call.ebnf_composer import EBNFComposer
15
11
 
16
12
  logger = logging.getLogger(__name__)
@@ -39,7 +35,7 @@ def parse_arguments(json_value):
39
35
 
40
36
  class Glm4MoeDetector(BaseFormatDetector):
41
37
  """
42
- Detector for GLM-4.5 models.
38
+ Detector for GLM-4.5 and GLM-4.6 models.
43
39
  Assumes function call format:
44
40
  <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
45
41
  """
@@ -53,7 +49,7 @@ class Glm4MoeDetector(BaseFormatDetector):
53
49
  self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"
54
50
 
55
51
  def has_tool_call(self, text: str) -> bool:
56
- """Check if the text contains a glm-4.5 format tool call."""
52
+ """Check if the text contains a glm-4.5 / glm-4.6 format tool call."""
57
53
  return self.bot_token in text
58
54
 
59
55
  def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
@@ -102,7 +98,7 @@ class Glm4MoeDetector(BaseFormatDetector):
102
98
  self, new_text: str, tools: List[Tool]
103
99
  ) -> StreamingParseResult:
104
100
  """
105
- Streaming incremental parsing tool calls for GLM-4.5 format.
101
+ Streaming incremental parsing tool calls for GLM-4.5 and GLM-4.6 format.
106
102
  """
107
103
  self._buffer += new_text
108
104
  current_text = self._buffer
@@ -31,7 +31,7 @@ class GptOssDetector(BaseFormatDetector):
31
31
 
32
32
  # Pattern to extract function name and JSON from tool_call event content
33
33
  self.tool_extract_pattern = re.compile(
34
- r"to=([a-zA-Z_][a-zA-Z0-9_.]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
34
+ r"to=([a-zA-Z_][a-zA-Z0-9_.-]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
35
35
  re.DOTALL,
36
36
  )
37
37
 
@@ -81,6 +81,29 @@ class GptOssDetector(BaseFormatDetector):
81
81
  # Always use HarmonyParser for parsing to ensure proper filtering
82
82
  events = self.harmony_parser.parse(new_text)
83
83
 
84
+ # If there are no parsed events and the chunk contains no Harmony structural
85
+ # markers, treat it as plain text and pass it through. This fixes a bug where
86
+ # normal content was held in the buffer when tools were provided but not used.
87
+ if not events:
88
+ has_harmony_markers = any(
89
+ marker in self._buffer
90
+ for marker in (
91
+ "<|start|>",
92
+ "<|channel|>",
93
+ "<|message|>",
94
+ "<|constrain|>",
95
+ "<|end|>",
96
+ "<|call|>",
97
+ "<|return|>",
98
+ "assistantfinal",
99
+ )
100
+ )
101
+ if not has_harmony_markers:
102
+ # Plain text with no tool markers — emit as normal content
103
+ out = self._buffer
104
+ self._buffer = ""
105
+ return StreamingParseResult(normal_text=out, calls=[])
106
+
84
107
  # Quick check if we might have tool calls
85
108
  if (
86
109
  "<|channel|>commentary to=" not in self._buffer
@@ -0,0 +1,61 @@
1
+ from typing import List
2
+
3
+ from sglang.srt.entrypoints.openai.protocol import Tool
4
+ from sglang.srt.function_call.base_format_detector import BaseFormatDetector
5
+ from sglang.srt.function_call.core_types import StreamingParseResult
6
+
7
+
8
class JsonArrayParser(BaseFormatDetector):
    """
    Detector for tool calls emitted as a plain JSON array.

    Used when a JSON schema constraint is active (tool_choice="required" or a
    specifically named tool), in which case model-specific parsers are
    bypassed and the output is parsed as a JSON array directly.
    """

    def __init__(self):
        super().__init__()
        # The array brackets act as the begin/end-of-tool-call tokens and a
        # comma separates consecutive calls inside the array.
        self.bot_token, self.eot_token = "[", "]"
        self.tool_call_separator = ","

    def has_tool_call(self, text: str) -> bool:
        """
        Return True when ``text`` contains an opening ``[`` or ``{``, i.e. the
        start of a JSON array or of a single JSON object.
        """
        return any(opener in text for opener in ("[", "{"))

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        One-shot parsing is not offered by this parser.

        Raises:
            NotImplementedError: always.
        """
        message = "Detect and parse not supported for JSON schema constraints."
        raise NotImplementedError(message)

    def build_ebnf(self, tools: List[Tool]) -> str:
        """
        EBNF grammars are not produced here; JSON schema constraints are
        handled by the constraint backends directly.

        Raises:
            NotImplementedError: always.
        """
        message = "EBNF generation is not supported for JSON schema constraints."
        raise NotImplementedError(message)

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """
        Incrementally parse ``new_text`` by delegating, unchanged, to the
        base-class streaming implementation.
        """
        result = super().parse_streaming_increment(new_text, tools)
        return result

    def structure_info(self) -> callable:
        """
        StructureInfo factories are not provided; JSON schema constraints are
        handled by the constraint backends directly.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("structure_info not used for JSON schema constraints")
@@ -50,6 +50,11 @@ class KimiK2Detector(BaseFormatDetector):
50
50
 
51
51
  self._last_arguments = ""
52
52
 
53
+ # Robust parser for ids like "functions.search:0" or fallback "search:0"
54
+ self.tool_call_id_regex = re.compile(
55
+ r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$"
56
+ )
57
+
53
58
  def has_tool_call(self, text: str) -> bool:
54
59
  """Check if the text contains a KimiK2 format tool call."""
55
60
  return self.bot_token in text
@@ -76,14 +81,18 @@ class KimiK2Detector(BaseFormatDetector):
76
81
  tool_calls = []
77
82
  for match in function_call_tuples:
78
83
  function_id, function_args = match
79
- function_name = function_id.split(".")[1].split(":")[0]
80
- function_idx = int(function_id.split(".")[1].split(":")[1])
84
+ m = self.tool_call_id_regex.match(function_id)
85
+ if not m:
86
+ logger.warning("Unexpected tool_call_id format: %s", function_id)
87
+ continue
88
+ function_name = m.group("name")
89
+ function_idx = int(m.group("index"))
81
90
 
82
91
  logger.info(f"function_name {function_name}")
83
92
 
84
93
  tool_calls.append(
85
94
  ToolCallItem(
86
- tool_index=function_idx, # Use the call index in the response, not tool position
95
+ tool_index=function_idx,
87
96
  name=function_name,
88
97
  parameters=function_args,
89
98
  )
@@ -128,7 +137,11 @@ class KimiK2Detector(BaseFormatDetector):
128
137
  function_id = match.group("tool_call_id")
129
138
  function_args = match.group("function_arguments")
130
139
 
131
- function_name = function_id.split(".")[1].split(":")[0]
140
+ m = self.tool_call_id_regex.match(function_id)
141
+ if not m:
142
+ logger.warning("Unexpected tool_call_id format: %s", function_id)
143
+ return StreamingParseResult(normal_text="", calls=calls)
144
+ function_name = m.group("name")
132
145
 
133
146
  # Initialize state if this is the first tool call
134
147
  if self.current_tool_id == -1:
@@ -1,10 +1,13 @@
1
- import json
2
1
  from json import JSONDecodeError, JSONDecoder
3
- from typing import Any, Tuple
2
+ from json.decoder import WHITESPACE
3
+ from typing import Any, List, Literal, Optional, Tuple, Union
4
4
 
5
+ import orjson
5
6
  import partial_json_parser
6
7
  from partial_json_parser.core.options import Allow
7
8
 
9
+ from sglang.srt.entrypoints.openai.protocol import Tool, ToolChoice
10
+
8
11
 
9
12
  def _find_common_prefix(s1: str, s2: str) -> str:
10
13
  prefix = ""
@@ -37,16 +40,104 @@ def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
37
40
  """
38
41
  try:
39
42
  return (partial_json_parser.loads(input_str, flags), len(input_str))
40
- except JSONDecodeError as e:
41
- if "Extra data" in e.msg:
42
- dec = JSONDecoder()
43
- return dec.raw_decode(input_str)
43
+ except (JSONDecodeError, IndexError) as e:
44
+ msg = getattr(e, "msg", str(e))
45
+ if "Extra data" in msg or "pop from empty list" in msg:
46
+ start = WHITESPACE.match(input_str, 0).end()
47
+ obj, end = JSONDecoder().raw_decode(input_str, start)
48
+ return obj, end
44
49
  raise
45
50
 
46
51
 
47
52
  def _is_complete_json(input_str: str) -> bool:
48
53
  try:
49
- json.loads(input_str)
54
+ orjson.loads(input_str)
50
55
  return True
51
56
  except JSONDecodeError:
52
57
  return False
58
+
59
+
60
def _get_tool_schema_defs(tools: List[Tool]) -> dict:
    """
    Merge the ``$defs`` sections of every tool's parameter schema.

    Args:
        tools: Tools whose ``function.parameters`` schemas are inspected.
            Tools without parameters (``None``) are ignored.

    Returns:
        A single dict mapping each definition name to its schema.

    Raises:
        ValueError: If two tools declare the same definition name with
            schemas that are not equal.
    """
    merged = {}
    for tool in tools:
        params = tool.function.parameters
        if params is None:
            continue
        for name, schema in params.get("$defs", {}).items():
            # Re-declaring a name is fine only when the schema is identical.
            if name in merged and merged[name] != schema:
                raise ValueError(
                    f"Tool definition '{name}' has multiple schemas, which is not supported."
                )
            merged[name] = schema
    return merged
88
+
89
+
90
def _get_tool_schema(tool: Tool) -> dict:
    """
    Build the schema fragment describing one call to ``tool``.

    The fragment requires a ``name`` property pinned to the tool's function
    name and a ``parameters`` property that follows the tool's own parameter
    schema. A tool with no (or an empty) parameter schema gets an empty
    object schema instead.
    """
    fn = tool.function
    name_schema = {"type": "string", "enum": [fn.name]}
    params_schema = fn.parameters or {"type": "object", "properties": {}}
    return {
        "properties": {"name": name_schema, "parameters": params_schema},
        "required": ["name", "parameters"],
    }
102
+
103
+
104
def get_json_schema_constraint(
    tools: List[Tool], tool_choice: Union[ToolChoice, Literal["required"]]
) -> Optional[dict]:
    """
    Get the JSON schema constraint for the specified tool choice.

    Args:
        tools: The tools available to the request.
        tool_choice: Either a ``ToolChoice`` naming one specific function, or
            the literal ``"required"``.

    Returns:
        A JSON schema dict describing an array of tool calls:
        exactly one call to the named tool for a ``ToolChoice``, or at least
        one call to any of ``tools`` for ``"required"``. Returns None when the
        named tool is not in ``tools`` or ``tool_choice`` is any other value.

    Raises:
        ValueError: If the contributing tools declare conflicting ``$defs``.
    """
    if isinstance(tool_choice, ToolChoice):
        # Constrain the output to exactly one call of the named function.
        fn_name = tool_choice.function.name
        selected = next((t for t in tools if t.function.name == fn_name), None)
        if selected is None:
            return None
        json_schema = {
            "type": "array",
            "minItems": 1,
            "maxItems": 1,
            "items": _get_tool_schema(selected),
        }
        contributing_tools = [selected]
    elif tool_choice == "required":
        json_schema = {
            "type": "array",
            "minItems": 1,
            "items": {
                "type": "object",
                "anyOf": [_get_tool_schema(tool) for tool in tools],
            },
        }
        contributing_tools = tools
    else:
        return None

    # The parameter schemas are nested below the array root, so any
    # "#/$defs/..." reference inside them resolves against the root. Hoist the
    # definitions there for both branches (previously only "required" did so).
    json_schema_defs = _get_tool_schema_defs(contributing_tools)
    if json_schema_defs:
        json_schema["$defs"] = json_schema_defs
    return json_schema
@@ -0,0 +1,245 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Compile protobuf files for SGLang gRPC server.
4
+
5
+ This script compiles .proto files to Python code using grpc_tools.protoc.
6
+ It generates:
7
+ - *_pb2.py (protobuf message classes)
8
+ - *_pb2_grpc.py (gRPC service classes)
9
+ - *_pb2.pyi (type hints for mypy/IDEs)
10
+
11
+ Usage:
12
+ python compile_proto.py [--check] [--proto-file PROTO_FILE]
13
+
14
+ Options:
15
+ --check Check if regeneration is needed (exit 1 if needed)
16
+ --proto-file Specify proto file (default: sglang_scheduler.proto)
17
+
18
+ ### Install Dependencies
19
+ pip install "grpcio==1.75.1" "grpcio-tools==1.75.1"
20
+
21
+ ### Run Script
22
+ cd python/sglang/srt/grpc
23
+ python compile_proto.py
24
+ """
25
+
26
+
27
+ import argparse
28
+ import subprocess
29
+ import sys
30
+ from importlib.metadata import version
31
+ from pathlib import Path
32
+
33
+ GRPC_VERSION = "1.75.1"
34
+
35
+
36
def get_file_mtime(path: Path) -> float:
    """Return the modification time of ``path``, or 0.0 if it does not exist."""
    try:
        stat_result = path.stat()
    except FileNotFoundError:
        return 0.0
    return stat_result.st_mtime
42
+
43
+
44
def check_regeneration_needed(proto_file: Path, output_dir: Path) -> bool:
    """Return True if any generated file is older than ``proto_file``.

    The generated files are the ``_pb2.py``, ``_pb2_grpc.py`` and ``_pb2.pyi``
    siblings of the proto stem inside ``output_dir``. A missing file counts as
    having modification time 0.
    """
    stem = proto_file.stem
    proto_mtime = get_file_mtime(proto_file)
    outputs = (
        output_dir / f"{stem}{suffix}"
        for suffix in ("_pb2.py", "_pb2_grpc.py", "_pb2.pyi")
    )
    return any(get_file_mtime(out) < proto_mtime for out in outputs)
59
+
60
+
61
def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> bool:
    """Compile ``proto_file`` to Python modules in ``output_dir``.

    Runs ``grpc_tools.protoc`` in a subprocess, checks that the three expected
    outputs exist, then rewrites the generated gRPC module's import.

    Returns:
        True on success; False if the proto file is missing, grpcio-tools is
        not importable, protoc fails, or an expected output is absent.

    Raises:
        RuntimeError: If the installed grpcio-tools or grpcio version differs
            from ``GRPC_VERSION``.
    """
    if not proto_file.exists():
        print(f"Error: Proto file not found: {proto_file}")
        return False

    if verbose:
        print(f"Found proto file: {proto_file}")

    # Probe for grpc_tools here so a missing install yields a readable hint
    # instead of a failing subprocess.
    try:
        import grpc_tools.protoc  # noqa: F401
    except ImportError:
        print("Error: grpcio-tools not installed")
        print(
            f'Install with: pip install "grpcio-tools=={GRPC_VERSION}" "grpcio=={GRPC_VERSION}"'
        )
        return False

    installed_tools, installed_grpc = version("grpcio-tools"), version("grpcio")
    if not (installed_tools == GRPC_VERSION and installed_grpc == GRPC_VERSION):
        raise RuntimeError(
            f"Error: grpcio-tools version {installed_tools} and grpcio version {installed_grpc} detected, but {GRPC_VERSION} is required."
        )

    proto_dir = proto_file.parent
    stem = proto_file.stem

    # protoc is invoked from the proto's directory with the bare file name.
    cmd = [
        sys.executable,
        "-m",
        "grpc_tools.protoc",
        f"-I{proto_dir}",
        f"--python_out={output_dir}",
        f"--grpc_python_out={output_dir}",
        f"--pyi_out={output_dir}",  # Generate type stubs
        str(proto_file.name),
    ]

    if verbose:
        print(f"Running: {' '.join(cmd)}")

    result = subprocess.run(cmd, capture_output=True, text=True, cwd=proto_dir)
    if result.returncode != 0:
        print("Error compiling proto:")
        print(result.stderr)
        if result.stdout:
            print(result.stdout)
        return False

    # A zero exit code is not trusted on its own: confirm every output exists.
    generated_files = [
        f"{stem}{suffix}" for suffix in ("_pb2.py", "_pb2_grpc.py", "_pb2.pyi")
    ]
    missing_files = [
        name for name in generated_files if not (output_dir / name).exists()
    ]
    if missing_files:
        print(f"Error: Expected generated files not found: {missing_files}")
        return False

    if verbose:
        print("Successfully compiled protobuf files:")
        for name in generated_files:
            print(f" - {output_dir}/{name}")

    fix_imports(output_dir, stem, verbose)
    return True
138
+
139
+
140
def fix_imports(output_dir: Path, proto_stem: str, verbose: bool = True) -> None:
    """Make the generated gRPC module import its ``_pb2`` sibling relatively.

    Rewrites top-level ``import <stem>_pb2`` statements in
    ``<stem>_pb2_grpc.py`` to ``from . import <stem>_pb2``. The match is
    anchored to the start of a line and bounded after ``_pb2``, so an already
    rewritten file is left untouched (the operation is idempotent) and names
    such as ``<stem>_pb2_grpc`` are not affected.

    Args:
        output_dir: Directory containing the generated files.
        proto_stem: Proto file name without its extension.
        verbose: Print a notice when the file was rewritten.
    """
    import re  # local import: not part of this module's top-level imports

    grpc_file = output_dir / f"{proto_stem}_pb2_grpc.py"
    if not grpc_file.exists():
        return

    content = grpc_file.read_text()
    pattern = re.compile(rf"^import {re.escape(proto_stem)}_pb2\b", re.MULTILINE)
    new_import = f"from . import {proto_stem}_pb2"
    # A callable replacement keeps the stem literal (no backslash expansion).
    fixed, count = pattern.subn(lambda _match: new_import, content)

    if count:
        grpc_file.write_text(fixed)
        if verbose:
            print("Fixed imports in generated files")
155
+
156
+
157
def add_generation_header(output_dir: Path, proto_stem: str) -> None:
    """Prepend an "auto-generated" banner to the generated Python modules.

    Only ``<stem>_pb2.py`` and ``<stem>_pb2_grpc.py`` are touched. Files that
    do not exist, or that already start with the banner, are left unchanged.
    """
    marker = "# This file is auto-generated"
    header = (
        "# This file is auto-generated. Do not edit manually.\n"
        "# Regenerate with: python compile_proto.py\n"
        "\n"
    )

    for suffix in ("_pb2.py", "_pb2_grpc.py"):
        target = output_dir / f"{proto_stem}{suffix}"
        if not target.exists():
            continue
        body = target.read_text()
        if body.startswith(marker):
            continue
        target.write_text(header + body)
172
+
173
+
174
def main():
    """Command-line entry point: check freshness or compile the proto file.

    With ``--check`` the process exits 1 when regeneration is needed and 0
    otherwise. Without it the proto file is compiled and, on success, the
    generated modules receive the auto-generated banner; on failure the
    process exits 1.
    """
    parser = argparse.ArgumentParser(
        description="Compile protobuf files for SGLang gRPC server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--check",
        action="store_true",
        help="Check if regeneration is needed (exit 1 if needed)",
    )
    parser.add_argument(
        "--proto-file",
        type=str,
        default="sglang_scheduler.proto",
        help="Proto file to compile (default: sglang_scheduler.proto)",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=True,
        help="Verbose output (default: True)",
    )
    parser.add_argument(
        "-q", "--quiet", action="store_true", help="Quiet mode (overrides verbose)"
    )
    args = parser.parse_args()

    # Quiet wins over verbose.
    verbose = args.verbose and not args.quiet

    # The proto file is looked up next to this script; output goes there too.
    script_dir = Path(__file__).parent
    proto_file = script_dir / args.proto_file
    output_dir = script_dir

    if args.check:
        stale = check_regeneration_needed(proto_file, output_dir)
        if verbose:
            print(
                "Proto files need regeneration"
                if stale
                else "Generated files are up to date"
            )
        sys.exit(1 if stale else 0)

    if not compile_proto(proto_file, output_dir, verbose):
        if verbose:
            print("\n❌ Protobuf compilation failed!")
        sys.exit(1)

    add_generation_header(output_dir, proto_file.stem)
    if verbose:
        print("\n✅ Protobuf compilation successful!")
        print("Generated files are ready for use")
242
+
243
+
244
+ if __name__ == "__main__":
245
+ main()