sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -3,10 +3,10 @@
3
3
  # Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
4
4
  # Slight differences in processing chat messages
5
5
  import datetime
6
- import json
7
6
  from collections.abc import Iterable
8
7
  from typing import Literal, Optional, Union
9
8
 
9
+ import orjson
10
10
  from openai.types.responses import (
11
11
  ResponseOutputItem,
12
12
  ResponseOutputMessage,
@@ -228,7 +228,7 @@ def parse_output_message(message: Message):
228
228
  if len(message.content) != 1:
229
229
  raise ValueError("Invalid number of contents in browser message")
230
230
  content = message.content[0]
231
- browser_call = json.loads(content.text)
231
+ browser_call = orjson.loads(content.text)
232
232
  # TODO: translate to url properly!
233
233
  if recipient == "browser.search":
234
234
  action = ActionSearch(
@@ -19,7 +19,6 @@ This file implements HTTP APIs for the inference engine via fastapi.
19
19
 
20
20
  import asyncio
21
21
  import dataclasses
22
- import json
23
22
  import logging
24
23
  import multiprocessing as multiprocessing
25
24
  import os
@@ -51,20 +50,28 @@ from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationM
51
50
  from sglang.srt.entrypoints.engine import _launch_subprocesses
52
51
  from sglang.srt.entrypoints.openai.protocol import (
53
52
  ChatCompletionRequest,
53
+ ClassifyRequest,
54
54
  CompletionRequest,
55
+ DetokenizeRequest,
55
56
  EmbeddingRequest,
56
57
  ErrorResponse,
57
58
  ModelCard,
58
59
  ModelList,
59
60
  ResponsesRequest,
60
61
  ScoringRequest,
62
+ TokenizeRequest,
61
63
  V1RerankReqInput,
62
64
  )
63
65
  from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
66
+ from sglang.srt.entrypoints.openai.serving_classify import OpenAIServingClassify
64
67
  from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
65
68
  from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
66
69
  from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
67
70
  from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
71
+ from sglang.srt.entrypoints.openai.serving_tokenize import (
72
+ OpenAIServingDetokenize,
73
+ OpenAIServingTokenize,
74
+ )
68
75
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
69
76
  from sglang.srt.managers.io_struct import (
70
77
  AbortReq,
@@ -89,6 +96,7 @@ from sglang.srt.managers.io_struct import (
89
96
  UnloadLoRAAdapterReqInput,
90
97
  UpdateWeightFromDiskReqInput,
91
98
  UpdateWeightsFromDistributedReqInput,
99
+ UpdateWeightsFromIPCReqInput,
92
100
  UpdateWeightsFromTensorReqInput,
93
101
  UpdateWeightVersionReqInput,
94
102
  VertexGenerateReqInput,
@@ -122,6 +130,7 @@ logger = logging.getLogger(__name__)
122
130
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
123
131
 
124
132
  HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
133
+ WAIT_WEIGHTS_READY_TIMEOUT = int(os.getenv("SGLANG_WAIT_WEIGHTS_READY_TIMEOUT", 120))
125
134
 
126
135
 
127
136
  # Store global states
@@ -142,15 +151,14 @@ def set_global_state(global_state: _GlobalState):
142
151
 
143
152
  async def init_multi_tokenizer() -> ServerArgs:
144
153
  """Read args information from shm and init tokenizer manager for current process"""
145
- pid = os.getpid()
146
- main_pid = get_main_process_id()
147
- logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
148
154
 
149
155
  # Read configuration from shared memory
156
+ main_pid = get_main_process_id()
150
157
  port_args, server_args, scheduler_info = read_from_shared_memory(
151
158
  f"multi_tokenizer_args_{main_pid}"
152
159
  )
153
160
  server_args: ServerArgs
161
+ port_args: PortArgs
154
162
 
155
163
  # API key authentication is not supported in multi-tokenizer mode
156
164
  assert (
@@ -160,6 +168,10 @@ async def init_multi_tokenizer() -> ServerArgs:
160
168
  port_args.tokenizer_ipc_name = (
161
169
  f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
162
170
  )
171
+ logger.info(
172
+ f"Start multi-tokenizer worker process {os.getpid()}, "
173
+ f"ipc_name={port_args.tokenizer_ipc_name}"
174
+ )
163
175
 
164
176
  # Launch multi-tokenizer manager process
165
177
  tokenizer_manager = TokenizerWorker(server_args, port_args)
@@ -170,8 +182,6 @@ async def init_multi_tokenizer() -> ServerArgs:
170
182
  chat_template=server_args.chat_template,
171
183
  completion_template=server_args.completion_template,
172
184
  )
173
- # Register this tokenizer with the main tokenizer manager
174
- await tokenizer_manager.register_to_main_tokenizer_manager()
175
185
 
176
186
  tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
177
187
  set_global_state(
@@ -223,12 +233,21 @@ async def lifespan(fast_api_app: FastAPI):
223
233
  fast_api_app.state.openai_serving_embedding = OpenAIServingEmbedding(
224
234
  _global_state.tokenizer_manager, _global_state.template_manager
225
235
  )
236
+ fast_api_app.state.openai_serving_classify = OpenAIServingClassify(
237
+ _global_state.tokenizer_manager, _global_state.template_manager
238
+ )
226
239
  fast_api_app.state.openai_serving_score = OpenAIServingScore(
227
240
  _global_state.tokenizer_manager
228
241
  )
229
242
  fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
230
243
  _global_state.tokenizer_manager
231
244
  )
245
+ fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
246
+ _global_state.tokenizer_manager
247
+ )
248
+ fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
249
+ _global_state.tokenizer_manager
250
+ )
232
251
 
233
252
  server_args: ServerArgs = fast_api_app.server_args
234
253
 
@@ -494,7 +513,7 @@ async def get_load():
494
513
 
495
514
 
496
515
  # example usage:
497
- # curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
516
+ # curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
498
517
  @app.api_route("/set_internal_state", methods=["POST", "PUT"])
499
518
  async def set_internal_state(obj: SetInternalStateReq, request: Request):
500
519
  res = await _global_state.tokenizer_manager.set_internal_state(obj)
@@ -543,7 +562,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
543
562
  async def generate_from_file_request(file: UploadFile, request: Request):
544
563
  """Handle a generate request, this is purely to work with input_embeds."""
545
564
  content = await file.read()
546
- input_embeds = json.loads(content.decode("utf-8"))
565
+ input_embeds = orjson.loads(content.decode("utf-8"))
547
566
 
548
567
  obj = GenerateReqInput(
549
568
  input_embeds=input_embeds,
@@ -622,6 +641,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
622
641
  with_stack=obj.with_stack,
623
642
  record_shapes=obj.record_shapes,
624
643
  profile_by_stage=obj.profile_by_stage,
644
+ merge_profiles=obj.merge_profiles,
625
645
  )
626
646
  return Response(
627
647
  content="Start profiling.\n",
@@ -820,6 +840,27 @@ async def update_weights_from_distributed(
820
840
  return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
821
841
 
822
842
 
843
+ @app.post("/update_weights_from_ipc")
844
+ async def update_weights_from_ipc(obj: UpdateWeightsFromIPCReqInput, request: Request):
845
+ """Update the weights from IPC (Inter-Process Communication) for checkpoint-engine integration."""
846
+ success, message = await _global_state.tokenizer_manager.update_weights_from_ipc(
847
+ obj, request
848
+ )
849
+
850
+ # Update weight version if provided and weights update was successful
851
+ if success and obj.weight_version is not None:
852
+ _update_weight_version_if_provided(obj.weight_version)
853
+ message += f" Weight version updated to {obj.weight_version}."
854
+
855
+ content = {"success": success, "message": message}
856
+ if success:
857
+ if _global_state.tokenizer_manager.initial_weights_loaded is False:
858
+ _global_state.tokenizer_manager.initial_weights_loaded = True
859
+ return ORJSONResponse(content)
860
+ else:
861
+ return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
862
+
863
+
823
864
  @app.post("/update_weight_version")
824
865
  async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
825
866
  """Update the weight version. This operation requires no active requests."""
@@ -1070,6 +1111,54 @@ async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
1070
1111
  )
1071
1112
 
1072
1113
 
1114
+ @app.post(
1115
+ "/v1/classify",
1116
+ response_class=ORJSONResponse,
1117
+ dependencies=[Depends(validate_json_request)],
1118
+ )
1119
+ async def openai_v1_classify(request: ClassifyRequest, raw_request: Request):
1120
+ """OpenAI-compatible classification endpoint."""
1121
+ return await raw_request.app.state.openai_serving_classify.handle_request(
1122
+ request, raw_request
1123
+ )
1124
+
1125
+
1126
+ @app.post(
1127
+ "/v1/tokenize",
1128
+ response_class=ORJSONResponse,
1129
+ dependencies=[Depends(validate_json_request)],
1130
+ )
1131
+ @app.post(
1132
+ "/tokenize",
1133
+ response_class=ORJSONResponse,
1134
+ dependencies=[Depends(validate_json_request)],
1135
+ include_in_schema=False,
1136
+ )
1137
+ async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
1138
+ """OpenAI-compatible tokenization endpoint."""
1139
+ return await raw_request.app.state.openai_serving_tokenize.handle_request(
1140
+ request, raw_request
1141
+ )
1142
+
1143
+
1144
+ @app.post(
1145
+ "/v1/detokenize",
1146
+ response_class=ORJSONResponse,
1147
+ dependencies=[Depends(validate_json_request)],
1148
+ )
1149
+ @app.post(
1150
+ "/detokenize",
1151
+ response_class=ORJSONResponse,
1152
+ dependencies=[Depends(validate_json_request)],
1153
+ include_in_schema=False,
1154
+ )
1155
+ async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
1156
+ """OpenAI-compatible detokenization endpoint."""
1157
+ return await raw_request.app.state.openai_serving_detokenize.handle_request(
1158
+ request, raw_request
1159
+ )
1160
+
1161
+
1073
1162
  @app.get("/v1/models", response_class=ORJSONResponse)
1074
1163
  async def available_models():
1075
1164
  """Show available models. OpenAI-compatible endpoint."""
@@ -1464,6 +1553,8 @@ def _wait_and_warmup(
1464
1553
  pipe_finish_writer: Optional[multiprocessing.connection.Connection],
1465
1554
  launch_callback: Optional[Callable[[], None]] = None,
1466
1555
  ):
1556
+ if server_args.checkpoint_engine_wait_weights_before_ready:
1557
+ _wait_weights_ready()
1467
1558
  if not server_args.skip_server_warmup:
1468
1559
  if not _execute_server_warmup(
1469
1560
  server_args,
@@ -1486,3 +1577,24 @@ def _wait_and_warmup(
1486
1577
 
1487
1578
  if launch_callback is not None:
1488
1579
  launch_callback()
1580
+
1581
+
1582
+ def _wait_weights_ready():
1583
+ """Wait for weights to be ready within the specified timeout."""
1584
+ timeout = WAIT_WEIGHTS_READY_TIMEOUT
1585
+ start_time = time.time()
1586
+
1587
+ for _ in range(timeout):
1588
+ if _global_state.tokenizer_manager.initial_weights_loaded:
1589
+ logger.info(
1590
+ f"Weights are ready after {time.time() - start_time:.2f} seconds"
1591
+ )
1592
+ return
1593
+ time.sleep(1)
1594
+
1595
+ # Timeout reached without weights being ready
1596
+ logger.error(
1597
+ f"Weights are not ready after waiting {timeout} seconds. "
1598
+ f"Consider increasing SGLANG_WAIT_WEIGHTS_READY_TIMEOUT environment variable. "
1599
+ f"Current status: initial_weights_loaded={_global_state.tokenizer_manager.initial_weights_loaded}"
1600
+ )
@@ -1,15 +1,9 @@
1
- import copy
2
- import dataclasses
3
1
  import multiprocessing
4
- import pickle
5
- import threading
6
2
  import time
7
- from typing import Any, Dict, List, Optional, Tuple, Union
3
+ from typing import List, Optional, Tuple
8
4
 
9
- import pybase64
10
5
  import requests
11
6
  import torch
12
- import torch.distributed as dist
13
7
 
14
8
  from sglang.srt.entrypoints.EngineBase import EngineBase
15
9
  from sglang.srt.entrypoints.http_server import launch_server
@@ -13,10 +13,11 @@
13
13
  # ==============================================================================
14
14
  """Pydantic models for OpenAI API protocol"""
15
15
 
16
+ import logging
16
17
  import time
17
18
  import uuid
18
19
  from dataclasses import dataclass
19
- from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union
20
+ from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypeAlias, Union
20
21
 
21
22
  from openai.types.responses import (
22
23
  ResponseFunctionToolCall,
@@ -36,6 +37,11 @@ from pydantic import (
36
37
  model_validator,
37
38
  )
38
39
  from typing_extensions import Literal
40
+ from xgrammar import StructuralTag
41
+
42
+ from sglang.utils import convert_json_schema_to_str
43
+
44
+ logger = logging.getLogger(__name__)
39
45
 
40
46
  DEFAULT_MODEL_NAME = "default"
41
47
 
@@ -123,12 +129,23 @@ class StructuresResponseFormat(BaseModel):
123
129
  end: str
124
130
 
125
131
 
126
- class StructuralTagResponseFormat(BaseModel):
132
+ # NOTE(dark): keep this for backward compatibility
133
+ class LegacyStructuralTagResponseFormat(BaseModel):
127
134
  type: Literal["structural_tag"]
128
135
  structures: List[StructuresResponseFormat]
129
136
  triggers: List[str]
130
137
 
131
138
 
139
+ StructuralTagResponseFormat: TypeAlias = Union[
140
+ LegacyStructuralTagResponseFormat, StructuralTag
141
+ ]
142
+
143
+ ToolCallConstraint: TypeAlias = Union[
144
+ Tuple[Literal["structural_tag"], StructuralTagResponseFormat],
145
+ Tuple[Literal["json_schema"], Any], # json_schema can be dict/str/None
146
+ ]
147
+
148
+
132
149
  class FileRequest(BaseModel):
133
150
  # https://platform.openai.com/docs/api-reference/files/create
134
151
  file: bytes # The File object (not file name) to be uploaded
@@ -187,7 +204,10 @@ class BatchResponse(BaseModel):
187
204
  class CompletionRequest(BaseModel):
188
205
  # Ordered by official OpenAI API documentation
189
206
  # https://platform.openai.com/docs/api-reference/completions/create
190
- model: str = DEFAULT_MODEL_NAME
207
+ model: str = Field(
208
+ default=DEFAULT_MODEL_NAME,
209
+ description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
210
+ )
191
211
  prompt: Union[List[int], List[List[int]], str, List[str]]
192
212
  best_of: Optional[int] = None
193
213
  echo: bool = False
@@ -216,12 +236,15 @@ class CompletionRequest(BaseModel):
216
236
  ebnf: Optional[str] = None
217
237
  repetition_penalty: float = 1.0
218
238
  stop_token_ids: Optional[List[int]] = None
239
+ stop_regex: Optional[Union[str, List[str]]] = None
219
240
  no_stop_trim: bool = False
220
241
  ignore_eos: bool = False
221
242
  skip_special_tokens: bool = True
222
243
  lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
223
244
  session_params: Optional[Dict] = None
224
245
  response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
246
+ custom_params: Optional[Dict] = None
247
+ custom_logit_processor: Optional[str] = None
225
248
 
226
249
  # For PD disaggregation
227
250
  bootstrap_host: Optional[Union[List[str], str]] = None
@@ -423,7 +446,10 @@ class ChatCompletionRequest(BaseModel):
423
446
  # Ordered by official OpenAI API documentation
424
447
  # https://platform.openai.com/docs/api-reference/chat/create
425
448
  messages: List[ChatCompletionMessageParam]
426
- model: str = DEFAULT_MODEL_NAME
449
+ model: str = Field(
450
+ default=DEFAULT_MODEL_NAME,
451
+ description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
452
+ )
427
453
  frequency_penalty: float = 0.0
428
454
  logit_bias: Optional[Dict[str, float]] = None
429
455
  logprobs: bool = False
@@ -445,8 +471,8 @@ class ChatCompletionRequest(BaseModel):
445
471
  stop: Optional[Union[str, List[str]]] = None
446
472
  stream: bool = False
447
473
  stream_options: Optional[StreamOptions] = None
448
- temperature: float = 0.7
449
- top_p: float = 1.0
474
+ temperature: Optional[float] = None
475
+ top_p: Optional[float] = None
450
476
  user: Optional[str] = None
451
477
  tools: Optional[List[Tool]] = Field(default=None, examples=[None])
452
478
  tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
@@ -461,6 +487,52 @@ class ChatCompletionRequest(BaseModel):
461
487
  "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
462
488
  )
463
489
 
490
+ # Extra parameters for SRT backend only and will be ignored by OpenAI models.
491
+ top_k: Optional[int] = None
492
+ min_p: Optional[float] = None
493
+ min_tokens: int = 0
494
+ regex: Optional[str] = None
495
+ ebnf: Optional[str] = None
496
+ repetition_penalty: Optional[float] = None
497
+ stop_token_ids: Optional[List[int]] = None
498
+ stop_regex: Optional[Union[str, List[str]]] = None
499
+ no_stop_trim: bool = False
500
+ ignore_eos: bool = False
501
+ continue_final_message: bool = False
502
+ skip_special_tokens: bool = True
503
+ lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
504
+ session_params: Optional[Dict] = None
505
+ separate_reasoning: bool = True
506
+ stream_reasoning: bool = True
507
+ chat_template_kwargs: Optional[Dict] = None
508
+
509
+ # Custom logit processor for advanced sampling control
510
+ custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
511
+ custom_params: Optional[Dict] = None
512
+
513
+ # For request id
514
+ rid: Optional[Union[List[str], str]] = None
515
+ # Extra key for classifying the request (e.g. cache_salt)
516
+ extra_key: Optional[Union[List[str], str]] = None
517
+ # Cache salt for request caching
518
+ cache_salt: Optional[Union[List[str], str]] = None
519
+ # Priority for the request
520
+ priority: Optional[int] = None
521
+
522
+ # For PD disaggregation
523
+ bootstrap_host: Optional[Union[List[str], str]] = None
524
+ bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
525
+ bootstrap_room: Optional[Union[List[int], int]] = None
526
+
527
+ # OpenAI/SGLang default sampling parameters
528
+ _DEFAULT_SAMPLING_PARAMS = {
529
+ "temperature": 1.0,
530
+ "top_p": 1.0,
531
+ "top_k": -1,
532
+ "min_p": 0.0,
533
+ "repetition_penalty": 1.0,
534
+ }
535
+
464
536
  @model_validator(mode="before")
465
537
  @classmethod
466
538
  def set_tool_choice_default(cls, values):
@@ -531,37 +603,83 @@ class ChatCompletionRequest(BaseModel):
531
603
 
532
604
  return values
533
605
 
534
- # Extra parameters for SRT backend only and will be ignored by OpenAI models.
535
- top_k: int = -1
536
- min_p: float = 0.0
537
- min_tokens: int = 0
538
- regex: Optional[str] = None
539
- ebnf: Optional[str] = None
540
- repetition_penalty: float = 1.0
541
- stop_token_ids: Optional[List[int]] = None
542
- no_stop_trim: bool = False
543
- ignore_eos: bool = False
544
- continue_final_message: bool = False
545
- skip_special_tokens: bool = True
546
- lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
547
- session_params: Optional[Dict] = None
548
- separate_reasoning: bool = True
549
- stream_reasoning: bool = True
550
- chat_template_kwargs: Optional[Dict] = None
606
+ def to_sampling_params(
607
+ self,
608
+ stop: List[str],
609
+ model_generation_config: Dict[str, Any],
610
+ tool_call_constraint: Optional[ToolCallConstraint] = None,
611
+ ) -> Dict[str, Any]:
612
+ """
613
+ Convert request to sampling parameters.
614
+ Priority: user value > model generation_config > OpenAI defaults
615
+ """
616
+
617
+ def get_param(param_name: str):
618
+ value = getattr(self, param_name)
619
+ if value is None:
620
+ return model_generation_config.get(
621
+ param_name, self._DEFAULT_SAMPLING_PARAMS[param_name]
622
+ )
623
+ return value
624
+
625
+ sampling_params = {
626
+ "temperature": get_param("temperature"),
627
+ "max_new_tokens": self.max_tokens or self.max_completion_tokens,
628
+ "min_new_tokens": self.min_tokens,
629
+ "stop": stop,
630
+ "stop_token_ids": self.stop_token_ids,
631
+ "stop_regex": self.stop_regex,
632
+ "top_p": get_param("top_p"),
633
+ "top_k": get_param("top_k"),
634
+ "min_p": get_param("min_p"),
635
+ "presence_penalty": self.presence_penalty,
636
+ "frequency_penalty": self.frequency_penalty,
637
+ "repetition_penalty": get_param("repetition_penalty"),
638
+ "regex": self.regex,
639
+ "ebnf": self.ebnf,
640
+ "n": self.n,
641
+ "no_stop_trim": self.no_stop_trim,
642
+ "ignore_eos": self.ignore_eos,
643
+ "skip_special_tokens": self.skip_special_tokens,
644
+ "logit_bias": self.logit_bias,
645
+ "custom_params": self.custom_params,
646
+ }
551
647
 
552
- # For request id
553
- rid: Optional[Union[List[str], str]] = None
554
- # Extra key for classifying the request (e.g. cache_salt)
555
- extra_key: Optional[Union[List[str], str]] = None
556
- # Cache salt for request caching
557
- cache_salt: Optional[Union[List[str], str]] = None
558
- # Priority for the request
559
- priority: Optional[int] = None
648
+ if self.response_format and self.response_format.type == "json_schema":
649
+ sampling_params["json_schema"] = convert_json_schema_to_str(
650
+ self.response_format.json_schema.schema_
651
+ )
652
+ elif self.response_format and self.response_format.type == "json_object":
653
+ sampling_params["json_schema"] = '{"type": "object"}'
654
+ elif self.response_format and self.response_format.type == "structural_tag":
655
+ sampling_params["structural_tag"] = convert_json_schema_to_str(
656
+ self.response_format.model_dump(by_alias=True)
657
+ )
560
658
 
561
- # For PD disaggregation
562
- bootstrap_host: Optional[Union[List[str], str]] = None
563
- bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
564
- bootstrap_room: Optional[Union[List[int], int]] = None
659
+ # Check if there are already existing output constraints
660
+ has_existing_constraints = (
661
+ sampling_params.get("regex")
662
+ or sampling_params.get("ebnf")
663
+ or sampling_params.get("structural_tag")
664
+ or sampling_params.get("json_schema")
665
+ )
666
+
667
+ if tool_call_constraint and has_existing_constraints:
668
+ logger.warning("Constrained decoding is not compatible with tool calls.")
669
+ elif tool_call_constraint:
670
+ constraint_type, constraint_value = tool_call_constraint
671
+ if constraint_type == "structural_tag":
672
+ sampling_params[constraint_type] = convert_json_schema_to_str(
673
+ constraint_value.model_dump(by_alias=True)
674
+ )
675
+ elif constraint_type == "json_schema":
676
+ sampling_params[constraint_type] = convert_json_schema_to_str(
677
+ constraint_value # type: ignore
678
+ )
679
+ else:
680
+ sampling_params[constraint_type] = constraint_value
681
+
682
+ return sampling_params
565
683
 
566
684
 
567
685
  class ChatMessage(BaseModel):
@@ -668,6 +786,37 @@ class EmbeddingObject(BaseModel):
668
786
  object: str = "embedding"
669
787
 
670
788
 
789
+ ClassifyInput = Union[str, List[str], List[int]]
790
+
791
+
792
+ class ClassifyRequest(BaseModel):
793
+ # OpenAI-compatible classification request
794
+ model: str = DEFAULT_MODEL_NAME
795
+ input: ClassifyInput
796
+ user: Optional[str] = None
797
+
798
+ # The request id.
799
+ rid: Optional[Union[List[str], str]] = None
800
+ # Priority for the request
801
+ priority: Optional[int] = None
802
+
803
+
804
+ class ClassifyData(BaseModel):
805
+ index: int
806
+ label: str
807
+ probs: List[float]
808
+ num_classes: int
809
+
810
+
811
+ class ClassifyResponse(BaseModel):
812
+ id: str
813
+ object: str = "list"
814
+ created: int
815
+ model: str
816
+ data: List[ClassifyData]
817
+ usage: UsageInfo
818
+
819
+
671
820
  class EmbeddingResponse(BaseModel):
672
821
  data: List[EmbeddingObject]
673
822
  model: str
@@ -711,12 +860,51 @@ class RerankResponse(BaseModel):
711
860
  meta_info: Optional[dict] = None
712
861
 
713
862
 
863
+ class TokenizeRequest(BaseModel):
864
+ """Request schema for the /tokenize endpoint."""
865
+
866
+ model: str = DEFAULT_MODEL_NAME
867
+ prompt: Union[str, List[str]]
868
+ add_special_tokens: bool = Field(
869
+ default=True,
870
+ description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
871
+ )
872
+
873
+
874
+ class TokenizeResponse(BaseModel):
875
+ """Response schema for the /tokenize endpoint."""
876
+
877
+ tokens: Union[List[int], List[List[int]]]
878
+ count: Union[int, List[int]]
879
+ max_model_len: int
880
+
881
+
882
+ class DetokenizeRequest(BaseModel):
883
+ """Request schema for the /detokenize endpoint."""
884
+
885
+ model: str = DEFAULT_MODEL_NAME
886
+ tokens: Union[List[int], List[List[int]]]
887
+ skip_special_tokens: bool = Field(
888
+ default=True,
889
+ description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
890
+ )
891
+
892
+
893
+ class DetokenizeResponse(BaseModel):
894
+ """Response schema for the /detokenize endpoint."""
895
+
896
+ text: Union[str, List[str]]
897
+
898
+
714
899
  OpenAIServingRequest = Union[
715
900
  ChatCompletionRequest,
716
901
  CompletionRequest,
717
902
  EmbeddingRequest,
903
+ ClassifyRequest,
718
904
  ScoringRequest,
719
905
  V1RerankReqInput,
906
+ TokenizeRequest,
907
+ DetokenizeRequest,
720
908
  ]
721
909
 
722
910
 
@@ -924,7 +1112,7 @@ class ResponsesResponse(BaseModel):
924
1112
  Union[
925
1113
  ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
926
1114
  ]
927
- ]
1115
+ ],
928
1116
  ) -> bool:
929
1117
  if not items:
930
1118
  return False
@@ -1014,7 +1202,7 @@ class MessageProcessingResult:
1014
1202
  video_data: Optional[Any]
1015
1203
  modalities: List[str]
1016
1204
  stop: List[str]
1017
- tool_call_constraint: Optional[Any] = None
1205
+ tool_call_constraint: Optional[ToolCallConstraint] = None
1018
1206
 
1019
1207
 
1020
1208
  class ToolCallProcessingResult(NamedTuple):