sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -13,15 +13,18 @@
13
13
  # ==============================================================================
14
14
  """Pydantic models for OpenAI API protocol"""
15
15
 
16
+ import logging
16
17
  import time
17
18
  import uuid
18
19
  from dataclasses import dataclass
19
- from typing import Any, Dict, List, Optional, TypeAlias, Union
20
+ from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypeAlias, Union
20
21
 
21
22
  from openai.types.responses import (
22
23
  ResponseFunctionToolCall,
23
24
  ResponseInputItemParam,
24
25
  ResponseOutputItem,
26
+ ResponseOutputMessage,
27
+ ResponseOutputText,
25
28
  ResponseReasoningItem,
26
29
  )
27
30
  from openai.types.responses.response import ToolChoice
@@ -34,6 +37,11 @@ from pydantic import (
34
37
  model_validator,
35
38
  )
36
39
  from typing_extensions import Literal
40
+ from xgrammar import StructuralTag
41
+
42
+ from sglang.utils import convert_json_schema_to_str
43
+
44
+ logger = logging.getLogger(__name__)
37
45
 
38
46
  DEFAULT_MODEL_NAME = "default"
39
47
 
@@ -121,12 +129,23 @@ class StructuresResponseFormat(BaseModel):
121
129
  end: str
122
130
 
123
131
 
124
- class StructuralTagResponseFormat(BaseModel):
132
+ # NOTE(dark): keep this for backward compatibility
133
+ class LegacyStructuralTagResponseFormat(BaseModel):
125
134
  type: Literal["structural_tag"]
126
135
  structures: List[StructuresResponseFormat]
127
136
  triggers: List[str]
128
137
 
129
138
 
139
+ StructuralTagResponseFormat: TypeAlias = Union[
140
+ LegacyStructuralTagResponseFormat, StructuralTag
141
+ ]
142
+
143
+ ToolCallConstraint: TypeAlias = Union[
144
+ Tuple[Literal["structural_tag"], StructuralTagResponseFormat],
145
+ Tuple[Literal["json_schema"], Any], # json_schema can be dict/str/None
146
+ ]
147
+
148
+
130
149
  class FileRequest(BaseModel):
131
150
  # https://platform.openai.com/docs/api-reference/files/create
132
151
  file: bytes # The File object (not file name) to be uploaded
@@ -185,7 +204,10 @@ class BatchResponse(BaseModel):
185
204
  class CompletionRequest(BaseModel):
186
205
  # Ordered by official OpenAI API documentation
187
206
  # https://platform.openai.com/docs/api-reference/completions/create
188
- model: str = DEFAULT_MODEL_NAME
207
+ model: str = Field(
208
+ default=DEFAULT_MODEL_NAME,
209
+ description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
210
+ )
189
211
  prompt: Union[List[int], List[List[int]], str, List[str]]
190
212
  best_of: Optional[int] = None
191
213
  echo: bool = False
@@ -214,12 +236,15 @@ class CompletionRequest(BaseModel):
214
236
  ebnf: Optional[str] = None
215
237
  repetition_penalty: float = 1.0
216
238
  stop_token_ids: Optional[List[int]] = None
239
+ stop_regex: Optional[Union[str, List[str]]] = None
217
240
  no_stop_trim: bool = False
218
241
  ignore_eos: bool = False
219
242
  skip_special_tokens: bool = True
220
243
  lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
221
244
  session_params: Optional[Dict] = None
222
245
  response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
246
+ custom_params: Optional[Dict] = None
247
+ custom_logit_processor: Optional[str] = None
223
248
 
224
249
  # For PD disaggregation
225
250
  bootstrap_host: Optional[Union[List[str], str]] = None
@@ -228,9 +253,15 @@ class CompletionRequest(BaseModel):
228
253
 
229
254
  # For request id
230
255
  rid: Optional[Union[List[str], str]] = None
256
+ # Extra key for classifying the request (e.g. cache_salt)
257
+ extra_key: Optional[Union[List[str], str]] = None
258
+ # Cache salt for request caching
259
+ cache_salt: Optional[Union[List[str], str]] = None
260
+ # Priority for the request
261
+ priority: Optional[int] = None
231
262
 
232
- # For customer metric labels
233
- customer_labels: Optional[Dict[str, str]] = None
263
+ # For custom metric labels
264
+ custom_labels: Optional[Dict[str, str]] = None
234
265
 
235
266
  @field_validator("max_tokens")
236
267
  @classmethod
@@ -337,7 +368,7 @@ class FunctionResponse(BaseModel):
337
368
  """Function response."""
338
369
 
339
370
  name: Optional[str] = None
340
- arguments: Optional[str] = None
371
+ arguments: Optional[str | Dict[str, Any]] = None
341
372
 
342
373
 
343
374
  class ToolCall(BaseModel):
@@ -386,7 +417,7 @@ class Function(BaseModel):
386
417
  """Function descriptions."""
387
418
 
388
419
  description: Optional[str] = Field(default=None, examples=[None])
389
- name: Optional[str] = None
420
+ name: str
390
421
  parameters: Optional[object] = None
391
422
  strict: bool = False
392
423
 
@@ -415,7 +446,10 @@ class ChatCompletionRequest(BaseModel):
415
446
  # Ordered by official OpenAI API documentation
416
447
  # https://platform.openai.com/docs/api-reference/chat/create
417
448
  messages: List[ChatCompletionMessageParam]
418
- model: str = DEFAULT_MODEL_NAME
449
+ model: str = Field(
450
+ default=DEFAULT_MODEL_NAME,
451
+ description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
452
+ )
419
453
  frequency_penalty: float = 0.0
420
454
  logit_bias: Optional[Dict[str, float]] = None
421
455
  logprobs: bool = False
@@ -437,8 +471,8 @@ class ChatCompletionRequest(BaseModel):
437
471
  stop: Optional[Union[str, List[str]]] = None
438
472
  stream: bool = False
439
473
  stream_options: Optional[StreamOptions] = None
440
- temperature: float = 0.7
441
- top_p: float = 1.0
474
+ temperature: Optional[float] = None
475
+ top_p: Optional[float] = None
442
476
  user: Optional[str] = None
443
477
  tools: Optional[List[Tool]] = Field(default=None, examples=[None])
444
478
  tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
@@ -453,6 +487,52 @@ class ChatCompletionRequest(BaseModel):
453
487
  "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
454
488
  )
455
489
 
490
+ # Extra parameters for SRT backend only and will be ignored by OpenAI models.
491
+ top_k: Optional[int] = None
492
+ min_p: Optional[float] = None
493
+ min_tokens: int = 0
494
+ regex: Optional[str] = None
495
+ ebnf: Optional[str] = None
496
+ repetition_penalty: Optional[float] = None
497
+ stop_token_ids: Optional[List[int]] = None
498
+ stop_regex: Optional[Union[str, List[str]]] = None
499
+ no_stop_trim: bool = False
500
+ ignore_eos: bool = False
501
+ continue_final_message: bool = False
502
+ skip_special_tokens: bool = True
503
+ lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
504
+ session_params: Optional[Dict] = None
505
+ separate_reasoning: bool = True
506
+ stream_reasoning: bool = True
507
+ chat_template_kwargs: Optional[Dict] = None
508
+
509
+ # Custom logit processor for advanced sampling control
510
+ custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
511
+ custom_params: Optional[Dict] = None
512
+
513
+ # For request id
514
+ rid: Optional[Union[List[str], str]] = None
515
+ # Extra key for classifying the request (e.g. cache_salt)
516
+ extra_key: Optional[Union[List[str], str]] = None
517
+ # Cache salt for request caching
518
+ cache_salt: Optional[Union[List[str], str]] = None
519
+ # Priority for the request
520
+ priority: Optional[int] = None
521
+
522
+ # For PD disaggregation
523
+ bootstrap_host: Optional[Union[List[str], str]] = None
524
+ bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
525
+ bootstrap_room: Optional[Union[List[int], int]] = None
526
+
527
+ # OpenAI/SGLang default sampling parameters
528
+ _DEFAULT_SAMPLING_PARAMS = {
529
+ "temperature": 1.0,
530
+ "top_p": 1.0,
531
+ "top_k": -1,
532
+ "min_p": 0.0,
533
+ "repetition_penalty": 1.0,
534
+ }
535
+
456
536
  @model_validator(mode="before")
457
537
  @classmethod
458
538
  def set_tool_choice_default(cls, values):
@@ -523,31 +603,83 @@ class ChatCompletionRequest(BaseModel):
523
603
 
524
604
  return values
525
605
 
526
- # Extra parameters for SRT backend only and will be ignored by OpenAI models.
527
- top_k: int = -1
528
- min_p: float = 0.0
529
- min_tokens: int = 0
530
- regex: Optional[str] = None
531
- ebnf: Optional[str] = None
532
- repetition_penalty: float = 1.0
533
- stop_token_ids: Optional[List[int]] = None
534
- no_stop_trim: bool = False
535
- ignore_eos: bool = False
536
- continue_final_message: bool = False
537
- skip_special_tokens: bool = True
538
- lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
539
- session_params: Optional[Dict] = None
540
- separate_reasoning: bool = True
541
- stream_reasoning: bool = True
542
- chat_template_kwargs: Optional[Dict] = None
606
def to_sampling_params(
    self,
    stop: List[str],
    model_generation_config: Dict[str, Any],
    tool_call_constraint: Optional[ToolCallConstraint] = None,
) -> Dict[str, Any]:
    """Build the sampling-parameter dict for this chat completion request.

    For ``temperature``, ``top_p``, ``top_k``, ``min_p`` and
    ``repetition_penalty`` the lookup order is: value set on the request,
    then the entry in ``model_generation_config``, then the matching entry
    of ``_DEFAULT_SAMPLING_PARAMS``.

    Args:
        stop: Stop strings, copied into the result under ``"stop"``.
        model_generation_config: Fallback values consulted when the request
            leaves one of the parameters above as ``None``.
        tool_call_constraint: Optional ``(constraint_type, constraint_value)``
            pair. It is applied only when the request carries no regex,
            EBNF, structural-tag or JSON-schema constraint of its own;
            otherwise a warning is logged and the pair is ignored.

    Returns:
        The sampling-parameter dictionary.
    """

    def resolve(name: str):
        # A value set explicitly on the request always wins.
        requested = getattr(self, name)
        if requested is not None:
            return requested
        return model_generation_config.get(
            name, self._DEFAULT_SAMPLING_PARAMS[name]
        )

    sampling_params = {
        "temperature": resolve("temperature"),
        "max_new_tokens": self.max_tokens or self.max_completion_tokens,
        "min_new_tokens": self.min_tokens,
        "stop": stop,
        "stop_token_ids": self.stop_token_ids,
        "stop_regex": self.stop_regex,
        "top_p": resolve("top_p"),
        "top_k": resolve("top_k"),
        "min_p": resolve("min_p"),
        "presence_penalty": self.presence_penalty,
        "frequency_penalty": self.frequency_penalty,
        "repetition_penalty": resolve("repetition_penalty"),
        "regex": self.regex,
        "ebnf": self.ebnf,
        "n": self.n,
        "no_stop_trim": self.no_stop_trim,
        "ignore_eos": self.ignore_eos,
        "skip_special_tokens": self.skip_special_tokens,
        "logit_bias": self.logit_bias,
        "custom_params": self.custom_params,
    }

    # Translate the response_format, if any, into a decoding constraint.
    response_format = self.response_format
    format_type = response_format.type if response_format else None
    if format_type == "json_schema":
        sampling_params["json_schema"] = convert_json_schema_to_str(
            response_format.json_schema.schema_
        )
    elif format_type == "json_object":
        sampling_params["json_schema"] = '{"type": "object"}'
    elif format_type == "structural_tag":
        sampling_params["structural_tag"] = convert_json_schema_to_str(
            response_format.model_dump(by_alias=True)
        )

    if tool_call_constraint:
        # Check if there are already existing output constraints
        has_existing_constraints = any(
            sampling_params.get(key)
            for key in ("regex", "ebnf", "structural_tag", "json_schema")
        )
        if has_existing_constraints:
            logger.warning("Constrained decoding is not compatible with tool calls.")
        else:
            constraint_type, constraint_value = tool_call_constraint
            if constraint_type == "structural_tag":
                constraint_value = convert_json_schema_to_str(
                    constraint_value.model_dump(by_alias=True)
                )
            elif constraint_type == "json_schema":
                constraint_value = convert_json_schema_to_str(
                    constraint_value  # type: ignore
                )
            sampling_params[constraint_type] = constraint_value

    return sampling_params
551
683
 
552
684
 
553
685
  class ChatMessage(BaseModel):
@@ -644,6 +776,8 @@ class EmbeddingRequest(BaseModel):
644
776
 
645
777
  # The request id.
646
778
  rid: Optional[Union[List[str], str]] = None
779
+ # Priority for the request
780
+ priority: Optional[int] = None
647
781
 
648
782
 
649
783
  class EmbeddingObject(BaseModel):
@@ -652,6 +786,37 @@ class EmbeddingObject(BaseModel):
652
786
  object: str = "embedding"
653
787
 
654
788
 
789
# Accepted shapes for ClassifyRequest.input: one string, a list of strings,
# or a list of ints.
ClassifyInput = Union[str, List[str], List[int]]


class ClassifyRequest(BaseModel):
    # OpenAI-compatible classification request.

    # Model name; DEFAULT_MODEL_NAME is used when the caller omits it.
    model: str = DEFAULT_MODEL_NAME
    # Content to classify (required).
    input: ClassifyInput
    user: Union[str, None] = None

    # The request id: a single string or a list of strings.
    rid: Union[List[str], str, None] = None
    # Priority for the request.
    priority: Union[int, None] = None
802
+
803
+
804
class ClassifyData(BaseModel):
    # One element of ClassifyResponse.data.

    index: int
    label: str
    # NOTE(review): presumably one probability per class, so that
    # len(probs) == num_classes -- confirm against the producer.
    probs: List[float]
    num_classes: int
809
+
810
+
811
class ClassifyResponse(BaseModel):
    # Response body for a classification request.

    id: str
    # Fixed to "list" unless the caller overrides it.
    object: str = "list"
    # NOTE(review): looks like a creation timestamp -- units not visible here.
    created: int
    model: str
    data: List[ClassifyData]
    usage: UsageInfo
818
+
819
+
655
820
  class EmbeddingResponse(BaseModel):
656
821
  data: List[EmbeddingObject]
657
822
  model: str
@@ -695,12 +860,51 @@ class RerankResponse(BaseModel):
695
860
  meta_info: Optional[dict] = None
696
861
 
697
862
 
863
class TokenizeRequest(BaseModel):
    """Request schema for the /tokenize endpoint."""

    # Model name; DEFAULT_MODEL_NAME is used when the caller omits it.
    model: str = DEFAULT_MODEL_NAME
    # A single prompt string or a list of prompt strings (required).
    prompt: Union[str, List[str]]
    add_special_tokens: bool = Field(
        default=True,
        description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
    )
872
+
873
+
874
class TokenizeResponse(BaseModel):
    """Response schema for the /tokenize endpoint."""

    # Either one flat list of ints or a list of such lists.
    # NOTE(review): presumably the nested form is used when the request
    # carried a list of prompts -- confirm against the handler.
    tokens: Union[List[int], List[List[int]]]
    # A single int or a list of ints, mirroring the two shapes of `tokens`.
    count: Union[int, List[int]]
    max_model_len: int
880
+
881
+
882
class DetokenizeRequest(BaseModel):
    """Request schema for the /detokenize endpoint."""

    # Model name; DEFAULT_MODEL_NAME is used when the caller omits it.
    model: str = DEFAULT_MODEL_NAME
    # Either one flat list of ints or a list of such lists (required).
    tokens: Union[List[int], List[List[int]]]
    skip_special_tokens: bool = Field(
        default=True,
        description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
    )
891
+
892
+
893
class DetokenizeResponse(BaseModel):
    """Response schema for the /detokenize endpoint."""

    # A single decoded string or a list of decoded strings.
    text: Union[str, List[str]]
897
+
898
+
698
899
# Union of the request models used as the `request` annotation by the
# OpenAI-compatible serving handlers.
OpenAIServingRequest = Union[
    ChatCompletionRequest,
    CompletionRequest,
    EmbeddingRequest,
    ClassifyRequest,
    ScoringRequest,
    V1RerankReqInput,
    TokenizeRequest,
    DetokenizeRequest,
]
705
909
 
706
910
 
@@ -772,6 +976,13 @@ class ResponsesRequest(BaseModel):
772
976
  description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
773
977
  )
774
978
  priority: int = Field(default=0, description="Request priority")
979
+ extra_key: Optional[str] = Field(
980
+ default=None,
981
+ description="Extra key for classifying the request (e.g. cache_salt)",
982
+ )
983
+ cache_salt: Optional[str] = Field(
984
+ default=None, description="Cache salt for request caching"
985
+ )
775
986
 
776
987
  # SGLang-specific sampling parameters
777
988
  frequency_penalty: float = 0.0
@@ -860,6 +1071,26 @@ class ResponsesResponse(BaseModel):
860
1071
  tool_choice: str = "auto"
861
1072
  tools: List[ResponseTool] = Field(default_factory=list)
862
1073
 
1074
+ # OpenAI compatibility fields. Not all are used at the moment.
1075
+ # Recommend checking https://platform.openai.com/docs/api-reference/responses
1076
+ error: Optional[dict] = None
1077
+ incomplete_details: Optional[dict] = None # TODO(v) support this input
1078
+ instructions: Optional[str] = None
1079
+ max_output_tokens: Optional[int] = None
1080
+ previous_response_id: Optional[str] = None
1081
+ reasoning: Optional[dict] = (
1082
+ # Unused. No model supports this. For GPT-oss, system prompt sets
1083
+ # the field, not server args.
1084
+ None # {"effort": Optional[str], "summary": Optional[str]}
1085
+ )
1086
+ store: Optional[bool] = None
1087
+ temperature: Optional[float] = None
1088
+ text: Optional[dict] = None # e.g. {"format": {"type": "text"}}
1089
+ top_p: Optional[float] = None
1090
+ truncation: Optional[str] = None
1091
+ user: Optional[str] = None
1092
+ metadata: Optional[Dict[str, Any]] = None
1093
+
863
1094
  @classmethod
864
1095
  def from_request(
865
1096
  cls,
@@ -874,6 +1105,41 @@ class ResponsesResponse(BaseModel):
874
1105
  usage: Optional[UsageInfo],
875
1106
  ) -> "ResponsesResponse":
876
1107
  """Create a response from a request."""
1108
+
1109
+ # Determine if the output is plain text only to set text.format
1110
def _is_text_only(
    items: List[
        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
    ],
) -> bool:
    """Report whether ``items`` is non-empty and consists only of text output.

    An empty or missing list is not text-only. Any reasoning item, function
    tool call, or item of an unrecognised type makes the result ``False``.
    A ``ResponseOutputMessage`` qualifies when its content is empty or when
    every content part is a ``ResponseOutputText``.
    """
    if not items:
        return False

    non_text_types = (ResponseReasoningItem, ResponseFunctionToolCall)
    for item in items:
        # tool call -> not pure text.
        if isinstance(item, non_text_types):
            return False
        try:
            if isinstance(item, ResponseOutputText):
                continue
            if not isinstance(item, ResponseOutputMessage):
                # Unknown type, not considered text-only
                return False
            parts = item.content
            if not parts:
                continue
            if not all(isinstance(part, ResponseOutputText) for part in parts):
                return False
        except AttributeError:
            return False
    return True
1140
+
1141
+ text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
1142
+
877
1143
  return cls(
878
1144
  id=request.request_id,
879
1145
  created_at=created_time,
@@ -884,6 +1150,23 @@ class ResponsesResponse(BaseModel):
884
1150
  parallel_tool_calls=request.parallel_tool_calls or True,
885
1151
  tool_choice=request.tool_choice,
886
1152
  tools=request.tools,
1153
+ # fields for parity with v1/responses
1154
+ error=None,
1155
+ incomplete_details=None,
1156
+ instructions=request.instructions,
1157
+ max_output_tokens=request.max_output_tokens,
1158
+ previous_response_id=request.previous_response_id, # TODO(v): ensure this is propagated if retrieved from store
1159
+ reasoning={
1160
+ "effort": request.reasoning.effort if request.reasoning else None,
1161
+ "summary": None, # unused
1162
+ },
1163
+ store=request.store,
1164
+ temperature=request.temperature,
1165
+ text=text_format, # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
1166
+ top_p=request.top_p,
1167
+ truncation=request.truncation,
1168
+ user=request.user,
1169
+ metadata=request.metadata or {},
887
1170
  )
888
1171
 
889
1172
 
@@ -919,7 +1202,17 @@ class MessageProcessingResult:
919
1202
  video_data: Optional[Any]
920
1203
  modalities: List[str]
921
1204
  stop: List[str]
922
- tool_call_constraint: Optional[Any] = None
1205
+ tool_call_constraint: Optional[ToolCallConstraint] = None
1206
+
1207
+
1208
class ToolCallProcessingResult(NamedTuple):
    """Result of processing tool calls in a response."""

    # List of ToolCall objects, or None if parsing failed.
    tool_calls: Optional[List[Any]]
    # Text remaining after parsing tool calls.
    remaining_text: str
    # Updated finish reason dictionary.
    finish_reason: Dict[str, Any]
923
1216
 
924
1217
 
925
1218
  class ResponseReasoningTextContent(BaseModel):
@@ -4,8 +4,9 @@ import json
4
4
  import logging
5
5
  import uuid
6
6
  from abc import ABC, abstractmethod
7
- from typing import TYPE_CHECKING, Any, Optional, Union
7
+ from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
8
8
 
9
+ import orjson
9
10
  from fastapi import HTTPException, Request
10
11
  from fastapi.responses import ORJSONResponse, StreamingResponse
11
12
 
@@ -27,13 +28,59 @@ class OpenAIServingBase(ABC):
27
28
  self.tokenizer_manager = tokenizer_manager
28
29
  self.allowed_custom_labels = (
29
30
  set(
30
- self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
31
+ self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
31
32
  )
32
33
  if isinstance(self.tokenizer_manager.server_args, ServerArgs)
33
- and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
34
+ and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
34
35
  else None
35
36
  )
36
37
 
38
def _parse_model_parameter(self, model: str) -> Tuple[str, Optional[str]]:
    """Split a 'base-model:adapter-name' model string into its two parts.

    Returns:
        ``(base_model, adapter_name)`` with surrounding whitespace stripped
        from both parts; ``adapter_name`` is ``None`` when only whitespace
        follows the colon. A string without a colon comes back unchanged as
        ``(model, None)``.
    """
    # partition() cuts at the first colon only, so any further colons stay
    # inside the adapter name.
    base_model, colon, adapter_name = model.partition(":")
    if not colon:
        return model, None
    return base_model.strip(), adapter_name.strip() or None
52
+
53
def _resolve_lora_path(
    self,
    request_model: str,
    explicit_lora_path: Optional[Union[str, List[Optional[str]]]],
) -> Optional[Union[str, List[Optional[str]]]]:
    """Pick the LoRA adapter for a request.

    An adapter named in the model string (as parsed by
    ``_parse_model_parameter``) takes precedence. Otherwise
    ``explicit_lora_path`` is returned as given, whether it is a single
    value, a list, or ``None``.
    """
    adapter_from_model = self._parse_model_parameter(request_model)[1]
    if adapter_from_model is None:
        # Nothing encoded in the model name: fall back to the explicit path.
        return explicit_lora_path
    return adapter_from_model
70
+
71
def _validate_lora_enabled(self, adapter_name: str) -> None:
    """Reject an adapter request when the server runs without LoRA support.

    Only the ``enable_lora`` server argument is checked here; per the
    original note, adapter existence is validated later by
    TokenizerManager.lora_registry (not visible in this block).

    Raises:
        ValueError: if ``server_args.enable_lora`` is falsy. The message
            names the adapter and explains how to enable LoRA.
    """
    if self.tokenizer_manager.server_args.enable_lora:
        return

    message = (
        f"LoRA adapter '{adapter_name}' was requested, but LoRA is not enabled. "
        "Please launch the server with --enable-lora flag and preload adapters "
        "using --lora-paths or /load_lora_adapter endpoint."
    )
    raise ValueError(message)
83
+
37
84
  async def handle_request(
38
85
  self, request: OpenAIServingRequest, raw_request: Request
39
86
  ) -> Union[Any, StreamingResponse, ErrorResponse]:
@@ -62,6 +109,12 @@ class OpenAIServingBase(ABC):
62
109
  return self.create_error_response(
63
110
  message=e.detail, err_type=str(e.status_code), status_code=e.status_code
64
111
  )
112
+ except ValueError as e:
113
+ return self.create_error_response(
114
+ message=str(e),
115
+ err_type="BadRequest",
116
+ status_code=400,
117
+ )
65
118
  except Exception as e:
66
119
  logger.exception(f"Error in request: {e}")
67
120
  return self.create_error_response(
@@ -86,6 +139,19 @@ class OpenAIServingBase(ABC):
86
139
 
87
140
  return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
88
141
 
142
def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
    """Join the request's ``cache_salt`` and ``extra_key`` into one string.

    Attributes that are missing or falsy are skipped; the rest are
    concatenated with no separator, ``cache_salt`` first.

    Returns:
        The concatenated key, or ``None`` when neither attribute is set.

    Raises:
        TypeError: if a set attribute is not a ``str`` (for example a list).
    """
    pieces = []
    for field_name in ("cache_salt", "extra_key"):
        field_value = getattr(request, field_name, None)
        if not field_value:
            continue
        if not isinstance(field_value, str):
            raise TypeError(
                f"Value of {field_name} must be a string, but got {type(field_value).__name__}"
            )
        pieces.append(field_value)

    if not pieces:
        return None
    return "".join(pieces)
154
+
89
155
  @abstractmethod
90
156
  def _convert_to_internal_request(
91
157
  self,
@@ -165,20 +231,20 @@ class OpenAIServingBase(ABC):
165
231
  )
166
232
  return json.dumps({"error": error.model_dump()})
167
233
 
168
- def extract_customer_labels(self, raw_request):
234
+ def extract_custom_labels(self, raw_request):
169
235
  if (
170
236
  not self.allowed_custom_labels
171
237
  or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
172
238
  ):
173
239
  return None
174
240
 
175
- customer_labels = None
241
+ custom_labels = None
176
242
  header = (
177
243
  self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
178
244
  )
179
245
  try:
180
246
  raw_labels = (
181
- json.loads(raw_request.headers.get(header))
247
+ orjson.loads(raw_request.headers.get(header))
182
248
  if raw_request and raw_request.headers.get(header)
183
249
  else None
184
250
  )
@@ -187,9 +253,9 @@ class OpenAIServingBase(ABC):
187
253
  raw_labels = None
188
254
 
189
255
  if isinstance(raw_labels, dict):
190
- customer_labels = {
256
+ custom_labels = {
191
257
  label: value
192
258
  for label, value in raw_labels.items()
193
259
  if label in self.allowed_custom_labels
194
260
  }
195
- return customer_labels
261
+ return custom_labels